sudhirpgcmma02 committed on
Commit
8e0e7a8
·
verified ·
1 Parent(s): 270450c

Upload prep.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. prep.py +390 -0
prep.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # for data manipulation
2
+ import pandas as pd
3
+ import sklearn
4
+ ## EDA
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ import math
8
+ from xgboost import XGBClassifier
9
+ # for creating a folder
10
+ import os
11
+ # for data preprocessing and pipeline creation
12
+ from sklearn.model_selection import train_test_split
13
+ # for converting text data in to numerical representation
14
+ from sklearn.preprocessing import LabelEncoder
15
+ from sklearn.preprocessing import StandardScaler
16
+ from sklearn.decomposition import PCA
17
+ # for hugging face space authentication to upload files
18
+ from huggingface_hub import login, HfApi, hf_hub_download
# One shared seaborn theme for every EDA figure produced below.
sns.set(style="whitegrid", font_scale=1.1)

# Hugging Face Hub client; the access token is read from the HF_TOKEN
# environment variable (None when the variable is not set).
api = HfApi(token=os.environ.get("HF_TOKEN"))

# Raw engine sensor data stored in the Hugging Face dataset repository.
DATASET_PATH = "hf://datasets/sudhirpgcmma02/Engine_PM/data/engine_data.csv"
df = pd.read_csv(DATASET_PATH)
26
+
################################# EDA ###########################################

# Human-readable names for the two values of the "Engine Condition" target,
# keyed by the text matplotlib/seaborn use for the legend entries.
_CONDITION_LABELS = {
    "0": "Normal (0)",
    "1": "Preventive Maintenance required (1)",
}


def _show(frame, style=None):
    """Display *frame* richly under IPython/Jupyter, otherwise print it.

    ``display`` only exists when an IPython shell is running, so calling it
    directly from a plain ``python prep.py`` run raises ``NameError``.

    Args:
        frame: Object to show (normally a DataFrame).
        style: Optional callable that turns *frame* into a pandas ``Styler``.
            It is applied only when rich display is available, because a
            printed ``Styler`` is just an opaque object repr.
    """
    import builtins

    # NOTE(review): relies on IPython publishing ``display`` through the
    # ``builtins`` module - confirm for the notebook environment in use.
    rich_display = getattr(builtins, "display", None)
    if rich_display is None:
        print(frame)
        return
    rich_display(style(frame) if style is not None else frame)


def _label_condition_legend(ax):
    """Replace the 0/1 legend entries of *ax* with descriptive names.

    The handles are passed explicitly so that every label stays attached to
    its own artist; ``legend(labels=[...])`` on its own pairs labels with
    artists purely by order. Axes without labelled artists are left as-is.
    """
    handles, labels = ax.get_legend_handles_labels()
    if not handles:
        return
    ax.legend(
        handles=handles,
        labels=[_CONDITION_LABELS.get(str(text), str(text)) for text in labels],
        title='Engine condition',
    )


def _annotate_bars(ax, fmt=None):
    """Write each bar's value in the centre of the bar."""
    for container in ax.containers:
        if fmt is None:
            ax.bar_label(container, label_type='center')
        else:
            ax.bar_label(container, label_type='center', fmt=fmt)


def EDA_df(df):
    """Run the exploratory data analysis for the engine sensor dataset.

    Prints summary tables and draws, in order: missing-value bars, per-column
    mean bars, sensor-vs-condition bars (raw and standardised), a line chart,
    box plots, a stacked bar chart, pie charts for low-cardinality columns,
    a correlation heatmap with the top correlated pairs, a histogram grid,
    a pair plot and a 2-component PCA scatter plot.

    Args:
        df: DataFrame that contains the six sensor columns listed in
            ``features`` below plus the ``"Engine Condition"`` target column.
            It is not modified.

    Returns:
        None. The function only prints and plots.
    """
    features = [
        "Engine rpm",
        "Lub oil pressure",
        "Fuel pressure",
        "Coolant pressure",
        "lub oil temp",
        "Coolant temp",
    ]
    target = "Engine Condition"
    chart_no = 0  # running number shown in the chart titles

    # -----------------------------
    # 1. Load & basic information
    # -----------------------------
    print("Shape:", df.shape)
    _show(df.head(3))
    df.info()  # prints by itself and returns None, so it is not passed on
    _show(
        df.describe().T,
        lambda table: table.style
        .format("{:.2f}")
        .background_gradient(cmap='Blues'),
    )
    # Class balance of the target as proportions.
    print(df[target].value_counts(normalize=True))

    print("missing values \n", df.isna().sum())

    # Every entry is an indexed Series so that rows align on the column name
    # even when some columns are non-numeric.
    summary = pd.DataFrame(
        {
            "Type": df.dtypes,
            "Mean": df.mean(numeric_only=True).round(2),
            "Max": df.max(numeric_only=True).round(2),
            "Min": df.min(numeric_only=True).round(2),
            "Missing (%)": (df.isna().mean() * 100).round(2),
            "count": df.count(),
        }
    ).reindex(df.columns)
    print("########### Summary : Table #1 ###############\n", summary)

    # -----------------------------
    # 2. Missing values
    # -----------------------------
    missing = df.isnull().sum().sort_values(ascending=False)
    missing = missing[missing > 0]
    if not missing.empty:
        mv = pd.DataFrame({
            "Missing Count": missing,
            "Missing %": (missing / len(df) * 100).round(2),
        })
        _show(mv)
        top_missing = mv.head(20)  # x and data must cover the same rows
        plt.figure(figsize=(12, 5))
        ax = sns.barplot(x=top_missing.index, y="Missing Count",
                         data=top_missing, color='steelblue')
        _annotate_bars(ax)
        # The tick labels stay the feature names chosen by seaborn.
        plt.xticks(rotation=90)
        plt.title("Features Missing Values")
        plt.show()
    else:
        print(" No missing values in the dataset")

    # -----------------------------
    # 3. Split feature types
    # -----------------------------
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    print(f" ##################### Numeric Features: {len(num_cols)} ####################")

    # -----------------------------
    # 4. Mean of each numeric column per engine condition
    # -----------------------------
    print("\n📦 Bar Charts for Top Categorical Features")
    for col in num_cols[:5]:
        chart_no += 1
        plt.figure(figsize=(8, 4))
        ax = sns.barplot(x=target, y=col, data=df, estimator='mean', palette='viridis')
        _annotate_bars(ax, fmt='%.2f')
        # There is no hue here, so the classes are explained on the axis
        # instead of through a legend.
        ax.set_xlabel("Engine Condition (0 = Normal, 1 = Preventive Maintenance required)")
        plt.title(f"Frequency Distribution: {col} | chart # {chart_no}")
        plt.tight_layout()
        plt.show()

    # -----------------------------
    # 5. Sensor means per engine condition (raw, then standardised)
    # -----------------------------
    print("\n################### Histograms for Numeric Features ##################################")

    def _sensor_mean_chart(frame, ylabel, number, rotate):
        """Grouped bar chart of the mean of every sensor per condition."""
        long_form = frame.melt(
            id_vars=target,
            value_vars=features,
            var_name="Sensor",
            value_name="value",
        )
        plt.figure(figsize=(18, 5))
        axis = sns.barplot(x="Sensor", y="value", hue=target,
                           estimator="mean", errorbar=None, data=long_form)
        _annotate_bars(axis, fmt='%.2f')
        axis.set_ylabel(ylabel)
        plt.title(f"Sensor vs Engine Condition | Chart {number}")
        if rotate:
            plt.xticks(rotation=90)
        _label_condition_legend(axis)
        plt.tight_layout()
        plt.show()

    chart_no += 1
    _sensor_mean_chart(df, "Value (Actual)", chart_no, rotate=False)

    # StandardScaler yields zero-mean / unit-variance values (z-scores),
    # which puts sensors with very different units on one scale.
    df_stk = df.copy()
    df_stk[features] = StandardScaler().fit_transform(df_stk[features])
    chart_no += 1
    _sensor_mean_chart(df_stk, "Value (standardised z-score)", chart_no, rotate=True)

    # -----------------------------
    # 6. Line chart (trend view)
    # -----------------------------
    print("\n📈 Line Chart for Numeric Feature Trends")
    chart_no += 1
    plt.figure(figsize=(12, 6))
    sns.lineplot(
        data=df,
        x='Engine rpm',
        y=target,
        color="steelblue",
        label="Engine Condition",
    )
    sns.scatterplot(
        data=df[df[target] == 1],
        x='Engine rpm',
        y=target,
        color='red',
        marker="X",
        s=80,
        label="Preventive Maintenance ",
    )
    plt.xlabel("Engine rpm")
    plt.ylabel("Engine condition")
    plt.title(f"Engine Condition Trend | chart {chart_no}")
    # Both artists carry their own labels, so they are kept unchanged.
    plt.legend(title='Engine condition')
    plt.tight_layout()
    plt.show()

    # -----------------------------
    # 7. Box plots (outlier view)
    # -----------------------------
    print("\n📦 Boxplots for Numeric Features")
    chart_no += 1
    plt.figure(figsize=(16, 8))
    sns.boxplot(data=df[num_cols[:10]], orient='h', palette='coolwarm')
    plt.title(f"Boxplot Numeric Features | Chart {chart_no}")
    plt.show()

    # -----------------------------
    # 8. Stacked column chart
    # -----------------------------
    print("\n🧱 Stacked Bar Chart (Numeric grouped by Categorical Feature)")
    chart_no += 1
    # The grouping key is left out of the value columns: its per-group mean
    # is just the key itself.
    value_cols = [name for name in num_cols if name != target]
    grouped = df.groupby(target)[value_cols].mean().head(10)
    ax = grouped.T.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='Spectral')
    _annotate_bars(ax, fmt='%.2f')
    plt.title(f"Stacked Mean of {value_cols} | chart {chart_no}")
    plt.ylabel("Mean Value")
    _label_condition_legend(ax)
    plt.show()

    # -----------------------------
    # 9. Pie charts (low-cardinality columns only)
    # -----------------------------
    print("\n🥧 Pie Charts for Features")
    for col in num_cols:
        counts = df[col].value_counts()
        if len(counts) > 10:
            continue  # too many slices; also avoids opening an empty figure
        chart_no += 1
        plt.figure(figsize=(5, 5))
        wedges, _, _ = plt.pie(counts, labels=counts.index,
                               autopct='%1.1f%%', startangle=90)
        # value_counts orders by frequency, so the legend text is derived
        # from the actual value behind each wedge.
        if col == target:
            legend_labels = [_CONDITION_LABELS.get(str(v), str(v)) for v in counts.index]
            legend_title = 'Engine Condition'
        else:
            legend_labels = [str(v) for v in counts.index]
            legend_title = col
        plt.legend(
            wedges,
            legend_labels,
            title=legend_title,
            loc='center left',
            bbox_to_anchor=(1, 0.5),
        )
        plt.title(f"Pie Chart of {col} | chart {chart_no}")
        plt.axis('equal')
        plt.show()

    # -----------------------------
    # 10. Correlation matrix + table
    # -----------------------------
    print("\n🧩 Correlation Analysis")
    corr = df[num_cols].corr()
    chart_no += 1
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, cmap='coolwarm', center=0, annot=True, fmt=".2f")
    plt.title(f"Correlation Heatmap | Chart {chart_no}")
    plt.show()

    # Top correlated pairs; every pair appears twice, as (a, b) and (b, a).
    corr_pairs = corr.unstack().sort_values(ascending=False)
    corr_pairs = corr_pairs[corr_pairs < 1]  # remove self correlation
    top_corr = corr_pairs.head(20).to_frame("Correlation")
    _show(top_corr, lambda table: table.style.background_gradient(cmap='RdYlGn'))

    # -----------------------------
    # 11. Histogram grid
    # -----------------------------
    if num_cols:
        chart_no += 1
        grid_cols = 4
        grid_rows = math.ceil(len(num_cols) / grid_cols)
        fig, axes = plt.subplots(
            grid_rows, grid_cols,
            figsize=(22, grid_rows * 5),
            constrained_layout=True,
            squeeze=False,  # always a 2-D array, even for a single row
        )
        axes = axes.flatten()

        for hist_ax, name in zip(axes, num_cols):
            sns.histplot(df[name], kde=True, bins=30, ax=hist_ax)
            hist_ax.set_title(name, fontsize=12)
            hist_ax.tick_params(axis='both', labelsize=9)

        # Drop the unused cells of the grid.
        for spare_ax in axes[len(num_cols):]:
            fig.delaxes(spare_ax)

        # A figure-level title; plt.title would only label the last axes.
        fig.suptitle(f"Histogram for distribution of features | chart {chart_no}")
        plt.show()

    # -----------------------------
    # 12. Pair plot
    # -----------------------------
    chart_no += 1
    g = sns.pairplot(df[features + [target]], hue=target, diag_kind="kde", corner=True)
    g.fig.suptitle(f"Feature interaction char {chart_no}")
    pair_legend = getattr(g, "_legend", None)
    if pair_legend is not None:
        for text in pair_legend.texts:
            text.set_text(_CONDITION_LABELS.get(text.get_text(), text.get_text()))
    plt.show()

    # -----------------------------
    # 13. Principal component analysis
    # -----------------------------
    chart_no += 1
    x = df[features]
    y = df[target]

    # PCA is scale-sensitive, so the sensors are standardised first.
    x_scaled = StandardScaler().fit_transform(x)
    x_pca = PCA(n_components=2).fit_transform(x_scaled)
    plt.figure(figsize=(5, 5))
    ax = sns.scatterplot(x=x_pca[:, 0], y=x_pca[:, 1], hue=y, alpha=0.6)

    plt.title(f"PCA of Features for Engine Condition | chart {chart_no}")
    _label_condition_legend(ax)
    plt.show()


## EDA univariate / bivariate / multivariate analysis
# Called only after EDA_df has been defined; a call placed above the ``def``
# raises NameError when the file runs as a script.
EDA_df(df)
353
+
# Standardise the feature names for easy handling: trim surrounding
# whitespace, turn spaces into underscores, then turn every remaining
# non-word character into an underscore.
clean_names = df.columns.str.strip()
clean_names = clean_names.str.replace(" ", "_")
clean_names = clean_names.str.replace(r"[^\w]", "_", regex=True)
df.columns = clean_names

# Name of the target variable after the renaming above.
target_col = 'Engine_Condition'

# Separate the target (y) from the features (X).
y = df[target_col]
X = df.drop(columns=[target_col])

# 80/20 train-test split with a fixed seed so the split is reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Write every part of the split to its own CSV file (no index column).
splits = {
    "Xtrain.csv": Xtrain,
    "Xtest.csv": Xtest,
    "ytrain.csv": ytrain,
    "ytest.csv": ytest,
}
for csv_name, part in splits.items():
    part.to_csv(csv_name, index=False)

files = list(splits)

# Upload each CSV to the root of the Hugging Face dataset repository.
for file_path in files:
    repo_file_name = file_path.split("/")[-1]  # just the filename
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=repo_file_name,
        repo_id="sudhirpgcmma02/Engine_PM",
        repo_type="dataset",
    )
print("Dataset after split loaded successfully to Huggingface.....")