opinder2906 committed on
Commit
efa5966
·
verified ·
1 Parent(s): 27b481e

Upload try (1).py

Browse files
Files changed (1) hide show
  1. try (1).py +343 -0
try (1).py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Try.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1OBe8cQMTtii9Xh1Ak5ayDewo_4UvTSD-
8
+ """
9
+
10
+ # Step 1: Imports & Data Load
11
+ import pandas as pd
12
+ import numpy as np
13
+ import seaborn as sns
14
+ import matplotlib.pyplot as plt
15
+
16
+ from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
17
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
18
+ from sklearn.impute import SimpleImputer
19
+ from sklearn.decomposition import PCA
20
+ from sklearn.manifold import TSNE
21
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # Added RandomForestClassifier here
22
+
23
# Load the dataset, show a quick overview, then impute missing values.
print("\n1. DATA LOADING & INITIAL INSPECTION …………………………………………")

url = "https://drive.google.com/uc?export=download&id=1QBTnXxORRbJzE5Z2aqKHsVqgB7mqowiN"
df = pd.read_csv(url)
print(df.head(3))
print("Shape:", df.shape)

# Report the number of missing values per column before imputing.
print(df.isna().sum())

# Object columns: fill gaps with the most frequent value.
for col in df.select_dtypes(include='object').columns:
    modes = df[col].mode()
    # mode() returns an empty Series when every value in the column is NaN;
    # indexing it with [0] would raise, so such a column is left untouched.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Numeric columns: fill gaps with the column median.
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].median())
37
+
38
# Outlier removal (IQR method, numeric columns): drop every row that holds a
# value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] in at least one numeric column.
num_cols = df.select_dtypes(include=np.number).columns
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
mask = ~((df[num_cols] < lower_bound) | (df[num_cols] > upper_bound)).any(axis=1)
# Take an explicit copy: the script assigns columns on df after this point,
# and writing into a frame derived by boolean indexing triggers pandas'
# SettingWithCopyWarning.
df = df.loc[mask].copy()
45
+
46
# Label-encode every object-typed column in place and keep the fitted encoder
# of each column in le_dict so the integer codes can be decoded later.
from sklearn.preprocessing import LabelEncoder

cat_cols = df.select_dtypes(include='object').columns
le_dict = {}
for col in cat_cols:
    le = LabelEncoder().fit(df[col])
    le_dict[col] = le
    df[col] = le.transform(df[col])

print(df.head())
56
+
57
# Univariate analysis: one histogram (with KDE overlay) per int64/float64 column.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    plt.figure(figsize=(6,3))
    sns.histplot(df[col].dropna(), kde=True)
    plt.title(f'Distribution of {col}')
    plt.show()

# Bivariate analysis: distribution of Electric Range for each Make.
if 'Make' in df.columns and 'Electric Range' in df.columns:
    make_plot_df = df[['Make', 'Electric Range']].copy()
    # 'Make' has been label-encoded by this point, so plotting it directly
    # would label the x axis with integer codes. Decode it on a plotting copy
    # (df itself stays encoded) when a fitted encoder is available.
    make_encoder = le_dict.get('Make')
    if make_encoder is not None:
        make_plot_df['Make'] = make_encoder.inverse_transform(make_plot_df['Make'])
    plt.figure(figsize=(12,6))
    sns.boxplot(x='Make', y='Electric Range', data=make_plot_df)
    plt.xticks(rotation=90)
    plt.title('Electric Range by Make')
    plt.show()

# Pairplot of the numeric columns, on at most 1000 sampled rows to keep it fast.
sample_df = df.sample(min(1000, len(df)), random_state=42)
if len(num_cols) > 1:
    sns.pairplot(sample_df[num_cols])
    plt.suptitle('Pairplot of Numeric Features', y=1.02)
    plt.show()
78
+
79
+ import matplotlib.pyplot as plt
80
+ import seaborn as sns
81
+
82
# Pairwise correlations between the int64/float64 columns of df, drawn as an
# annotated heatmap.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
corr = df.loc[:, num_cols].corr()

plt.figure(figsize=(10, 7))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
)
plt.title('Correlation Heatmap for Numeric Columns')
plt.show()
90
+
91
+ from sklearn.preprocessing import StandardScaler
92
+ from sklearn.ensemble import RandomForestClassifier
93
+
94
# Derived feature: vehicle age relative to 2025 (only when 'Model Year' exists).
if 'Model Year' in df.columns:
    df['Vehicle_Age'] = 2025 - df['Model Year']

# Standardise every column except the target, 'Electric Range'.
feature_frame = df.drop('Electric Range', axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_frame)

# Binary target: 1 when a row's Electric Range is strictly above the median.
range_median = df['Electric Range'].median()
y = (df['Electric Range'] > range_median).astype(int)

# Rank features by random-forest importance and keep the ten strongest.
rf_fs = RandomForestClassifier(n_estimators=100, random_state=42)
rf_fs.fit(X_scaled, y)
importances = rf_fs.feature_importances_
top_idx = np.argsort(importances)[::-1][:10]
top_features = feature_frame.columns[top_idx]
print("Top features:", top_features)
110
+
111
# Feature extraction (PCA): project the selected top features onto two
# principal components and colour the points by the binary target y.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

pca = PCA(n_components=2, random_state=42)
# Fit on the standardised columns rather than the raw frame: PCA centres the
# data but does not rescale it, so raw columns with large magnitudes would
# dominate the components. top_idx indexes the columns of X_scaled in the
# same order as the feature frame that top_features was taken from.
X_pca = pca.fit_transform(X_scaled[:, top_idx])

plt.figure(figsize=(7,5))
plt.scatter(X_pca[:,0], X_pca[:,1], c=y, cmap='viridis', alpha=0.5)
plt.title("PCA of Top Features")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
123
+
124
+ from sklearn.model_selection import train_test_split
125
+
126
# Subsample at most 300 rows per class of y, then make a stratified 70/30 split.
# Groups are visited in sorted label order, each sampled with a fixed seed.
class_samples = [
    group.sample(min(len(group), 300), random_state=42)
    for _, group in df.groupby(y)
]
sampled = pd.concat(class_samples)
# Reuse the labels that defined the groups. Recomputing them against the
# subsample's own median can relabel rows whenever the classes end up with
# different sizes, because the subsample median then moves away from the
# median that produced y.
y_bal = y.loc[sampled.index].reset_index(drop=True)
df_balanced = sampled.reset_index(drop=True)
X = df_balanced[top_features]
X_train, X_test, y_train, y_test = train_test_split(X, y_bal, test_size=0.3, random_state=42, stratify=y_bal)
131
+
132
+ from sklearn.decomposition import PCA
133
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
134
+ import matplotlib.pyplot as plt
135
+ import seaborn as sns
136
+
137
# Apply PCA: reduce the training features to two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

# Plot the projection, coloured by the binary training label.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_train, palette='Set1', s=60)
plt.title("PCA - First 2 Principal Components")
plt.xlabel("PC1")
plt.ylabel("PC2")
# The hue is y_train, the binary above-median Electric Range flag, so the
# legend is titled after that flag rather than after a vehicle-type column.
plt.legend(title="Electric Range above median (1 = yes)")
plt.grid(True)
plt.tight_layout()
plt.show()
151
+
152
# Apply LDA. With two classes at most n_classes - 1 = 1 discriminant exists,
# hence n_components=1.
lda = LDA(n_components=1)
X_lda = lda.fit_transform(X_train, y_train)

# A single discriminant is one-dimensional, so class separation is shown as
# per-class histograms of LD1 with a KDE overlay.
plt.figure(figsize=(8, 6))
lda_ax = sns.histplot(x=X_lda[:, 0], hue=y_train, kde=True, palette='Set2')
plt.title("LDA - First Linear Discriminant")
plt.xlabel("LD1")
# histplot draws counts by default, so the axis is labelled accordingly.
plt.ylabel("Count")
# Retitle the legend seaborn already built. Calling plt.legend() here would
# replace it with an empty legend, because the histogram artists carry no
# labels for matplotlib to pick up.
hue_legend = lda_ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title("Electric Range above median (1 = yes)")
plt.grid(True)
plt.tight_layout()
plt.show()
171
+
172
+ from sklearn.linear_model import LogisticRegression
173
+ from sklearn.svm import SVC
174
+ from sklearn.ensemble import GradientBoostingClassifier
175
+ from sklearn.naive_bayes import GaussianNB
176
+ from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
177
+ import matplotlib.pyplot as plt
178
+
179
# Baseline classifiers, keyed by display name.
# Logistic Regression and the RBF SVM are sensitive to feature scale, so each
# is wrapped in a pipeline that standardises its inputs first. The scaler is
# fitted on the training data only, inside model.fit(). Gradient Boosting and
# Gaussian Naive Bayes are left on the raw features.
from sklearn.pipeline import make_pipeline

models = {
    'Logistic Regression': make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=1000, penalty='l2', random_state=42),
    ),
    'SVM': make_pipeline(
        StandardScaler(),
        SVC(kernel='rbf', C=1.0, probability=True, random_state=42),
    ),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    'Naive Bayes': GaussianNB()
}

# Fit each model, then report its metrics on the held-out test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n===== {name} =====")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    # ROC-AUC and ROC curve need class probabilities.
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, proba)
        print("ROC-AUC:", auc)
        RocCurveDisplay.from_estimator(model, X_test, y_test)
        plt.title(f"{name} ROC Curve")
        plt.show()
    else:
        print("ROC-AUC not available for this model.")
203
+
204
# Gradient Boosting with Binning: discretise each feature into 5 quantile
# bins (ordinal codes), then fit a gradient-boosting classifier on the codes.
from sklearn.preprocessing import KBinsDiscretizer

binning = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
# Bin edges are learned from the training split only and reused for the test split.
X_train_binned = binning.fit_transform(X_train)
X_test_binned = binning.transform(X_test)
# Fixed seed, matching every other estimator in this script, so reruns are reproducible.
gbc_bin = GradientBoostingClassifier(random_state=42)
gbc_bin.fit(X_train_binned, y_train)
y_pred_gbc_bin = gbc_bin.predict(X_test_binned)
# NOTE(review): the printed label says "Optimal Binning", but the discretiser
# above uses plain quantile binning — consider renaming the label.
print("Gradient Boosting (Optimal Binning) Results:\n", classification_report(y_test, y_pred_gbc_bin))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_gbc_bin))
215
+
216
+ from sklearn.metrics import roc_auc_score, RocCurveDisplay
217
+ import matplotlib.pyplot as plt
218
+
219
# Fitted models whose ROC curves are drawn, keyed by a short display name.
models_to_plot = {
    'NB': models['Naive Bayes'],
    'LR': models['Logistic Regression'],
    'SVM': models['SVM'],
    'GBC': models['Gradient Boosting'],
    'GBC_bin': gbc_bin
}

# gbc_bin was fitted on the discretised features, so it must be scored on the
# discretised test matrix. Every other model uses the raw test features.
eval_features = {'GBC_bin': X_test_binned}

for name, model in models_to_plot.items():
    X_eval = eval_features.get(name, X_test)
    has_proba = hasattr(model, "predict_proba")
    # A curve needs either class probabilities or a decision function.
    if not (has_proba or hasattr(model, "decision_function")):
        continue
    RocCurveDisplay.from_estimator(model, X_eval, y_test)
    plt.title(name + " ROC Curve")
    plt.show()
    # The AUC is printed only for models that expose probabilities.
    if has_proba:
        print(f"{name} ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_eval)[:,1]))
237
+
238
+ from sklearn.model_selection import RandomizedSearchCV
239
+ from sklearn.ensemble import GradientBoostingClassifier
240
+ from sklearn.linear_model import LogisticRegression
241
+ from sklearn.svm import SVC
242
+ from sklearn.naive_bayes import GaussianNB
243
+ from scipy.stats import uniform, randint
244
+
245
# Tune on at most 2000 training rows; the labels are picked by the same index.
X_sample = X_train.sample(n=min(2000, len(X_train)), random_state=42)
y_sample = y_train.loc[X_sample.index]

# Search spaces, one per model. scipy's uniform(loc, scale) draws from
# [loc, loc + scale]; randint(low, high) draws integers from [low, high).
param_dist_lr = dict(
    C=uniform(0.01, 10),
    penalty=['l2'],
    solver=['lbfgs'],
)
param_dist_svm = dict(C=uniform(0.1, 10))
param_dist_gbc = dict(
    n_estimators=randint(50, 200),
    learning_rate=uniform(0.01, 0.2),
    max_depth=randint(3, 7),
)
# Gaussian Naive Bayes is searched with an empty space (defaults only).
param_dist_nb = dict()

# Number of random parameter draws per search that uses this setting.
n_iter_search = 10
266
+
267
# Randomized hyper-parameter search for each model on the tuning subset.
# fit() returns the search object itself, so construction and fitting are chained.

# Logistic Regression: n_iter_search draws, 3-fold CV.
rs_lr = RandomizedSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_distributions=param_dist_lr,
    n_iter=n_iter_search,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
).fit(X_sample, y_sample)
print("Best Logistic Regression params:", rs_lr.best_params_)

# SVM: 5 draws and 2-fold CV (not n_iter_search), with the solver capped at
# 5000 iterations.
rs_svm = RandomizedSearchCV(
    SVC(random_state=42, max_iter=5000),
    param_distributions=param_dist_svm,
    n_iter=5,
    cv=2,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
).fit(X_sample, y_sample)
print("Best SVM params:", rs_svm.best_params_)

# Gradient Boosting: n_iter_search draws, 3-fold CV.
rs_gbc = RandomizedSearchCV(
    GradientBoostingClassifier(random_state = 42),
    param_distributions=param_dist_gbc,
    n_iter=n_iter_search,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
).fit(X_sample, y_sample)
print("Best Gradient Boosting params:", rs_gbc.best_params_)

# Naive Bayes: empty search space, so a single candidate is evaluated; kept
# so all four models share the same reporting path.
rs_nb = RandomizedSearchCV(
    GaussianNB(),
    param_distributions=param_dist_nb,
    n_iter=1,
    cv=3,
    scoring='accuracy',
    random_state=42,
).fit(X_sample, y_sample)
print("Best Naive Bayes params:", rs_nb.best_params_)

# Score each search's refitted best estimator on the full test split.
print("\n--- Test Set Evaluation ---")
lr_test_acc = rs_lr.best_estimator_.score(X_test, y_test)
print("LR Test Accuracy:", lr_test_acc)
svm_test_acc = rs_svm.best_estimator_.score(X_test, y_test)
print("SVM Test Accuracy:", svm_test_acc)
gbc_test_acc = rs_gbc.best_estimator_.score(X_test, y_test)
print("GBC Test Accuracy:", gbc_test_acc)
nb_test_acc = rs_nb.best_estimator_.score(X_test, y_test)
print("NB Test Accuracy:", nb_test_acc)
315
+
316
+ from sklearn.decomposition import PCA
317
+ import matplotlib.pyplot as plt
318
+
319
# Two-component PCA of the balanced feature matrix X, coloured by y_bal.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

pc1, pc2 = X_pca[:,0], X_pca[:,1]
plt.figure(figsize=(8,6))
plt.scatter(pc1, pc2, c=y_bal, cmap='coolwarm', alpha=0.6)
plt.title("PCA Projection of Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label='Class')
plt.show()
329
+
330
from sklearn.manifold import TSNE

# t-SNE on the top features of the balanced sample X.
# t-SNE requires perplexity < n_samples. The default of 30 is kept whenever
# the sample is large enough and is only capped for very small samples, where
# the default would raise a ValueError.
tsne_perplexity = min(30.0, len(X) - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
X_tsne = tsne.fit_transform(X)

plt.figure(figsize=(8,5))
# Colour by y_bal, the labels that belong to the subsampled matrix X.
plt.scatter(X_tsne[:,0], X_tsne[:,1], c=y_bal, cmap='plasma', alpha=0.7)
plt.title("t-SNE of Features")
plt.xlabel("t-SNE1")
plt.ylabel("t-SNE2")
plt.show()
343
+