Kamalika committed on
Commit
fffd2df
·
1 Parent(s): 684fc39

Updated Jupyter notebook with new changes

Browse files
Files changed (1) hide show
  1. diabetes_prediction_rbl_new_ (1).py +530 -0
diabetes_prediction_rbl_new_ (1).py ADDED
@@ -0,0 +1,530 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """diabetes prediction rbl new .ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1jWjkh5_Y4EdvjYNtxFcXhFR3sc7Hv3oG
8
+
9
+ Importing libraries
10
+ """
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ import matplotlib.pyplot as plt
15
+ import seaborn as sns
16
+ from sklearn.preprocessing import StandardScaler
17
+ from sklearn.model_selection import train_test_split
18
+ from sklearn import svm
19
+ from sklearn.metrics import accuracy_score
20
+ from sklearn.impute import SimpleImputer
21
+ from sklearn.metrics import precision_score, recall_score, f1_score
22
+ from sklearn.metrics import classification_report
23
+ from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve, ConfusionMatrixDisplay
24
+
25
+ """Data collection and Analysis
26
+
27
+ Diabetes prediction dataset (male and female patients)
28
+ """
29
+
30
# Read the diabetes CSV (Colab path) into a pandas DataFrame.
diabetes_dataset = pd.read_csv(
    '/content/diabetes_prediction_dataset_male female.csv'
)

# Notebook-style inspection cells: these bare expressions only display
# something when run interactively (e.g. in Colab/Jupyter).

# First few rows.
diabetes_dataset.head()

# (rows, columns) of the dataset.
diabetes_dataset.shape

# Summary statistics of the numeric columns.
diabetes_dataset.describe()

# Class balance of the target column 'diabetes':
#   0 = not diabetic
#   1 = diabetic
diabetes_dataset['diabetes'].value_counts()
47
+
48
+ """Labelling
49
+ (for column=diabetes)
50
+ 0--->Non-Diabetic
51
+ 1--->Diabetic
52
+
53
+ ------------------------------------------------------------
54
+
55
+ (for column=gender)
56
+ 0--->Male
57
+ 1--->Female
58
+
59
+ ------------------------------------------------------------
60
+
61
+ (for column=smoking_history)
62
+ 0---> no info
63
+ 1---> current
64
+ 2---> ever
65
+ 3---> former
66
+ 4---> never
67
+ 5---> not current
68
+
69
+ """
70
+
71
# Encode gender numerically: 'Male' -> 0, 'Female' -> 1.
# Series.map turns every value that is not a key of the mapping into NaN;
# such gaps are filled later by the SimpleImputer.
gender_mapping = {'Male': 0, 'Female': 1}
diabetes_dataset['gender'] = diabetes_dataset['gender'].map(gender_mapping)

# Grouping the dataset by gender (kept disabled, as in the original notebook)
#diabetes_dataset.groupby('gender').mean()

# Encode smoking history:
# 'no info' -> 0, 'current' -> 1, 'ever' -> 2, 'former' -> 3,
# 'never' -> 4, 'not current' -> 5.
# The raw labels are normalised (trimmed, lower-cased) before the lookup so
# that differently-cased spellings such as 'No Info' hit the lower-case keys
# instead of silently becoming NaN.
smoking_history_mapping = {
    'no info': 0,
    'current': 1,
    'ever': 2,
    'former': 3,
    'never': 4,
    'not current': 5,
}
diabetes_dataset['smoking_history'] = (
    diabetes_dataset['smoking_history']
    .str.strip()
    .str.lower()
    .map(smoking_history_mapping)
)

# Per-group feature means, by smoking history and by outcome
# (display-only expressions in a notebook).
diabetes_dataset.groupby('smoking_history').mean()
diabetes_dataset.groupby('diabetes').mean()

# Number of smoking_history values that could not be mapped (NaN).
diabetes_dataset['smoking_history'].isnull().sum()

# Separate the features (X) from the labels (Y).
X = diabetes_dataset.drop(columns='diabetes')
Y = diabetes_dataset['diabetes']

print(X)

print(Y)

diabetes_dataset['smoking_history'].isnull().sum()
# Rows whose smoking history is 'no info'. The column holds numeric codes
# after the mapping above, so compare with the number 0, not the string '0'.
print(diabetes_dataset[diabetes_dataset['smoking_history'] == 0])
101
+
102
# Standardise every feature column. The scaler is kept in `scalar`
# because later cells reuse it to transform new samples.
scalar = StandardScaler()
standardized_data = scalar.fit_transform(X)

# Show the standardised feature matrix.
print(standardized_data)

# From here on X is the standardised NumPy array; Y stays the label column.
X = standardized_data
Y = diabetes_dataset['diabetes']

#print(Y.isnull().sum())
#diabetes_dataset = diabetes_dataset.dropna(subset=['diabetes'])

# Positional index of the row to remove from X (X is a NumPy array here).
row_to_drop = 0

# Keep an untouched copy so the removal can be undone with
#     X = X_original.copy()
X_original = X.copy()

# Remove that row from X because of a data inconsistency.
# NOTE: only X loses a row in this step; Y keeps its full length.
X = np.delete(X, row_to_drop, axis=0)
133
+
134
+ """Data visualisation
135
+
136
+ """
137
+
138
# Scatter plots of age against BMI and against blood glucose level.
for _column, _colour, _label in (
    ('bmi', 'blue', 'BMI'),
    ('blood_glucose_level', 'red', 'Blood Glucose Level'),
):
    plt.figure(figsize=(10, 5))
    plt.scatter(
        diabetes_dataset['age'],
        diabetes_dataset[_column],
        color=_colour,
        alpha=0.5,
    )
    plt.title(f'Age vs {_label}')
    plt.xlabel('Age')
    plt.ylabel(_label)
    plt.grid(True)
    plt.show()

# Mean, median and standard deviation of age, BMI and blood glucose level.
_summary_columns = ['age', 'bmi', 'blood_glucose_level']
statistics = {
    'Feature': list(_summary_columns),
    'Mean': [diabetes_dataset[c].mean() for c in _summary_columns],
    'Median': [diabetes_dataset[c].median() for c in _summary_columns],
    'Std Dev': [diabetes_dataset[c].std() for c in _summary_columns],
}

# Tabulate and print the summary.
statistics_df = pd.DataFrame(statistics)
print(statistics_df)

# Pairwise scatter/histogram grid of the same three features.
sns.pairplot(diabetes_dataset[_summary_columns])
plt.show()

# Pairwise correlations between the three features.
correlation_matrix = diabetes_dataset[_summary_columns].corr()
print("Correlation Matrix:")
print(correlation_matrix)
178
+
179
+ """Splitting into training data and test data"""
180
+
181
# Row `row_to_drop` was removed from X earlier, so X is one row shorter than Y.
# Remove the label at the same position from Y so that each feature row
# stays paired with its own label. (Appending a row of zeros to X instead
# would equalise the lengths but shift every sample onto the next sample's
# label and add a fabricated observation.)
Y = Y.drop(Y.index[row_to_drop])

# Both must now have the same number of rows.
print(X.shape, Y.shape)

# stratify=Y keeps the proportion of diabetic / non-diabetic labels the same
# in the training and the test split.
# random_state=2 makes the random split reproducible.
# NOTE(review): the scaler was fitted on the full data before this split, so
# test-set statistics influence the training features.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

# Sizes of the full data, the training split and the test split.
print(X.shape, X_train.shape, X_test.shape)
194
+
195
+ """Training the model"""
196
+
197
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, i.e. the decision boundary
# is a hyperplane in feature space. probability=True enables predict_proba,
# which later cells rely on.
# (The earlier `svm.SVC(kernel='linear')` instance was overwritten before it
# was ever fitted, so only this definition is kept.)
classifier = SVC(kernel='linear', probability=True)

# Replace NaN feature values with the mean of the corresponding column.
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training data only, then apply it to both splits.
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Train the classifier on the imputed training data.
classifier.fit(X_train_imputed, Y_train)
216
+
217
+ """Model Evaluation
218
+
219
+ Accuracy Score
220
+ """
221
+
222
# Accuracy of the SVM on the data it was trained on.
X_train_accuracy = classifier.predict(X_train_imputed)
training_data_accuracy = accuracy_score(Y_train, X_train_accuracy)

print('Accuracy score of the training data:', training_data_accuracy)

# Apply the already-fitted imputer to the test features and confirm that
# no NaN values remain.
X_test_imputed = imputer.transform(X_test)
print(np.isnan(X_test_imputed).sum())

# Accuracy of the SVM on the held-out test data.
X_test_accuracy = classifier.predict(X_test_imputed)
test_data_accuracy = accuracy_score(Y_test, X_test_accuracy)
print('Accuracy score of the test data:', test_data_accuracy)
241
+
242
+ """Labelling (for column=diabetes) 0--->Non-Diabetic
243
+ 1--->Diabetic
244
+
245
+ (for column=gender) 0--->Male
246
+ 1--->Female
247
+
248
+ (for column=smoking_history) 0---> no info
249
+ 1---> current 2---> ever 3---> former 4---> never 5---> not current
250
+ """
251
+
252
diabetes_dataset.head()

"""Making predictive system"""

# One hand-written sample used to try out the trained model.
# Presumably in the dataset's feature order (gender, age, hypertension,
# heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level)
# -- verify against the CSV columns.
input_data = (1, 25, 0, 1, 3, 36.38, 4, 158)

# Turn the tuple into a 2-D array holding exactly one row (one instance).
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Scale the sample with the scaler fitted earlier.
std_data = scalar.transform(input_data_reshaped)
print(std_data)

# Classify the standardised sample; prediction[0] is its predicted label.
prediction = classifier.predict(std_data)
print(prediction)

if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')
278
+
279
# Class probabilities for the single input sample (std_data).
# The probabilities must come from the same sample that `prediction` was
# made for; taking row 0 of the test-set probabilities would report the
# likelihood of an unrelated patient.
predicted_probabilities = classifier.predict_proba(std_data)

# Probability of the positive class (class 1 = diabetic).
diabetes_probability = predicted_probabilities[:, 1]

# Express it as a percentage.
diabetes_percentage = diabetes_probability * 100

# Report the likelihood that matches the predicted label.
# NOTE: for SVC with probability=True, predict() and predict_proba() come
# from different procedures and can occasionally disagree.
if prediction[0] == 1:
    print('Likelihood of the person to get diabetes: {:.2f}%'.format(diabetes_percentage[0]))
else:
    print('Likelihood of the person not to get diabetes: {:.2f}%'.format(100 - diabetes_percentage[0]))
293
+
294
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Define and train the model on the imputed training data.
logistic_regression_classifier = LogisticRegression()
logistic_regression_classifier.fit(X_train_imputed, Y_train)

# Accuracy on the training data.
training_data_accuracy_logistic = logistic_regression_classifier.score(X_train_imputed, Y_train)
print('Accuracy score of the training data (Logistic Regression):', training_data_accuracy_logistic)

# Accuracy on the test data.
test_data_accuracy_logistic = logistic_regression_classifier.score(X_test_imputed, Y_test)
print('Accuracy score of the test data (Logistic Regression):', test_data_accuracy_logistic)

# Predict the label of the same standardised input sample as before (std_data).
prediction_logistic = logistic_regression_classifier.predict(std_data)

# Class probabilities for that same sample. Using the test-set probabilities
# here would report the likelihood of the first test patient instead of the
# input sample.
predicted_probabilities_logistic = logistic_regression_classifier.predict_proba(std_data)

# Probability of the positive class (class 1), as a percentage.
diabetes_probability_logistic = predicted_probabilities_logistic[:, 1]
diabetes_percentage_logistic = diabetes_probability_logistic * 100

# Report the likelihood that matches the predicted label.
if prediction_logistic[0] == 1:
    print('Likelihood of the person to get diabetes (Logistic Regression): {:.2f}%'.format(diabetes_percentage_logistic[0]))
else:
    print('Likelihood of the person not to get diabetes (Logistic Regression): {:.2f}%'.format(100 - diabetes_percentage_logistic[0]))
330
+
331
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Define and train the model on the imputed training data.
random_forest_classifier = RandomForestClassifier()
random_forest_classifier.fit(X_train_imputed, Y_train)

# Accuracy on the training data.
training_data_accuracy_rf = random_forest_classifier.score(X_train_imputed, Y_train)
print('Accuracy score of the training data (Random Forest):', training_data_accuracy_rf)

# Accuracy on the test data.
test_data_accuracy_rf = random_forest_classifier.score(X_test_imputed, Y_test)
print('Accuracy score of the test data (Random Forest):', test_data_accuracy_rf)

# Predict the label of the standardised input sample (std_data).
prediction_rf = random_forest_classifier.predict(std_data)

# Class probabilities for that same sample. Using the test-set probabilities
# here would report the likelihood of the first test patient instead of the
# input sample.
predicted_probabilities_rf = random_forest_classifier.predict_proba(std_data)

# Probability of the positive class (class 1), as a percentage.
diabetes_probability_rf = predicted_probabilities_rf[:, 1]
diabetes_percentage_rf = diabetes_probability_rf * 100

# Report the likelihood that matches the predicted label.
if prediction_rf[0] == 1:
    print('Likelihood of the person to get diabetes (Random Forest): {:.2f}%'.format(diabetes_percentage_rf[0]))
else:
    print('Likelihood of the person not to get diabetes (Random Forest): {:.2f}%'.format(100 - diabetes_percentage_rf[0]))
366
+
367
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Define and train the model on the imputed training data.
gradient_boosting_classifier = GradientBoostingClassifier()
gradient_boosting_classifier.fit(X_train_imputed, Y_train)

# Accuracy on the training data.
training_data_accuracy_gb = gradient_boosting_classifier.score(X_train_imputed, Y_train)
print('Accuracy score of the training data (Gradient Boosting):', training_data_accuracy_gb)

# Accuracy on the test data.
test_data_accuracy_gb = gradient_boosting_classifier.score(X_test_imputed, Y_test)
print('Accuracy score of the test data (Gradient Boosting):', test_data_accuracy_gb)

# Predict the label of the standardised input sample (std_data).
prediction_gb = gradient_boosting_classifier.predict(std_data)

# Class probabilities for that same sample. The likelihood is reported once,
# from the input sample itself; a first report based on row 0 of the test-set
# probabilities described an unrelated patient and has been removed.
predicted_probabilities_gb = gradient_boosting_classifier.predict_proba(std_data)

# Probability of the positive class (class 1), as a percentage.
diabetes_probability_gb = predicted_probabilities_gb[:, 1]
diabetes_percentage_gb = diabetes_probability_gb * 100

# Report the likelihood that matches the predicted label.
if prediction_gb[0] == 1:
    print('Likelihood of the person to get diabetes (Gradient Boosting): {:.2f}%'.format(diabetes_percentage_gb[0]))
else:
    print('Likelihood of the person not to get diabetes (Gradient Boosting): {:.2f}%'.format(100 - diabetes_percentage_gb[0]))

# Shapes of the test labels and of the single-sample prediction.
print('Shape of Y_test:', Y_test.shape)
print('Shape of prediction_gb:', prediction_gb.shape)
427
+
428
# Libraries used by the class-weighted models and their evaluation.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from collections import Counter
from sklearn.model_selection import train_test_split

# Relies on X_train_imputed, Y_train, X_test_imputed and Y_test defined above.

# Step 1: how many training samples fall into each class.
print("Class Distribution in Training Set:", Counter(Y_train))

# Step 2: re-create both models with class_weight='balanced' and a fixed seed.
logistic_regression_classifier = LogisticRegression(
    class_weight='balanced', random_state=42
)
random_forest_classifier = RandomForestClassifier(
    class_weight='balanced', random_state=42
)

# Step 3: fit both models on the imputed training data.
logistic_regression_classifier.fit(X_train_imputed, Y_train)
random_forest_classifier.fit(X_train_imputed, Y_train)

# Step 4: accuracy on the training data.
log_reg_train_accuracy = logistic_regression_classifier.score(X_train_imputed, Y_train)
rf_train_accuracy = random_forest_classifier.score(X_train_imputed, Y_train)
print(f"Logistic Regression Training Accuracy: {log_reg_train_accuracy:.2f}")
print(f"Random Forest Training Accuracy: {rf_train_accuracy:.2f}")

# Step 5: predicted labels for the test set.
log_reg_predictions = logistic_regression_classifier.predict(X_test_imputed)
rf_predictions = random_forest_classifier.predict(X_test_imputed)
456
+
457
+ # Step 6: Evaluate Models
458
def evaluate_model(y_true, y_pred, model_name, y_score=None):
    """Print precision, recall, F1, ROC AUC and the confusion matrix.

    Args:
        y_true: True binary labels.
        y_pred: Predicted binary labels.
        model_name: Name shown in the report header.
        y_score: Optional scores for the positive class (for example
            ``predict_proba(X)[:, 1]``). When given, the AUC is computed
            from these scores; when omitted, it falls back to the hard
            labels in ``y_pred``, which evaluates the ROC curve at a single
            threshold only.
    """
    print(f"\nEvaluation for {model_name}:")
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # ROC AUC is a ranking metric, so prefer continuous scores when available.
    auc = roc_auc_score(y_true, y_pred if y_score is None else y_score)
    cm = confusion_matrix(y_true, y_pred)

    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"AUC: {auc:.2f}")
    print("Confusion Matrix:")
    print(cm)


# Evaluate Logistic Regression (AUC from positive-class probabilities)
evaluate_model(
    Y_test,
    log_reg_predictions,
    "Logistic Regression",
    y_score=logistic_regression_classifier.predict_proba(X_test_imputed)[:, 1],
)

# Evaluate Random Forest (AUC from positive-class probabilities)
evaluate_model(
    Y_test,
    rf_predictions,
    "Random Forest",
    y_score=random_forest_classifier.predict_proba(X_test_imputed)[:, 1],
)
478
+
479
+ # #cosine similarity
480
+
481
+ # from sklearn.metrics.pairwise import cosine_similarity
482
+ # import numpy as np
483
+
484
+ # # Input data (high-risk patient sample)
485
+ # diabetic_sample = np.array([1, 65, 3, 2, 3, 35.5, 7.2, 180])
486
+
487
+ # # Input from the user (new patient sample)
488
+ # test_input = input("Enter your features according to the format: gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level: ")
489
+ # user_input = np.array(test_input.split(","))
490
+ # #user_input = np.array([1, 50, 2, 1, 2, 28.5, 5.5, 160])
491
+
492
+ # # Reshaping the data to match cosine similarity input shape
493
+ # diabetic_sample = diabetic_sample.reshape(1, -1)
494
+ # user_input = user_input.reshape(1, -1)
495
+
496
+ # # Calculate cosine similarity
497
+ # similarity = cosine_similarity(diabetic_sample, user_input)[0][0]
498
+
499
+ # # Convert cosine similarity to probability (inverted logic)
500
+ # probability_of_diabetes = similarity * 100 # Inverted to reflect probability
501
+
502
+ # # Output the results
503
+ # print("Cosine Similarity:", similarity)
504
+ # print("Probability of Diabetes: {:.2f}%".format(probability_of_diabetes))
505
+
506
+ # #euclidean distance
507
+
508
+ # import numpy as np
509
+ # from math import sqrt
510
+
511
+ # # Diabetic sample features (this represents a high-risk patient)
512
+ # diabetic_sample = np.array([1, 65, 3, 2, 3, 35.5, 7.2, 180])
513
+
514
+ # # User input for testing
515
+ # user_input = input("Enter your features according to the format: gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level: ")
516
+ # user_features = np.array([float(x) for x in user_input.split(',')])
517
+
518
+ # # Euclidean distance calculation
519
+ # euclidean_distance = np.sqrt(np.sum((diabetic_sample - user_features) ** 2))
520
+ # print(f"Euclidean Distance: {euclidean_distance}")
521
+
522
+ # # Converting the Euclidean distance into a probability
523
+ # # The greater the distance, the lower the probability of diabetes
524
+ # # You can adjust the scaling factor based on your data distribution to better fit your problem
525
+
526
+ # max_distance = np.linalg.norm(diabetic_sample) # Maximum possible distance (diabetic sample vs. zero)
527
+ # probability_diabetes = (1 - (euclidean_distance / max_distance)) * 100
528
+ # probability_diabetes = max(0, min(100, probability_diabetes)) # Keep it in the 0-100 range
529
+
530
+ # print(f"Probability of Diabetes: {probability_diabetes:.2f}%")