3v324v23 committed on
Commit
b797e49
·
1 Parent(s): aebf59c
Files changed (3) hide show
  1. app.py +137 -0
  2. diabetes_prediction_dataset.csv +0 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
8
+ from sklearn.ensemble import RandomForestClassifier
9
+ from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
10
+
11
st.title("๐Ÿฉบ Diabetes Prediction App")


@st.cache_data
def load_data():
    """Read the diabetes dataset CSV and return it as a DataFrame.

    The file name is resolved relative to the current working directory.
    The function is decorated with ``st.cache_data`` so Streamlit can reuse
    the loaded frame instead of re-reading the file.
    """
    return pd.read_csv("diabetes_prediction_dataset.csv")


df = load_data()
21
+
22
# Integer-encode the categorical columns in place. Each encoder is fitted on
# the raw column first and kept in `label_encoders` (keyed by column name) so
# the same category-to-code mapping can be reused when reading user input.
categorical_columns = ["gender", "smoking_history"]
label_encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.transform(df[name])

# Display-only copy of the data in which the 0/1 flag columns read "No"/"Yes";
# `df` itself keeps the numeric values.
binary_columns = ["hypertension", "heart_disease", "diabetes"]
yes_no = {0: "No", 1: "Yes"}
df_display = df.assign(**{name: df[name].map(yes_no) for name in binary_columns})
34
+
35
# Separate the features from the target; the target keeps its 0/1 coding.
X = df.drop(columns=["diabetes"])
y = df["diabetes"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


@st.cache_resource
def _fit_scaler_and_forest(_features, _target):
    """Fit the feature scaler and the random forest once, then reuse them.

    Streamlit re-executes the whole script on every widget interaction, so
    without caching the forest would be refit each time the user touches an
    input. The leading underscore on each parameter tells Streamlit not to
    hash that argument; the cached result is therefore keyed on this function
    alone. That assumes the training data does not change while the app is
    running.

    Args:
        _features: Training feature frame the scaler and model are fitted on.
        _target: Training labels aligned with ``_features``.

    Returns:
        A ``(scaler, model)`` tuple: the fitted ``StandardScaler`` and the
        ``RandomForestClassifier`` trained on the scaled features.
    """
    fitted_scaler = StandardScaler().fit(_features)
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(fitted_scaler.transform(_features), _target)
    return fitted_scaler, forest


# The scaler is fitted on the training split only and then applied to both
# splits, exactly as a fit_transform/transform pair would do.
scaler, rf = _fit_scaler_and_forest(X_train, y_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
49
+
50
# One tab per view of the app.
tab1, tab2, tab3 = st.tabs(["๐Ÿ“„ Dataset Preview", "๐Ÿ“ˆ Model Performance", "๐Ÿฉบ Prediction"])

# Tab 1: dataset preview and feature correlations.
with tab1:
    st.subheader("๐Ÿ“„ Complete Dataset Preview")
    st.write(df_display)  # Yes/No version of the flag columns for readability

    st.subheader("๐Ÿ“Š Correlation Heatmap")
    # Draw on an explicit Figure and hand that Figure to Streamlit instead of
    # the pyplot module, then close it so figures do not pile up in memory
    # across script reruns.
    corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=corr_ax)
    st.pyplot(corr_fig)
    plt.close(corr_fig)
62
+
63
# Tab 2: hold-out evaluation of the trained model.
with tab2:
    st.subheader("๐Ÿ“ˆ Model Performance")

    # Accuracy on the test split.
    y_pred = rf.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"### โšก Random Forest Accuracy: **{accuracy:.2f}**")

    # Confusion matrix. Each plot gets its own Figure, which is passed to
    # Streamlit explicitly and closed afterwards so figures do not accumulate
    # across script reruns.
    st.write("### ๐Ÿ“Š Confusion Matrix")
    cm = confusion_matrix(y_test, y_pred)
    class_labels = ["No Diabetes", "Diabetes"]
    cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=cm_ax,
    )
    cm_ax.set_xlabel("Predicted")
    cm_ax.set_ylabel("Actual")
    st.pyplot(cm_fig)
    plt.close(cm_fig)

    # ROC curve, scored with the predicted probability of the positive class.
    st.write("### ๐Ÿ“‰ ROC Curve")
    fpr, tpr, _ = roc_curve(y_test, rf.predict_proba(X_test_scaled)[:, 1])
    roc_auc = auc(fpr, tpr)
    roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    # Diagonal reference line from (0, 0) to (1, 1).
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    roc_ax.legend(loc="lower right")
    st.pyplot(roc_fig)
    plt.close(roc_fig)
93
+
94
# Tab 3: interactive single-patient prediction.
with tab3:
    st.subheader("๐Ÿฉบ Make a Prediction")

    # Free-text name (used only in the result message) and categorical inputs.
    user_name = st.text_input("Patient Name", value="John Doe")
    user_gender = st.selectbox("Gender", label_encoders["gender"].classes_, key="gender_input")
    user_smoking = st.selectbox("Smoking History", label_encoders["smoking_history"].classes_, key="smoking_input")

    # Collect every feature under its column name. The order in which the
    # widgets are shown is a UI choice and must not decide the order of the
    # feature vector.
    user_features = {
        "gender": label_encoders["gender"].transform([user_gender])[0],
        "smoking_history": label_encoders["smoking_history"].transform([user_smoking])[0],
    }

    # Numeric inputs, bounded by the observed range and defaulting to the mean.
    for col in ["age", "bmi", "HbA1c_level", "blood_glucose_level"]:
        user_features[col] = st.number_input(
            f"Enter {col}",
            float(df[col].min()),
            float(df[col].max()),
            float(df[col].mean()),
        )

    # Yes/No inputs, converted to the 0/1 coding used in the training data.
    for col in ["hypertension", "heart_disease"]:
        answer = st.radio(f"{col.replace('_', ' ').title()} (Yes/No)", ["No", "Yes"])
        user_features[col] = 1 if answer == "Yes" else 0

    # Build a one-row frame in the exact column order the scaler and model
    # were fitted on. Selecting by X.columns raises a KeyError if the form
    # fails to supply a training feature, instead of silently feeding values
    # into the wrong columns.
    user_data = pd.DataFrame([user_features])[list(X.columns)]

    if st.button("๐Ÿ”ฎ Predict"):
        user_data_scaled = scaler.transform(user_data)

        prediction = rf.predict(user_data_scaled)
        # Probability of the positive class for the single input row.
        probability = rf.predict_proba(user_data_scaled)[:, 1][0]

        st.subheader(f"๐Ÿค– Prediction for {user_name}")
        if prediction[0] == 1:
            st.error(f"๐Ÿšจ **{user_name} is likely to have diabetes.** (Probability: {probability:.2f})")
        else:
            st.success(f"โœ… **{user_name} is not likely to have diabetes.** (Probability: {probability:.2f})")
diabetes_prediction_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ matplotlib
5
+ seaborn
6
+ scikit-learn