"""Streamlit app: explore a diabetes dataset, evaluate a model, and predict.

The script loads ``diabetes_prediction_dataset.csv``, label-encodes the
categorical columns, trains a random forest on standardized features, and
renders three tabs: a dataset preview, model performance plots, and an
interactive single-patient prediction form.
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc

DATA_PATH = "diabetes_prediction_dataset.csv"
TARGET_COLUMN = "diabetes"
CATEGORICAL_COLUMNS = ["gender", "smoking_history"]
NUMERIC_COLUMNS = ["age", "bmi", "HbA1c_level", "blood_glucose_level"]
BINARY_FEATURE_COLUMNS = ["hypertension", "heart_disease"]


@st.cache_data
def load_data():
    """Read the diabetes CSV from ``DATA_PATH`` and return it as a DataFrame."""
    return pd.read_csv(DATA_PATH)


@st.cache_resource
def train_pipeline():
    """Encode the data, fit the scaler and model, and evaluate on the test split.

    Streamlit re-executes the whole script on every widget interaction, so the
    expensive steps live here behind ``st.cache_resource`` and run only once.

    Returns:
        dict with keys:
            ``df``: the dataset with categorical columns label-encoded,
            ``label_encoders``: column name -> fitted ``LabelEncoder``,
            ``scaler``: ``StandardScaler`` fitted on the training features,
            ``rf``: fitted ``RandomForestClassifier``,
            ``X_test_scaled`` / ``y_test``: the held-out split,
            ``y_pred`` / ``y_proba``: test-set predictions and the
            predicted probability of the positive class.

    The returned objects are shared between reruns; callers must treat them
    as read-only.
    """
    data = load_data()  # st.cache_data hands back a private copy, safe to edit

    encoders = {}
    for col in CATEGORICAL_COLUMNS:
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col])
        encoders[col] = encoder

    features = data.drop(columns=[TARGET_COLUMN])
    target = data[TARGET_COLUMN]  # keep the original 0/1 format
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    fitted_scaler = StandardScaler()
    X_train_scaled = fitted_scaler.fit_transform(X_train)
    X_test_scaled = fitted_scaler.transform(X_test)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

    return {
        "df": data,
        "label_encoders": encoders,
        "scaler": fitted_scaler,
        "rf": model,
        "X_test_scaled": X_test_scaled,
        "y_test": y_test,
        "y_pred": model.predict(X_test_scaled),
        "y_proba": model.predict_proba(X_test_scaled)[:, 1],
    }


def _show_figure(fig):
    """Render *fig* in the app, then close it so figures do not pile up."""
    st.pyplot(fig)
    plt.close(fig)


st.title("🩺 Diabetes Prediction App")

artifacts = train_pipeline()
df = artifacts["df"]
label_encoders = artifacts["label_encoders"]
scaler = artifacts["scaler"]
rf = artifacts["rf"]
X_test_scaled = artifacts["X_test_scaled"]
y_test = artifacts["y_test"]
y_pred = artifacts["y_pred"]

# Convert binary features (0, 1) to "Yes" and "No" for display only.
# Work on a copy: ``df`` is the shared cached object.
binary_columns = BINARY_FEATURE_COLUMNS + [TARGET_COLUMN]
df_display = df.copy()
for col in binary_columns:
    df_display[col] = df_display[col].map({0: "No", 1: "Yes"})

# Feature matrix / target in the same column order the scaler was fitted on.
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

tab1, tab2, tab3 = st.tabs(
    ["📄 Dataset Preview", "📈 Model Performance", "🩺 Prediction"]
)

# Tab 1: dataset preview
with tab1:
    st.subheader("📄 Complete Dataset Preview")
    st.write(df_display)  # binary columns shown as Yes/No

    st.subheader("📊 Correlation Heatmap")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    _show_figure(fig)

# Tab 2: model performance
with tab2:
    st.subheader("📈 Model Performance")

    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"### ⚡ Random Forest Accuracy: **{accuracy:.2f}**")

    st.write("### 📊 Confusion Matrix")
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["No Diabetes", "Diabetes"],
        yticklabels=["No Diabetes", "Diabetes"],
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    _show_figure(fig)

    st.write("### 📉 ROC Curve")
    fpr, tpr, _ = roc_curve(y_test, artifacts["y_proba"])
    roc_auc = auc(fpr, tpr)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(
        fpr,
        tpr,
        color="darkorange",
        lw=2,
        label=f"ROC curve (area = {roc_auc:.2f})",
    )
    ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    ax.legend(loc="lower right")
    _show_figure(fig)

# Tab 3: single-patient prediction
with tab3:
    st.subheader("🩺 Make a Prediction")

    user_name = st.text_input("Patient Name", value="John Doe")
    user_gender = st.selectbox(
        "Gender", label_encoders["gender"].classes_, key="gender_input"
    )
    user_smoking = st.selectbox(
        "Smoking History",
        label_encoders["smoking_history"].classes_,
        key="smoking_input",
    )

    # Collect inputs keyed by column name so the row can be aligned with the
    # training columns below instead of depending on list position.
    user_inputs = {
        "gender": label_encoders["gender"].transform([user_gender])[0],
        "smoking_history": label_encoders["smoking_history"].transform(
            [user_smoking]
        )[0],
    }

    for col in NUMERIC_COLUMNS:
        user_inputs[col] = st.number_input(
            f"Enter {col}",
            float(df[col].min()),
            float(df[col].max()),
            float(df[col].mean()),
        )

    for col in BINARY_FEATURE_COLUMNS:
        answer = st.radio(
            f"{col.replace('_', ' ').title()} (Yes/No)", ["No", "Yes"]
        )
        user_inputs[col] = 1 if answer == "Yes" else 0

    # The scaler and model were fitted on X's column order; re-order the
    # single-row frame to match it exactly before transforming.
    missing_columns = [col for col in X.columns if col not in user_inputs]
    if missing_columns:
        st.error(
            "Cannot predict: no input is collected for dataset column(s) "
            + ", ".join(missing_columns)
        )
    else:
        user_data = pd.DataFrame([user_inputs]).reindex(columns=X.columns)

        if st.button("🔮 Predict"):
            user_data_scaled = scaler.transform(user_data)

            prediction = rf.predict(user_data_scaled)
            # Probability of the positive (diabetes) class.
            probability = rf.predict_proba(user_data_scaled)[:, 1][0]

            st.subheader(f"🤖 Prediction for {user_name}")
            if prediction[0] == 1:
                st.error(f"🚨 **{user_name} is likely to have diabetes.** (Probability: {probability:.2f})")
            else:
                st.success(f"✅ **{user_name} is not likely to have diabetes.** (Probability: {probability:.2f})")