# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1umH6P4k0xEUEZsizNZfLzFttGrqivmwq
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("/content/diabetes_prediction_dataset.csv")
df.head(10)
df.describe()
df.info()
df.isnull().sum()
print(df.duplicated().sum())
df = df.drop_duplicates()
print("________Removed Duplicate________")
print(df.duplicated().sum())
# Function to add counts on bars
def add_counts(ax):
    for p in ax.patches:
        ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points')
# Set up the matplotlib figure
fig, axes = plt.subplots(3, 2, figsize=(15, 15))
# Plot gender grouped by diabetes
ax = sns.countplot(ax=axes[0,0], x='gender', hue='diabetes', data=df)
ax.set_title('Gender Grouped by Diabetes')
add_counts(ax)
# Plot hypertension grouped by diabetes
ax = sns.countplot(ax=axes[0,1], x='hypertension', hue='diabetes', data=df)
ax.set_title('Hypertension Grouped by Diabetes')
add_counts(ax)
# Plot heart disease grouped by diabetes
ax = sns.countplot(ax=axes[1,0], x='heart_disease', hue='diabetes', data=df)
ax.set_title('Heart Disease Grouped by Diabetes')
add_counts(ax)
# Plot smoking history grouped by diabetes
ax = sns.countplot(ax=axes[1,1], x='smoking_history', hue='diabetes', data=df)
ax.set_title('Smoking History Grouped by Diabetes')
add_counts(ax)
# Plot diabetes
ax = sns.countplot(ax=axes[2, 0], x='diabetes', data=df)
axes[2, 0].set_title('Diabetes Count')
add_counts(ax)
# Create pie plot for diabetes
diabetes_counts = df['diabetes'].value_counts()
axes[2, 1].pie(diabetes_counts, labels=diabetes_counts.index, autopct='%1.1f%%', startangle=90)
axes[2, 1].set_title('Diabetes Distribution')
axes[2, 1].axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
axes[2, 1].legend(title='Diabetes:', loc='upper right')
# Adjust the layout
plt.tight_layout()
# Show the plots
plt.show()
# Calculate minimum, maximum, and average age
min_age = df['age'].min()
max_age = df['age'].max()
avg_age = df['age'].mean()
# Count of individuals with and without diabetes
diabetes_counts = df['diabetes'].value_counts()
# Group by diabetes status and calculate min and max ages
grouped_ages = df.groupby('diabetes')['age'].agg(['min', 'max'])
# Print the results
print("Minimum Age:", min_age)
print("Maximum Age:", max_age)
print("Average Age:", avg_age)
print(diabetes_counts)
print("Age Statistics by Diabetes Status:")
print(grouped_ages)
# Plotting
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
# Plot for overall min, max, and average age
bars = ax[0].bar(['Min Age', 'Max Age', 'Avg Age'], [min_age, max_age, avg_age], color=['blue', 'red', 'green'])
ax[0].set_title('Overall Age Statistics')
ax[0].set_ylabel('Age')
# Annotate bars with their values
for bar in bars:
    yval = bar.get_height()
    ax[0].text(bar.get_x() + bar.get_width()/2, yval, round(yval, 2), va='bottom')  # Add text to the top of the bars
# Plot for min and max ages grouped by diabetes status
grouped_bars = grouped_ages.plot(kind='bar', ax=ax[1])
ax[1].set_title('Age Statistics by Diabetes Status')
ax[1].set_ylabel('Age')
# Annotate bars with their values
for p in grouped_bars.patches:
    grouped_bars.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
plt.tight_layout()
plt.show()
cross_table = pd.crosstab(df['diabetes'], df['smoking_history'])
# Create subplots
fig, ax = plt.subplots(1, 2, figsize=(20, 8))
# Plotting the cross table as a heatmap
sns.heatmap(cross_table, cmap='YlOrRd', annot=True, fmt='d', linewidths=0.5, linecolor='black', ax=ax[0])
ax[0].set_title('Diabetes and Smoking History (Heatmap)')
ax[0].set_xlabel('Smoking History')
ax[0].set_ylabel('Diabetes')
# Plotting the cross table with separate bars for smoking history
cross_table.plot(kind='bar', stacked=False, ax=ax[1], color=plt.cm.Paired.colors)
ax[1].set_title('Diabetes and Smoking History (Bar Plot)')
ax[1].set_xlabel('Diabetes')
ax[1].set_ylabel('Count')
ax[1].legend(title='Smoking History', bbox_to_anchor=(1.05, 1), loc='upper left')
# Annotate bars with their values
for container in ax[1].containers:
    ax[1].bar_label(container)
plt.tight_layout()
plt.show()
# Encode the categorical columns
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['smoking_history'] = le.fit_transform(df['smoking_history'])
df.head()
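# Note: reusing one LabelEncoder instance overwrites its fitted classes, so the
# gender mapping is lost once smoking_history is encoded. A minimal sketch
# (hypothetical `encoders` dict) that keeps a fitted encoder per column for
# reuse at inference time; re-running it here is a numeric no-op because the
# columns are already integer-encoded:
encoders = {}
for col in ['gender', 'smoking_history']:
    enc = LabelEncoder()
    df[col] = enc.fit_transform(df[col])
    encoders[col] = enc
print({col: list(enc.classes_) for col, enc in encoders.items()})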
# Assume df is your DataFrame
# Selecting features and target variable
features = ['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level']
X = df[features]
Y = df['diabetes']
# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Applying PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)
# Plotting the cumulative explained variance
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1),
         pca.explained_variance_ratio_.cumsum(), marker='o', linestyle='--')
plt.title('Explained Variance by Number of Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid()
# Find the index of the maximum cumulative explained variance
max_index = pca.explained_variance_ratio_.cumsum().argmax()
# Annotate the point with the highest cumulative explained variance
plt.annotate(f'Max: PC {max_index + 1}',
             xy=(max_index + 1, pca.explained_variance_ratio_.cumsum()[max_index]),
             xytext=(max_index + 2, pca.explained_variance_ratio_.cumsum()[max_index] - 0.05),
             arrowprops=dict(facecolor='black', arrowstyle='->', color='black'))
plt.show()
# Printing explained variance ratios
for i, ratio in enumerate(pca.explained_variance_ratio_.cumsum()):
    print(f'Principal Component {i+1}: {ratio:.4f} cumulative explained variance')
# Note: the cumulative sum is monotonically increasing, so argmax always points
# at the last component; n_components therefore equals the full feature count
# and this PCA step performs no actual dimensionality reduction.
n_components = max_index + 1
# Applying PCA with the optimal number of components
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)
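# A hedged alternative (95% is an illustrative threshold): scikit-learn's PCA
# accepts a float n_components and keeps the smallest number of components whose
# cumulative explained variance reaches that threshold, which does reduce the
# dimensionality.
pca_95 = PCA(n_components=0.95)
X_pca_95 = pca_95.fit_transform(X_scaled)
print(f'Components retained for 95% variance: {pca_95.n_components_}')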
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)
# Initializing and training the XGBoost model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
#Making predictions on the test set
y_pred = xgb_model.predict(X_test)
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f'XGBoost Accuracy: {accuracy:.4f}')
print(f'XGBoost Classification Report:\n{classification_report(y_test, y_pred)}')
# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No Diabetes', 'Diabetes'], yticklabels=['No Diabetes', 'Diabetes'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
import pickle
# Save the model
with open('Diabetes_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)
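# The saved classifier expects standardized, PCA-transformed input, so the
# fitted scaler and PCA should be persisted alongside it (file names below are
# illustrative):
with open('diabetes_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('diabetes_pca.pkl', 'wb') as f:
    pickle.dump(pca, f)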
# Prepare custom data
custom_data = [
[1, 45, 0, 0, 1, 25.6, 6.5, 110],
[0, 35, 1, 0, 0, 28.2, 7.2, 130],
[1, 55, 1, 1, 1, 31.4, 8.0, 150],
[0, 42, 0, 1, 0, 26.9, 7.0, 120],
[1, 50, 1, 0, 1, 29.7, 7.8, 140]
]
# Convert to pandas DataFrame
custom_df = pd.DataFrame(custom_data, columns=features)
# Standardize the custom data
custom_X = scaler.transform(custom_df[features])
# Apply PCA transformation
custom_X_pca = pca.transform(custom_X)
# Make predictions using the trained XGBoost model
custom_predictions = xgb_model.predict(custom_X_pca)
# Print the predictions
for i, pred in enumerate(custom_predictions):
    if pred == 0:
        print(f"Person {i+1} is not predicted to have diabetes.")
    else:
        print(f"Person {i+1} is predicted to have diabetes.")
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Step 1: Select the hypertension features/target and split the data.
# (The earlier X/Y targeted diabetes; here the target is 'hypertension' and
# 'diabetes' becomes a feature, matching the custom-data columns used below.)
features_ht = ['gender', 'age', 'diabetes', 'heart_disease', 'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level']
X_ht = df[features_ht]
Y_ht = df['hypertension']
X_train, X_test, y_train, y_test = train_test_split(X_ht, Y_ht, test_size=0.2, random_state=42)
# Step 2: Instantiate the classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# Step 3: Train the model
xgb_clf.fit(X_train, y_train)
# Step 4: Make predictions
y_pred = xgb_clf.predict(X_test)
# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2f}")
# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No Hypertension', 'Hypertension'], yticklabels=['No Hypertension', 'Hypertension'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
import pickle
# Save the model (xgb_clf is the hypertension classifier trained above;
# the earlier xgb_model is the diabetes classifier)
with open('hypertension_model.pkl', 'wb') as f:
    pickle.dump(xgb_clf, f)
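# A minimal sketch of loading the pickled model back for inference:
with open('hypertension_model.pkl', 'rb') as f:
    loaded_ht_model = pickle.load(f)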
features = ['gender', 'age', 'diabetes', 'heart_disease', 'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level']
customs_data = [
[1, 45, 0, 0, 1, 25.6, 6.5, 110],
[0, 35, 1, 0, 0, 28.2, 7.2, 130],
[1, 55, 1, 1, 1, 31.4, 8.0, 150],
[0, 42, 1, 1, 0, 26.9, 7.0, 120],
[1, 50, 1, 0, 1, 29.7, 7.8, 140]
]
custom_df = pd.DataFrame(customs_data, columns=features)
custom_predictions = xgb_clf.predict(custom_df)
for i, pred in enumerate(custom_predictions):
    if pred == 0:
        print(f"Person {i+1} is not predicted to have hypertension.")
    else:
        print(f"Person {i+1} is predicted to have hypertension.")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import cv2
import io
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
print(tf.__version__)
import kagglehub
# Download latest version
path = kagglehub.dataset_download("borhanitrash/alzheimer-mri-disease-classification-dataset")
print("Path to dataset files:", path)
train = '/content/train-00000-of-00001-c08a401c53fe5312.parquet'
test = '/content/test-00000-of-00001-44110b9df98c5585.parquet'
categorias = {
0: 'Mild_Demented',
1: 'Moderate_Demented',
2: 'Non_Demented',
3: 'Very_Mild_Demented'
}
data_train = pd.read_parquet(train)
data_test = pd.read_parquet(test)
data_train.head()
img_dict = data_train['image'][0]
byte_string = img_dict['bytes']
nparr = np.frombuffer(byte_string, np.uint8)
img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
image = Image.open(io.BytesIO(byte_string))
display(image)
def extraccion_y_transformacion(images_set):
    """Decode the raw image bytes in the 'image' column into numpy arrays."""
    et_list_images = []
    images_bytes = images_set['image']
    for img_dict in images_bytes:
        byte_string = img_dict['bytes']
        nparr = np.frombuffer(byte_string, np.uint8)
        # Decode as grayscale: the CNN below expects single-channel (128, 128, 1)
        # input, and IMREAD_COLOR would yield three channels here.
        img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)
        et_list_images.append(img)
    return et_list_images
def visualizar_imagenes(image_set, categorias, limit=5):
    fig, axes = plt.subplots(1, limit, figsize=(10, 5))
    for i, (ax, row) in enumerate(zip(axes, image_set.iterrows())):
        img_dict = row[1]['image']
        label = row[1]['label']
        name = categorias[label]
        byte_string = img_dict['bytes']
        image = Image.open(io.BytesIO(byte_string))
        ax.imshow(image, cmap='gray')
        ax.set_title(name)
        ax.axis('off')
        if i + 1 == limit:
            break
    plt.tight_layout()
    plt.show()
train_transformado = extraccion_y_transformacion(data_train)
test_transformado = extraccion_y_transformacion(data_test)
print(train_transformado[:1])
visualizar_imagenes(data_train, categorias, limit=5)
y_test = []
for label in data_test['label']:
    y_test.append(label)
y_train = []
for label in data_train['label']:
    y_train.append(label)
y_train = np.array(y_train)
y_test = np.array(y_test)
unique, counts = np.unique(y_train, return_counts=True)
plt.bar(unique, counts)
plt.xlabel('Classes')
plt.ylabel('Count')
plt.title('Class distribution')
plt.xticks(unique)
plt.show()
y_train = tf.one_hot(y_train.astype(np.int32), depth=4)
y_test = tf.one_hot(y_test.astype(np.int32), depth=4)
y_train
train_transformado = np.array(train_transformado) / 255
test_transformado = np.array(test_transformado) / 255
train_transformado = [np.expand_dims(img, axis=-1) for img in train_transformado]  # add the grayscale channel
test_transformado = [np.expand_dims(img, axis=-1) for img in test_transformado]
train_transformado = np.array(train_transformado)
test_transformado = np.array(test_transformado)
train_transformado[0].shape
train_transformado.shape
class MinMaxScaler3D(MinMaxScaler):
    def fit_transform(self, X, y=None):
        x = np.reshape(X, newshape=(X.shape[0] * X.shape[1], X.shape[2]))
        return np.reshape(super().fit_transform(x, y=y), newshape=X.shape)
scaler = MinMaxScaler3D()
train_scaled = [scaler.fit_transform(X=img) for img in train_transformado]
train_scaled = np.array(train_scaled)
test_scaled = [scaler.fit_transform(X=img) for img in test_transformado]
test_scaled = np.array(test_scaled)
train_scaled.shape
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from tensorflow.keras import Sequential, initializers
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
optimizer = Adam(
    learning_rate=0.001,  # learning rate
    beta_1=0.9,           # first-moment decay rate
    beta_2=0.999,         # second-moment decay rate
    epsilon=1e-07         # smoothing term
)
model = Sequential([
    layers.Input(shape=(128, 128, 1)),
    layers.Conv2D(64, kernel_size=(2, 2), activation='relu', kernel_initializer=initializers.HeNormal(seed=42), padding='same'),
    # layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # layers.Dropout(0.25),
    layers.Conv2D(64, kernel_size=(2, 2), activation='relu', kernel_initializer=initializers.HeNormal(seed=42), padding='same'),
    # layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # layers.Dropout(0.25),
    layers.Conv2D(128, kernel_size=(3, 3), activation='relu', kernel_initializer=initializers.HeNormal(seed=42), padding='same'),
    # layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dropout(0.25),
    layers.Dense(256, activation='relu'),
    layers.Dense(len(categorias), activation='softmax')
])
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=["accuracy"])
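# Quick sanity check on the architecture and parameter count (standard Keras API):
model.summary()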
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        # Guard with a default so a missing 'accuracy' key cannot raise
        if logs.get('accuracy', 0) > 0.995:
            print("\nReached 99.5% accuracy, so cancelling training!")
            self.model.stop_training = True

callbacks = myCallback()
history = model.fit(
    train_scaled,
    y_train,
    batch_size=10,
    epochs=20,
    validation_split=0.1,
    callbacks=[callbacks]
)
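# A hedged alternative to the custom callback: Keras's built-in EarlyStopping
# stops on stalled validation loss and can restore the best weights (patience=3
# is an illustrative choice; not wired into the fit call above).
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)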
plt.plot(history.history['loss'], label='Train loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.plot(history.history['accuracy'], label='Train accuracy')
plt.plot(history.history['val_accuracy'], label='Validation accuracy')
plt.legend()
plt.title('Training and validation loss and accuracy per epoch')
plt.show()
history.model.layers
w, b = history.model.layers[0].get_weights()
b.shape
test_loss, test_acc = model.evaluate(test_scaled, y_test, verbose=2)
print(f'Test accuracy: {test_acc}')
predictions = model.predict(test_scaled)
predictions[0]
np.argmax(predictions[0])
data_test['label'][0]
from sklearn.metrics import classification_report
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test, axis=1)
report = classification_report(true_classes, predicted_classes)
print(report)
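# Mirroring the earlier models, a confusion matrix for the CNN (reusing the
# predicted/true class arrays from the report above):
cnn_conf_matrix = confusion_matrix(true_classes, predicted_classes)
plt.figure(figsize=(8, 6))
sns.heatmap(cnn_conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=list(categorias.values()), yticklabels=list(categorias.values()))
plt.title('CNN Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()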
def plot_image(i, predictions_array, true_label, img):
    predictions_array, true_label, img = predictions_array, true_label[i], img[i]
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(img, cmap=plt.cm.binary)
    predicted_label = np.argmax(predictions_array)
    if predicted_label == true_label:
        color = 'blue'
    else:
        color = 'red'
    plt.xlabel("{} {:2.0f}% ({})".format(categorias[predicted_label],
                                         100 * np.max(predictions_array),
                                         categorias[true_label]),
               color=color)
def plot_value_array(i, predictions_array, true_label):
    predictions_array, true_label = predictions_array, true_label[i]
    plt.grid(False)
    plt.xticks(range(4))
    plt.yticks([])
    thisplot = plt.bar(range(4), predictions_array, color="#777777")
    plt.ylim([0, 1])
    predicted_label = np.argmax(predictions_array)
    thisplot[predicted_label].set_color('red')
    thisplot[true_label].set_color('blue')
i = 0
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, predictions[i], np.argmax(y_test, axis=1), test_scaled)
plt.subplot(1,2,2)
plot_value_array(i, predictions[i], np.argmax(y_test, axis=1))
plt.show()
i = 8
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, predictions[i], np.argmax(y_test, axis=1), test_scaled)
plt.subplot(1,2,2)
plot_value_array(i, predictions[i], np.argmax(y_test, axis=1))
plt.show()
num_rows = 4
num_cols = 3
num_images = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
    plt.subplot(num_rows, 2 * num_cols, 2 * i + 1)
    plot_image(i, predictions[i], np.argmax(y_test, axis=1), test_scaled)
    plt.subplot(num_rows, 2 * num_cols, 2 * i + 2)
    plot_value_array(i, predictions[i], np.argmax(y_test, axis=1))
plt.tight_layout()
plt.show()
!pip install streamlit pyngrok  # pyngrok is the package used below, not the unrelated 'ngrok' package
with open("app.py", "w") as file:
file.write("""
# Streamlit Multi-Page App for Hypertension and Diabetes Prediction
import streamlit as st
from streamlit_option_menu import option_menu
import pandas as pd
import numpy as np
# Placeholder models (replace with actual models trained in the notebook)
class PlaceholderModel:
def predict(self, X):
return np.random.choice([0, 1], size=(len(X),))
diabetes_model = PlaceholderModel()
hypertension_model = PlaceholderModel()
# Streamlit App Pages
st.set_page_config(page_title="Health Prediction App", layout="wide")
# Sidebar Navigation
with st.sidebar:
selected = option_menu(
"Navigation", ["Home", "Hypertension", "Diabetes"],
icons=["house", "activity", "heart"],
menu_icon="menu-app", default_index=0
)
if selected == "Home":
st.title("Health Prediction App")
st.write("Select the prediction model from the sidebar to get started.")
elif selected == "Hypertension":
st.title("Hypertension Prediction")
# Input form for Hypertension
age = st.number_input("Age", min_value=0, max_value=120, value=30)
systolic_bp = st.number_input("Systolic Blood Pressure", min_value=50, max_value=250, value=120)
diastolic_bp = st.number_input("Diastolic Blood Pressure", min_value=30, max_value=150, value=80)
cholesterol = st.number_input("Cholesterol Level", min_value=50, max_value=400, value=200)
smoking = st.selectbox("Smoking Status", ("Non-Smoker", "Former Smoker", "Current Smoker"))
activity = st.selectbox("Physical Activity Level", ("Low", "Moderate", "High"))
smoking_encoded = {"Non-Smoker": 0, "Former Smoker": 1, "Current Smoker": 2}[smoking]
activity_encoded = {"Low": 0, "Moderate": 1, "High": 2}[activity]
data = pd.DataFrame({
'Age': [age],
'SystolicBP': [systolic_bp],
'DiastolicBP': [diastolic_bp],
'Cholesterol': [cholesterol],
'SmokingStatus': [smoking_encoded],
'PhysicalActivity': [activity_encoded]
})
st.write("Input Data:", data)
if st.button("Predict Hypertension"):
prediction = hypertension_model.predict(data)
st.subheader("Prediction Result")
st.write("Hypertension Detected" if prediction[0] == 1 else "No Hypertension Detected")
elif selected == "Diabetes":
st.title("Diabetes Prediction")
# Input form for Diabetes
pregnancies = st.number_input("Pregnancies", min_value=0, max_value=20, value=1)
glucose = st.number_input("Glucose Level", min_value=0, max_value=300, value=100)
blood_pressure = st.number_input("Blood Pressure", min_value=0, max_value=200, value=80)
skin_thickness = st.number_input("Skin Thickness", min_value=0, max_value=100, value=20)
insulin = st.number_input("Insulin Level", min_value=0, max_value=900, value=30)
bmi = st.number_input("BMI", min_value=0.0, max_value=70.0, value=25.0)
dpf = st.number_input("Diabetes Pedigree Function", min_value=0.0, max_value=3.0, value=0.5)
age = st.number_input("Age", min_value=0, max_value=120, value=30)
data = pd.DataFrame({
'Pregnancies': [pregnancies],
'Glucose': [glucose],
'BloodPressure': [blood_pressure],
'SkinThickness': [skin_thickness],
'Insulin': [insulin],
'BMI': [bmi],
'DiabetesPedigreeFunction': [dpf],
'Age': [age]
})
st.write("Input Data:", data)
if st.button("Predict Diabetes"):
prediction = diabetes_model.predict(data)
st.subheader("Prediction Result")
st.write("Diabetes Detected" if prediction[0] == 1 else "No Diabetes Detected")
""")
!pip install pyngrok
!ngrok config add-authtoken <YOUR_NGROK_AUTHTOKEN>  # replace with your own token; never commit a real one
!pip install streamlit-option-menu
from pyngrok import ngrok
!streamlit run app.py &>/dev/null&
public_url = ngrok.connect(8501)
print(f"Streamlit app is live at {public_url}")