Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import joblib | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
| from plotly import graph_objects as go | |
# Lay the app out with Streamlit's "wide" page layout.
st.set_page_config(
    layout="wide",
)
# Load Dataset
def load_data():
    """Read the project dataset from Excel and return it without its ID column.

    Returns:
        pandas.DataFrame: the contents of
        ``Model Training/colelithiasis_dataset.xlsx`` with the
        ``'Patient No.'`` column removed.
    """
    # Update with your dataset file path
    raw = pd.read_excel(r'Model Training/colelithiasis_dataset.xlsx')
    return raw.drop(columns=['Patient No.'])
# Initialize Session State
# Populate the "data" entry only when it is absent, so load_data() is not
# called again while the entry already exists.
if "data" not in st.session_state:
    st.session_state["data"] = load_data()
def introduction_page():
    """Render the "Introduction" page: a title followed by static Markdown."""
    overview_md = """
    ## Project Overview
    This project analyzes the Colelithiasis dataset to perform exploratory data analysis (EDA) and prediction using pre-trained machine learning models. The goal is to provide insights into the data and make predictions efficiently.
    ## Objectives
    - Perform EDA to uncover patterns and insights.
    - Use pre-trained machine learning models for predictions.
    - Create an interactive Streamlit application.
    """
    st.title("Introduction")
    st.markdown(overview_md)
def stats_page():
    """Render the descriptive-statistics page.

    Shows the first rows of the session dataset, its ``describe()`` summary,
    and a heatmap of the correlation matrix computed on an encoded copy of
    the data (the dataset stored in ``st.session_state`` is not modified).
    """
    st.title("Exploratory Data Analysis")

    # Dataset Overview
    st.subheader("Dataset Overview")
    st.dataframe(st.session_state.data.head())

    # Summary Statistics
    st.subheader("Summary Statistics")
    st.write(st.session_state.data.describe())

    # Correlation Matrix
    st.subheader("Correlation Analysis")

    # Encode on a copy so the session dataset keeps its original labels.
    data = st.session_state.data.copy()

    # Encode the target variable. The result is assigned back to the column:
    # calling replace(..., inplace=True) on the selected column is a chained
    # update that does not reach `data` under pandas Copy-on-Write.
    data['Health_status'] = (
        data['Health_status']
        .replace({'healthy': 0, 'patient': 1})
        .infer_objects()
    )

    # Apply the saved encoder to the categorical columns. A forward slash is
    # used (as in load_data) because a backslash inside a normal string
    # literal is an escape character and only works as a separator on Windows.
    categorical_columns = ['Gender', 'Family history', 'Obese/non obese']
    encoder = joblib.load('Model Training/encoder.pkl')
    data[categorical_columns] = encoder.transform(data[categorical_columns])

    correlation = data.corr()

    fig = plt.figure(figsize=(5, 3))
    # reduce the font size of the heatmap
    sns.set(font_scale=0.5)
    # heatmap draws on the current figure, which is `fig` created just above.
    sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt=".2f")
    # Hand Streamlit the figure object itself rather than the pyplot module,
    # then close it so figures do not pile up over repeated runs.
    st.pyplot(fig, use_container_width=False)
    plt.close(fig)
def eda_page():
    """Render the interactive visualization page.

    The user picks a chart type (histogram, scatter plot or box plot) and the
    column(s) to plot from the session dataset; the matching Plotly figure is
    then displayed with ``st.plotly_chart``.
    """
    st.title("Exploratory Data Analysis")
    # Interactive Visualizations
    st.subheader("Visualizations")

    frame = st.session_state.data
    chart_type = st.selectbox("Choose Chart Type", ["Histogram", "Scatter Plot", "Box Plot"])

    # All three charts use the same legend settings.
    legend_style = dict(title="Legend", orientation="h", x=0.5, xanchor="center")

    if chart_type == "Histogram":
        column = st.selectbox("Choose Column for Visualization", frame.columns)
        trace = go.Histogram(x=frame[column], name=column, marker_color="indigo")
        layout = dict(
            title=dict(text="Histogram Analysis", x=0.5, font=dict(size=22)),
            xaxis_title=column,
            yaxis_title="Count",
            legend=legend_style,
            bargap=0.2,
            hovermode="x unified",
            template="plotly_dark",
        )
    elif chart_type == "Scatter Plot":
        x_col = st.selectbox("Choose X-axis Column", frame.columns)
        y_col = st.selectbox("Choose Y-axis Column", frame.columns)
        trace = go.Scatter(
            x=frame[x_col],
            y=frame[y_col],
            mode="markers",
            marker=dict(size=10, color="purple", line=dict(width=1, color="white")),
            name=f"{y_col} vs {x_col}",
        )
        layout = dict(
            title=dict(text="Scatter Plot Analysis", x=0.5, font=dict(size=22)),
            xaxis_title=x_col,
            yaxis_title=y_col,
            legend=legend_style,
            hovermode="closest",
            template="plotly_dark",
        )
    elif chart_type == "Box Plot":
        column = st.selectbox("Choose Column for Visualization", frame.columns)
        trace = go.Box(
            y=frame[column],
            name=column,
            boxmean="sd",
            marker_color="teal",
        )
        layout = dict(
            title=dict(text="Boxplot Analysis", x=0.5, font=dict(size=22)),
            yaxis_title=column,
            legend=legend_style,
            hovermode="y",
            template="plotly_dark",
        )
    else:
        # No matching chart type: nothing to draw.
        return

    # Shared tail: wrap the selected trace in a figure, style it, show it.
    fig = go.Figure()
    fig.add_trace(trace)
    fig.update_layout(**layout)
    st.plotly_chart(fig)
def model_page():
    """Render the model-evaluation page.

    Loads the held-out test sheet, applies the saved encoder and scaler,
    lets the user pick one of the pre-trained models, and shows the
    predictions next to the actual labels, a classification report and a
    confusion-matrix heatmap.
    """
    st.title("Model Evaluation")

    # All artifact paths use forward slashes. In a normal string literal a
    # backslash starts an escape sequence ('\r' in '...\rf_model.pkl' is a
    # carriage return), and backslash separators only work on Windows.
    test_data = pd.read_excel('Model Training/test_data.xlsx')

    # Encode the target variable. Assign the result back: replace(...,
    # inplace=True) on the selected column is a chained update that does not
    # reach `test_data` under pandas Copy-on-Write.
    test_data['Health_status'] = (
        test_data['Health_status']
        .replace({'healthy': 0, 'patient': 1})
        .infer_objects()
    )

    # apply ordinal encoding to the categorical columns
    categorical_columns = ['Gender', 'Family history', 'Obese/non obese']
    encoder = joblib.load('Model Training/encoder.pkl')
    X = test_data.drop(columns=['Health_status'])
    X[categorical_columns] = encoder.transform(X[categorical_columns])
    y = test_data['Health_status']

    # apply standard scaling to the numerical features in X
    numerical_columns = [col_name for col_name in X.columns if col_name not in categorical_columns]
    scaler = joblib.load('Model Training/scaler.pkl')
    X[numerical_columns] = scaler.transform(X[numerical_columns])

    # Model Selection: menu label -> saved model file.
    st.text("Model Selection")
    model_files = {
        "SVM - Linear": 'Model Training/svm_model_linear.pkl',
        "SVM - Polynomial": 'Model Training/svm_model_poly.pkl',
        "SVM - RBF": 'Model Training/svm_model_rbf.pkl',
        "Random Forest": 'Model Training/rf_model.pkl',
        "Random Forest Boosted": 'Model Training/rf_boosted.pkl',
        "Logistic Regression": 'Model Training/lr_model.pkl',
        "GDA": 'Model Training/gda.pkl',
    }
    model_choice = st.selectbox("Choose a Pre-trained Model", list(model_files))

    # Load pre-trained model
    model = joblib.load(model_files[model_choice])

    # Make Predictions
    y_pred = model.predict(X)

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("### Predictions on the Test Data:")
        st.dataframe(pd.DataFrame({"Actual": y, "Predicted": y_pred}))
    with col2:
        st.subheader("Classification Report")
        report = classification_report(y, y_pred, output_dict=True)
        report_df = (
            pd.DataFrame(report)
            .transpose()
            .drop(columns='support')
            .rename_axis('index')
        )
        # The class keys are the stringified labels; accept both the integer
        # and the float spelling.
        report_df = report_df.rename(index={
            '0': 'Negative', '0.0': 'Negative',
            '1': 'Positive', '1.0': 'Positive',
        })
        # Cast to object before blanking cells so a string is not written
        # into float columns. Only the f1-score column keeps the accuracy
        # value on the 'accuracy' row.
        report_df = report_df.astype(object)
        report_df.loc['accuracy', ['precision', 'recall']] = ''
        st.table(report_df)

    st.subheader("Confusion Matrix")
    conf_matrix = confusion_matrix(y, y_pred)
    # Generate text annotations for the confusion matrix
    text_annotations = conf_matrix.astype(str)

    matrix_col, _spacer_col = st.columns(2)
    with matrix_col:
        # Create the heatmap using seaborn on an explicit figure/axes pair.
        fig, ax = plt.subplots(figsize=(3, 3))
        sns.heatmap(conf_matrix, annot=text_annotations, fmt="", cmap="Blues",
                    cbar=False, square=True, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title("Confusion Matrix")
        # Pass the figure object (not the pyplot module) and close it so
        # figures do not pile up over repeated runs.
        st.pyplot(fig)
        plt.close(fig)
def prediction_page():
    """Render the diagnosis form and show a prediction for the entered values.

    The user selects one of the pre-trained models and fills in the form.
    On submit the inputs are put into a one-row DataFrame, encoded and scaled
    with the saved encoder and scaler, and passed to the selected model; the
    result is shown as "Positive" (prediction == 1) or "Negative".
    """
    st.title("Get Your Diagnosis")
    st.subheader("Symptoms Entry Form")

    # Model Selection: menu label -> saved model file.
    # Paths use forward slashes: in a normal string literal a backslash
    # starts an escape sequence ('\r' in '...\rf_model.pkl' is a carriage
    # return), and backslash separators only work on Windows.
    model_files = {
        "SVM - Linear": 'Model Training/svm_model_linear.pkl',
        "SVM - Polynomial": 'Model Training/svm_model_poly.pkl',
        "SVM - RBF": 'Model Training/svm_model_rbf.pkl',
        "Random Forest": 'Model Training/rf_model.pkl',
        "Random Forest Boosted": 'Model Training/rf_boosted.pkl',
        "Logistic Regression": 'Model Training/lr_model.pkl',
        "GDA": 'Model Training/gda.pkl',
    }
    model_choice = st.selectbox("Choose a Pre-trained Model", list(model_files))

    with st.form(key="health_data_form"):
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            # Categorical features with dropdown selection
            gender = st.selectbox("Gender", ["Male", "Female"], key="gender")
            weight = st.number_input("Weight (kg)", min_value=0, step=1, key="weight")
            cholesterol = st.number_input("Cholesterol (mg/dL)", min_value=0, step=1, key="cholesterol")
        with col2:
            family_history = st.selectbox("Family History of Illness", ["Yes", "No"], key="family_history")
            bmi = st.number_input("BMI", min_value=0.0, step=0.1, key="bmi")
            triglycerides = st.number_input("Triglycerides Level (mg/dL)", min_value=0, step=1, key="triglycerides")
        with col3:
            height = st.number_input("Height (cm)", min_value=0.0, step=0.1, key="height")
            obese_status = st.selectbox("Obese/Non Obese", ["Obese", "Non-Obese"], key="obese_status")
            ldl = st.number_input("LDL Level (mg/dL)", min_value=0.0, step=0.1, key="ldl")
        with col4:
            vldl = st.number_input("VLDL Level (mg/dL)", min_value=0.0, step=0.1, key="vldl")
        # Submit button
        submit_button = st.form_submit_button(label="Submit")

    if submit_button:
        # Column order handed to the encoder, scaler and model.
        feature_order = ['Gender', 'Family history', 'Height', 'Weight', 'BMI',
                         'Obese/non obese', 'Cholesterol', 'Triglycerides level',
                         'LDL level', 'VLDL level']

        # Create a DataFrame directly with the user input data. The key for
        # the triglycerides input must be 'Triglycerides level': with a
        # different key, reindexing to feature_order with fill_value=0 would
        # discard the entered value and feed 0 to the model.
        data = pd.DataFrame({
            "Gender": [gender],
            "Family history": [family_history],
            "Height": [height],
            "Weight": [weight],
            "BMI": [bmi],
            "Obese/non obese": [obese_status],
            "Cholesterol": [cholesterol],
            "Triglycerides level": [triglycerides],
            "LDL level": [ldl],
            "VLDL level": [vldl],
        })
        # Plain column selection raises KeyError on a name mismatch instead
        # of silently filling a missing feature with a default value.
        data = data[feature_order]

        categorical_columns = ['Gender', 'Family history', 'Obese/non obese']
        numerical_columns = [col_name for col_name in data.columns if col_name not in categorical_columns]

        # Encoding categorical data
        encoder = joblib.load('Model Training/encoder.pkl')
        data[categorical_columns] = encoder.transform(data[categorical_columns])

        # Scaling the numeric features
        scaler = joblib.load('Model Training/scaler.pkl')
        data[numerical_columns] = scaler.transform(data[numerical_columns])

        # Load the selected pre-trained model only when a prediction is needed.
        model = joblib.load(model_files[model_choice])
        prediction = int(model.predict(data)[0])
        st.write(f"### Predicted Diagnosis: {'Positive' if prediction == 1 else 'Negative'}")
def conclusion_page():
    """Render the "Conclusion" page: a title followed by static Markdown."""
    takeaways_md = """
    ## Key Takeaways
    - Comprehensive EDA provides actionable insights into the data.
    - Pre-trained machine learning models allow efficient predictions.
    - The interactive app makes the analysis accessible and engaging.
    Thank you for exploring this project!
    """
    st.title("Conclusion")
    st.markdown(takeaways_md)
# Sidebar Navigation Menu with radio buttons for page selection
# Each menu label maps to the function that renders that page; the dict's
# insertion order is the order of the radio options.
page_renderers = {
    "Introduction": introduction_page,
    "Descriptive Statistics": stats_page,
    "Data Analytics": eda_page,
    "Model Evaluation": model_page,
    "Get Your Diagnosis": prediction_page,
    "Conclusion": conclusion_page,
}
page = st.sidebar.radio("Navigation Menu", list(page_renderers))

# Render the selected page; an unrecognised selection renders nothing.
render_page = page_renderers.get(page)
if render_page is not None:
    render_page()