Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
import plotly.express as px
|
| 7 |
+
import plotly.graph_objects as go
|
| 8 |
+
from plotly.subplots import make_subplots
|
| 9 |
+
|
| 10 |
+
def load_data(csv_path='insurance.csv'):
    """Load the insurance dataset and add categorical age and BMI columns.

    Args:
        csv_path: Path of the CSV file to read, or any file-like object
            accepted by ``pandas.read_csv``. Defaults to ``'insurance.csv'``
            in the current working directory. The data must contain
            ``age`` and ``bmi`` columns.

    Returns:
        The loaded DataFrame with two extra categorical columns:
        ``age_group`` and ``bmi_category``.
    """
    df = pd.read_csv(csv_path)

    # Age brackets are right-closed: (0, 25], (25, 35], (35, 50], (50, 100].
    df['age_group'] = pd.cut(
        df['age'],
        bins=[0, 25, 35, 50, 100],
        labels=['Young Adult', 'Adult', 'Middle Age', 'Senior'],
    )

    # BMI brackets are left-closed so the conventional cut-offs land in the
    # right class: [0, 18.5) underweight, [18.5, 25) normal, [25, 30)
    # overweight, [30, inf) obese. Right-closed bins ending in 24.9 / 29.9
    # would push 18.5, 24.95 and 29.95 into the wrong category.
    df['bmi_category'] = pd.cut(
        df['bmi'],
        bins=[0, 18.5, 25, 30, float('inf')],
        right=False,
        labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    )

    return df
|
| 21 |
+
|
| 22 |
+
# Columns offered in the sidebar, in display order.
_ANALYSIS_VARIABLES = ('age', 'bmi', 'children', 'charges', 'region', 'smoker', 'sex')

# Page-level CSS injected once at the top of the dashboard.
_CUSTOM_CSS = """
<style>
.main {
background-color: #f5f5f5;
}
.stButton>button {
background-color: #4CAF50;
color: white;
border-radius: 5px;
}
</style>
"""


def _secondary_options(primary_var):
    """Return every analysis variable except the primary one, 'charges' first."""
    others = [var for var in _ANALYSIS_VARIABLES if var != primary_var]
    # Stable sort: only 'charges' moves to the front, the rest keep their order.
    return sorted(others, key=lambda var: var != 'charges')


def _render_figure(fig):
    """Rotate the x tick labels, show the Matplotlib figure, then close it."""
    plt.xticks(rotation=45)
    st.pyplot(fig)
    # Streamlit re-runs the script on every interaction; without an explicit
    # close each rerun would leave another open figure behind.
    plt.close(fig)


def main():
    """Render the insurance analysis dashboard.

    Builds the sidebar controls, three analysis tabs (distribution,
    relationship, categorical insights) and the optional summary and
    cross-analysis tables for the variable chosen in the sidebar.
    """
    st.set_page_config(layout="wide")

    # Custom styling
    st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

    st.title("🏥 Advanced Insurance Data Analysis Dashboard")

    # Load data
    df = load_data()

    # Sidebar for analysis controls
    st.sidebar.header("Analysis Controls")

    primary_var = st.sidebar.selectbox(
        "Select Primary Variable",
        list(_ANALYSIS_VARIABLES),
    )

    # The secondary list never repeats an entry and never contains the
    # primary variable.
    secondary_var = st.sidebar.selectbox(
        "Select Secondary Variable",
        _secondary_options(primary_var),
    )

    # Decide once whether the primary column is numeric; every chart below
    # branches on this.
    primary_is_numeric = pd.api.types.is_numeric_dtype(df[primary_var])

    # Advanced Analysis Section
    st.header("📊 Advanced Data Analysis")

    tab1, tab2, tab3 = st.tabs(
        ["Distribution Analysis", "Relationship Analysis", "Categorical Insights"]
    )

    with tab1:
        col1, col2 = st.columns(2)

        with col1:
            # Histogram for numeric variables, count plot for categorical ones.
            fig = plt.figure(figsize=(10, 6))
            if primary_is_numeric:
                sns.histplot(data=df, x=primary_var, hue='smoker')
            else:
                sns.countplot(data=df, x=primary_var, hue='smoker')
            plt.title(f'Distribution of {primary_var} by Smoking Status')
            _render_figure(fig)

        with col2:
            # Box plot: numeric variable per region, or charges per category.
            fig = plt.figure(figsize=(10, 6))
            if primary_is_numeric:
                sns.boxplot(data=df, y=primary_var, x='region', hue='smoker')
                plt.title(f'{primary_var} Distribution by Region')
            else:
                sns.boxplot(data=df, y='charges', x=primary_var, hue='smoker')
                plt.title(f'Charges Distribution by {primary_var}')
            _render_figure(fig)

    with tab2:
        col1, col2 = st.columns(2)

        with col1:
            # A scatter plot only makes sense when both axes are numeric.
            if primary_is_numeric and pd.api.types.is_numeric_dtype(df[secondary_var]):
                fig = px.scatter(
                    df,
                    x=primary_var,
                    y=secondary_var,
                    color='smoker',
                    title=f'Relationship between {primary_var} and {secondary_var}',
                )
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.write("Cannot create scatter plot for categorical variables")

        with col2:
            # Split violin plot comparing smokers and non-smokers.
            fig = plt.figure(figsize=(10, 6))
            if primary_is_numeric:
                sns.violinplot(data=df, y=primary_var, x='region', hue='smoker', split=True)
                plt.title(f'{primary_var} Distribution by Region and Smoking Status')
            else:
                sns.violinplot(data=df, y='charges', x=primary_var, hue='smoker', split=True)
                plt.title(f'Charges Distribution by {primary_var} and Smoking Status')
            _render_figure(fig)

    with tab3:
        col1, col2 = st.columns(2)

        with col1:
            # Correlation heatmap over the numeric columns only.
            numerical_cols = df.select_dtypes(include='number').columns
            correlation = df[numerical_cols].corr()
            fig = px.imshow(
                correlation,
                title="Correlation Heatmap",
                labels=dict(color="Correlation"),
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            if not primary_is_numeric:
                # Bar plot of charges for each category of the primary variable.
                fig = plt.figure(figsize=(10, 6))
                sns.barplot(data=df, x=primary_var, y='charges', hue='smoker')
                plt.title(f'Average Charges by {primary_var} and Smoking Status')
                _render_figure(fig)
            else:
                # Alternative visualization for a numerical primary variable.
                fig = px.box(
                    df,
                    x='region',
                    y=primary_var,
                    color='smoker',
                    title=f'{primary_var} Distribution by Region and Smoking Status',
                )
                st.plotly_chart(fig, use_container_width=True)

    # Detailed Insights Section
    st.header("🔍 Detailed Insights")

    # Summary statistics
    if st.checkbox("Show Summary Statistics"):
        if primary_is_numeric:
            summary = df.groupby('region')[primary_var].describe()
            st.write(f"Summary Statistics for {primary_var} by Region:")
        else:
            summary = df.groupby(primary_var)['charges'].describe()
            st.write(f"Charges Summary Statistics by {primary_var}:")
        st.dataframe(summary)

    # Cross-analysis
    if st.checkbox("Show Cross Analysis"):
        if not primary_is_numeric:
            cross_analysis = pd.crosstab(df[primary_var], df['region'], margins=True)
            st.write(f"Cross Analysis of {primary_var} by Region:")
            st.dataframe(cross_analysis)
        else:
            grouped_analysis = df.groupby('region')[primary_var].agg(
                ['mean', 'median', 'std', 'count']
            )
            st.write(f"Grouped Analysis of {primary_var} by Region:")
            st.dataframe(grouped_analysis)


# Run the dashboard only when executed as a script (e.g. `streamlit run app.py`).
if __name__ == "__main__":
    main()
|