Shyanil committed on
Commit
fc375a7
·
verified ·
1 Parent(s): 6e3e2b9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +168 -0
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from plotly.subplots import make_subplots
9
+
10
def load_data(path='insurance.csv'):
    """Load the insurance dataset and add derived grouping columns.

    Args:
        path: CSV file to read. Defaults to ``insurance.csv`` in the current
            working directory. The file must provide ``age`` and ``bmi``
            columns.

    Returns:
        A DataFrame holding the CSV's columns plus two categorical columns,
        ``age_group`` and ``bmi_category``.
    """
    df = pd.read_csv(path)

    # Age bands are right-closed: (0, 25], (25, 35], (35, 50], (50, 100].
    df['age_group'] = pd.cut(
        df['age'],
        bins=[0, 25, 35, 50, 100],
        labels=['Young Adult', 'Adult', 'Middle Age', 'Senior'],
    )

    # BMI bands are left-closed ([0, 18.5), [18.5, 25), [25, 30), [30, inf))
    # so a value sitting exactly on a threshold lands in the higher band and
    # fractional values such as 24.95 or 29.95 are not pushed up a category.
    # The open-ended last bin keeps very large values from becoming NaN.
    df['bmi_category'] = pd.cut(
        df['bmi'],
        bins=[0, 18.5, 25, 30, float('inf')],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
        right=False,
    )

    return df
21
+
22
def _secondary_options(primary_var):
    """Return the secondary-variable choices for the given primary variable."""
    others = [
        var for var in ['age', 'bmi', 'children', 'region', 'smoker', 'sex']
        if var != primary_var
    ]
    # 'charges' leads the list unless it is already the primary variable, in
    # which case the remaining variables are offered as-is (no duplicates).
    return others if primary_var == 'charges' else ['charges'] + others


def _show_figure(fig, ax, title):
    """Title the axes, tilt the x tick labels, render the figure, close it."""
    ax.set_title(title)
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed;
    # without this, each call to main() would leave more figures in memory.
    plt.close(fig)


def _render_distribution_tab(df, primary_var, is_numeric):
    """Draw the histogram/count plot and the box plot for the primary variable."""
    left, right = st.columns(2)

    with left:
        # Histogram for numeric data, count plot for categories.
        fig, ax = plt.subplots(figsize=(10, 6))
        if is_numeric:
            sns.histplot(data=df, x=primary_var, hue='smoker', ax=ax)
        else:
            sns.countplot(data=df, x=primary_var, hue='smoker', ax=ax)
        _show_figure(fig, ax, f'Distribution of {primary_var} by Smoking Status')

    with right:
        # Numeric variables are boxed per region; categorical ones are used
        # as the grouping axis for charges instead.
        fig, ax = plt.subplots(figsize=(10, 6))
        if is_numeric:
            sns.boxplot(data=df, y=primary_var, x='region', hue='smoker', ax=ax)
            title = f'{primary_var} Distribution by Region'
        else:
            sns.boxplot(data=df, y='charges', x=primary_var, hue='smoker', ax=ax)
            title = f'Charges Distribution by {primary_var}'
        _show_figure(fig, ax, title)


def _render_relationship_tab(df, primary_var, secondary_var, is_numeric):
    """Draw the primary/secondary scatter plot and the split violin plot."""
    left, right = st.columns(2)

    with left:
        # A scatter plot only makes sense when both axes are numeric.
        if is_numeric and pd.api.types.is_numeric_dtype(df[secondary_var]):
            fig = px.scatter(
                df, x=primary_var, y=secondary_var,
                color='smoker',
                title=f'Relationship between {primary_var} and {secondary_var}',
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.write("Cannot create scatter plot for categorical variables")

    with right:
        fig, ax = plt.subplots(figsize=(10, 6))
        if is_numeric:
            sns.violinplot(data=df, y=primary_var, x='region', hue='smoker',
                           split=True, ax=ax)
            title = f'{primary_var} Distribution by Region and Smoking Status'
        else:
            sns.violinplot(data=df, y='charges', x=primary_var, hue='smoker',
                           split=True, ax=ax)
            title = f'Charges Distribution by {primary_var} and Smoking Status'
        _show_figure(fig, ax, title)


def _render_categorical_tab(df, primary_var, is_numeric):
    """Draw the correlation heatmap and the per-category/per-region chart."""
    left, right = st.columns(2)

    with left:
        # Correlations are computed over the numeric columns only.
        correlation = df.select_dtypes(include='number').corr()
        fig = px.imshow(correlation,
                        title="Correlation Heatmap",
                        labels=dict(color="Correlation"))
        st.plotly_chart(fig, use_container_width=True)

    with right:
        if is_numeric:
            # Numeric primary variable: interactive box plot per region.
            fig = px.box(
                df, x='region', y=primary_var, color='smoker',
                title=f'{primary_var} Distribution by Region and Smoking Status',
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            # Categorical primary variable: bar plot of charges per category.
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(data=df, x=primary_var, y='charges', hue='smoker', ax=ax)
            _show_figure(
                fig, ax,
                f'Average Charges by {primary_var} and Smoking Status',
            )


def _render_insights(df, primary_var, is_numeric):
    """Show the optional summary-statistics and cross-analysis tables."""
    if st.checkbox("Show Summary Statistics"):
        if is_numeric:
            summary = df.groupby('region')[primary_var].describe()
            st.write(f"Summary Statistics for {primary_var} by Region:")
        else:
            summary = df.groupby(primary_var)['charges'].describe()
            st.write(f"Charges Summary Statistics by {primary_var}:")
        st.dataframe(summary)

    if st.checkbox("Show Cross Analysis"):
        if is_numeric:
            grouped_analysis = df.groupby('region')[primary_var].agg(
                ['mean', 'median', 'std', 'count'])
            st.write(f"Grouped Analysis of {primary_var} by Region:")
            st.dataframe(grouped_analysis)
        else:
            cross_analysis = pd.crosstab(df[primary_var], df['region'],
                                         margins=True)
            st.write(f"Cross Analysis of {primary_var} by Region:")
            st.dataframe(cross_analysis)


def main():
    """Build the insurance analysis dashboard.

    Configures the page, loads the dataset via ``load_data()``, reads the
    primary and secondary variable choices from the sidebar, and renders
    three analysis tabs followed by the optional detail tables.
    """
    st.set_page_config(layout="wide")

    # Custom styling
    st.markdown("""
        <style>
        .main {
            background-color: #f5f5f5;
        }
        .stButton>button {
            background-color: #4CAF50;
            color: white;
            border-radius: 5px;
        }
        </style>
        """, unsafe_allow_html=True)

    st.title("🏥 Advanced Insurance Data Analysis Dashboard")

    df = load_data()

    # Sidebar for analysis controls
    st.sidebar.header("Analysis Controls")

    primary_var = st.sidebar.selectbox(
        "Select Primary Variable",
        ['age', 'bmi', 'children', 'charges', 'region', 'smoker', 'sex'],
    )
    secondary_var = st.sidebar.selectbox(
        "Select Secondary Variable",
        _secondary_options(primary_var),
    )

    # Decide once whether the primary variable is numeric; every section
    # below picks its chart or table from this flag.
    is_numeric = pd.api.types.is_numeric_dtype(df[primary_var])

    # Advanced Analysis Section
    st.header("📊 Advanced Data Analysis")

    tab1, tab2, tab3 = st.tabs(
        ["Distribution Analysis", "Relationship Analysis", "Categorical Insights"])

    with tab1:
        _render_distribution_tab(df, primary_var, is_numeric)

    with tab2:
        _render_relationship_tab(df, primary_var, secondary_var, is_numeric)

    with tab3:
        _render_categorical_tab(df, primary_var, is_numeric)

    # Detailed Insights Section
    st.header("🔍 Detailed Insights")
    _render_insights(df, primary_var, is_numeric)
167
# Launch the dashboard only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()