Spaces:
No application file
No application file
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots
from scipy import stats
| def eda_analysis(df): | |
| """ | |
| Comprehensive Exploratory Data Analysis (EDA) with visual insights | |
| """ | |
| st.markdown(""" | |
| <div style='text-align: center; margin-bottom: 2rem;'> | |
| <h2>π Exploratory Data Analysis (EDA)</h2> | |
| <p style='color: gray;'>Discover patterns, relationships, and insights through visual exploration</p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # Error handling | |
| if df.empty: | |
| st.error("β The dataset is empty. Please upload a valid dataset.") | |
| return | |
| try: | |
| # Create tabs for different EDA aspects | |
| tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([ | |
| "π Data Overview", | |
| "π Missing Data Analysis", | |
| "π Univariate Analysis", | |
| "π Bivariate Analysis", | |
| "π Multivariate Analysis", | |
| "π― Pattern Discovery" | |
| ]) | |
| with tab1: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("π Dataset Overview") | |
| try: | |
| # Key metrics in cards | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric("Total Rows", f"{df.shape[0]:,}") | |
| with col2: | |
| st.metric("Total Columns", df.shape[1]) | |
| with col3: | |
| memory_usage = df.memory_usage(deep=True).sum() / 1024**2 | |
| st.metric("Memory Usage", f"{memory_usage:.2f} MB") | |
| with col4: | |
| missing_total = df.isnull().sum().sum() | |
| st.metric("Missing Values", f"{missing_total:,}") | |
| # Data preview with interactive controls | |
| st.subheader("π Data Preview") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| preview_rows = st.slider("Number of rows to display", 5, 50, 10, key="preview_rows") | |
| with col2: | |
| preview_type = st.radio("Preview type", ["Head", "Tail", "Random Sample"], | |
| horizontal=True, key="preview_type") | |
| if preview_type == "Head": | |
| st.dataframe(df.head(preview_rows), use_container_width=True) | |
| elif preview_type == "Tail": | |
| st.dataframe(df.tail(preview_rows), use_container_width=True) | |
| else: | |
| if len(df) > preview_rows: | |
| st.dataframe(df.sample(preview_rows), use_container_width=True) | |
| else: | |
| st.warning("β οΈ Sample size larger than dataset. Showing all rows.") | |
| st.dataframe(df, use_container_width=True) | |
| # Column information with visual indicators | |
| st.subheader("π Column Information") | |
| col_info = pd.DataFrame({ | |
| 'Column': df.columns, | |
| 'Data Type': df.dtypes.astype(str), | |
| 'Non-Null Count': df.count().values, | |
| 'Null Count': df.isnull().sum().values, | |
| 'Null %': (df.isnull().sum().values / len(df) * 100).round(2), | |
| 'Unique Values': [df[col].nunique() for col in df.columns], | |
| 'Sample Values': [str(df[col].dropna().iloc[:3].tolist()) if len(df[col].dropna()) > 0 else "All null" for col in df.columns] | |
| }) | |
| # Add color coding for data types | |
def color_data_type(val):
    """Return the CSS background style for one cell of the 'Data Type' column.

    The cell holds a dtype name rendered as text (the column is built from
    ``df.dtypes.astype(str)``), and the result is consumed by the pandas
    Styler call that follows this helper.

    Args:
        val: dtype name, e.g. ``'int64'``, ``'Float64'``, ``'object'``,
            ``'datetime64[ns]'``. Non-string values are stringified first.

    Returns:
        A ``'background-color: ...'`` CSS declaration for numeric,
        categorical and datetime dtypes, or ``''`` (no styling) otherwise.
    """
    # Compare case-insensitively so pandas nullable dtypes, whose names are
    # capitalised ('Int64', 'UInt8', 'Float64'), are coloured like their
    # NumPy counterparts.
    dtype_name = str(val).lower()

    # 'interval[int64, right]' contains 'int' (and may contain 'datetime')
    # but is neither a numeric nor a datetime column; leave it unstyled.
    if dtype_name.startswith('interval'):
        return ''

    if 'int' in dtype_name or 'float' in dtype_name:
        return 'background-color: #e3f2fd'
    # 'category' is grouped with 'object', matching how this module selects
    # categorical columns elsewhere.
    if 'object' in dtype_name or 'category' in dtype_name:
        return 'background-color: #f1f8e9'
    if 'datetime' in dtype_name:
        return 'background-color: #fff3e0'
    return ''
| st.dataframe(col_info.style.applymap(color_data_type, subset=['Data Type']), | |
| use_container_width=True) | |
| # Data type distribution | |
| st.subheader("π Data Type Distribution") | |
| dtype_counts = df.dtypes.value_counts() | |
| if len(dtype_counts) > 0: | |
| fig = make_subplots(rows=1, cols=2, | |
| specs=[[{"type": "pie"}, {"type": "bar"}]], | |
| subplot_titles=("Pie Chart", "Bar Chart")) | |
| fig.add_trace(go.Pie(labels=dtype_counts.index.astype(str), | |
| values=dtype_counts.values, | |
| hole=0.3), row=1, col=1) | |
| fig.add_trace(go.Bar(x=dtype_counts.index.astype(str), | |
| y=dtype_counts.values, | |
| marker_color=['#42a5f5', '#66bb6a', '#ffa726'][:len(dtype_counts)]), | |
| row=1, col=2) | |
| fig.update_layout(height=400, title_text="Column Types Distribution") | |
| st.plotly_chart(fig, use_container_width=True) | |
| else: | |
| st.warning("β οΈ No data type information available") | |
| # Dataset statistics | |
| st.subheader("π Dataset Statistics") | |
| numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() | |
| categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() | |
| datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist() | |
| bool_cols = df.select_dtypes(include=['bool']).columns.tolist() | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.info(f"**Numeric:** {len(numeric_cols)} columns") | |
| with col2: | |
| st.info(f"**Categorical:** {len(categorical_cols)} columns") | |
| with col3: | |
| st.info(f"**Datetime:** {len(datetime_cols)} columns") | |
| with col4: | |
| st.info(f"**Boolean:** {len(bool_cols)} columns") | |
| except Exception as e: | |
| st.error(f"β Error in data overview: {str(e)}") | |
| st.info("π‘ Tip: Check if your dataset contains valid data types") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| with tab2: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("π Missing Data Analysis") | |
| try: | |
| if df.isnull().sum().sum() > 0: | |
| # Missing data overview | |
| missing_df = pd.DataFrame({ | |
| 'Column': df.columns, | |
| 'Missing Count': df.isnull().sum().values, | |
| 'Missing %': (df.isnull().sum().values / len(df) * 100).round(2) | |
| }).sort_values('Missing %', ascending=False) | |
| missing_df = missing_df[missing_df['Missing Count'] > 0] | |
| if len(missing_df) > 0: | |
| # Visualize missing data | |
| fig = make_subplots(rows=2, cols=2, | |
| subplot_titles=("Missing Values Heatmap", | |
| "Missing Values by Column", | |
| "Missing Data Patterns", | |
| "Missing Data Matrix"), | |
| specs=[[{"type": "heatmap"}, {"type": "bar"}], | |
| [{"type": "scatter"}, {"type": "heatmap"}]]) | |
| # Heatmap of missing values | |
| missing_matrix = df.isnull().astype(int).T | |
| fig.add_trace(go.Heatmap(z=missing_matrix.values, | |
| y=missing_matrix.index, | |
| colorscale='Reds', | |
| showscale=False), row=1, col=1) | |
| # Bar chart of missing values | |
| fig.add_trace(go.Bar(x=missing_df['Column'].head(20), | |
| y=missing_df['Missing Count'].head(20), | |
| marker_color='#ef5350', | |
| name="Missing Count"), row=1, col=2) | |
| # Missing data patterns (rows with missing data) | |
| missing_rows = df[df.isnull().any(axis=1)] | |
| if len(missing_rows) > 0: | |
| pattern_df = missing_rows.isnull().sum(axis=1).value_counts().reset_index() | |
| pattern_df.columns = ['Missing Count per Row', 'Number of Rows'] | |
| pattern_df = pattern_df.sort_values('Missing Count per Row') | |
| fig.add_trace(go.Scatter(x=pattern_df['Missing Count per Row'], | |
| y=pattern_df['Number of Rows'], | |
| mode='lines+markers', | |
| name="Patterns"), row=2, col=1) | |
| # Missing data matrix for first 50 rows | |
| sample_missing = df.head(min(50, len(df))).isnull().astype(int).T | |
| fig.add_trace(go.Heatmap(z=sample_missing.values, | |
| y=sample_missing.index, | |
| colorscale='Reds', | |
| showscale=False, | |
| name="Matrix"), row=2, col=2) | |
| fig.update_layout(height=800, title_text="Missing Data Analysis", | |
| showlegend=False) | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Detailed missing data table | |
| st.subheader("π Missing Data Details") | |
| # Add severity classification | |
def classify_severity(pct):
    """Map a column's missing-value percentage to a severity label.

    Args:
        pct: percentage of missing values in a column, on a 0-100 scale.

    Returns:
        One of ``"✅ None"`` (exactly 0), ``"🟢 Low"`` (below 5),
        ``"🟡 Medium"`` (from 5 up to, but not including, 20) or
        ``"🔴 High"`` (20 and above).
    """
    # NOTE(review): the labels in the reviewed copy were mis-decoded UTF-8
    # (e.g. "π’ Low"); the emoji below are what those byte sequences decode
    # to. "✅" is the most likely reading of the first one -- confirm against
    # the original file.
    if pct == 0:
        return "✅ None"
    if pct < 5:
        return "🟢 Low"
    if pct < 20:
        return "🟡 Medium"
    return "🔴 High"
| missing_df['Severity'] = missing_df['Missing %'].apply(classify_severity) | |
| missing_df['Recommendation'] = missing_df['Missing %'].apply( | |
| lambda x: "No action needed" if x == 0 else | |
| "Consider imputation" if x < 5 else | |
| "Imputation recommended" if x < 20 else | |
| "Consider dropping column" | |
| ) | |
| st.dataframe(missing_df, use_container_width=True) | |
| # Missing data patterns | |
| if len(missing_df) > 1: | |
| st.subheader("π Missing Data Patterns") | |
| # Find columns with similar missing patterns | |
| missing_corr = df[missing_df['Column'].tolist()].isnull().corr() | |
| if len(missing_corr) > 1: | |
| fig = px.imshow(missing_corr, | |
| text_auto=True, | |
| aspect="auto", | |
| color_continuous_scale='RdBu_r', | |
| title="Missing Value Correlation Matrix") | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Find highly correlated missing patterns | |
| high_corr = [] | |
| for i in range(len(missing_corr.columns)): | |
| for j in range(i+1, len(missing_corr.columns)): | |
| if abs(missing_corr.iloc[i, j]) > 0.7: | |
| high_corr.append({ | |
| 'Column 1': missing_corr.columns[i], | |
| 'Column 2': missing_corr.columns[j], | |
| 'Correlation': missing_corr.iloc[i, j] | |
| }) | |
| if high_corr: | |
| st.info("π **Columns with similar missing patterns:**") | |
| for item in high_corr[:5]: # Show top 5 | |
| st.write(f"β’ {item['Column 1']} & {item['Column 2']}: {item['Correlation']:.2f}") | |
| else: | |
| st.success("β No missing values found in the dataset!") | |
| else: | |
| st.success("β No missing values found in the dataset!") | |
| # Show complete data visualization | |
| fig = go.Figure() | |
| fig.add_trace(go.Indicator( | |
| mode="number+gauge", | |
| value=100, | |
| title={'text': "Data Completeness"}, | |
| gauge={'axis': {'range': [0, 100]}, | |
| 'bar': {'color': "green"}, | |
| 'steps': [{'range': [0, 100], 'color': "lightgreen"}]} | |
| )) | |
| st.plotly_chart(fig, use_container_width=True) | |
| except Exception as e: | |
| st.error(f"β Error in missing data analysis: {str(e)}") | |
| st.info("π‘ Tip: Ensure your dataset has valid data for missing value analysis") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| with tab3: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("π Univariate Analysis") | |
| try: | |
| col_type = st.radio("Select column type", ["Numeric", "Categorical", "Datetime"], | |
| horizontal=True, key="univariate_type") | |
| if col_type == "Numeric": | |
| numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() | |
| if numeric_cols: | |
| selected_col = st.selectbox("Select numeric column", numeric_cols, key="univariate_num") | |
| data = df[selected_col].dropna() | |
| if len(data) > 0: | |
| # Create comprehensive visualization | |
| fig = make_subplots(rows=2, cols=3, | |
| subplot_titles=("Histogram", "Box Plot", "Violin Plot", | |
| "ECDF", "QQ Plot", "Summary Stats"), | |
| specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}], | |
| [{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]]) | |
| # Histogram | |
| fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Histogram", | |
| marker_color='#42a5f5'), row=1, col=1) | |
| # Box plot | |
| fig.add_trace(go.Box(y=data, name="Box Plot", boxpoints='outliers', | |
| marker_color='#66bb6a'), row=1, col=2) | |
| # Violin plot | |
| fig.add_trace(go.Violin(y=data, name="Violin Plot", box_visible=True, | |
| line_color='black', fillcolor='#ffa726', | |
| opacity=0.6), row=1, col=3) | |
| # ECDF | |
| sorted_data = np.sort(data) | |
| ecdf = np.arange(1, len(sorted_data)+1) / len(sorted_data) | |
| fig.add_trace(go.Scatter(x=sorted_data, y=ecdf, mode='lines', | |
| name="ECDF", line=dict(color='#ab47bc')), | |
| row=2, col=1) | |
| # QQ plot | |
| theoretical_q = np.random.normal(data.mean(), data.std(), len(data)) | |
| theoretical_q.sort() | |
| fig.add_trace(go.Scatter(x=theoretical_q, y=sorted_data, | |
| mode='markers', name="QQ Plot", | |
| marker=dict(color='#7e57c2', size=3)), | |
| row=2, col=2) | |
| # Add reference line to QQ plot | |
| min_val = min(theoretical_q.min(), sorted_data.min()) | |
| max_val = max(theoretical_q.max(), sorted_data.max()) | |
| fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val], | |
| mode='lines', line=dict(color='red', dash='dash'), | |
| showlegend=False), row=2, col=2) | |
| # Summary statistics as table | |
| stats_text = f""" | |
| <b>Summary Statistics</b><br> | |
| Count: {len(data):,}<br> | |
| Mean: {data.mean():.4f}<br> | |
| Std: {data.std():.4f}<br> | |
| Min: {data.min():.4f}<br> | |
| Q1: {data.quantile(0.25):.4f}<br> | |
| Median: {data.median():.4f}<br> | |
| Q3: {data.quantile(0.75):.4f}<br> | |
| Max: {data.max():.4f}<br> | |
| IQR: {data.quantile(0.75) - data.quantile(0.25):.4f}<br> | |
| Skewness: {data.skew():.4f}<br> | |
| Kurtosis: {data.kurtosis():.4f} | |
| """ | |
| fig.add_annotation(x=0.5, y=0.5, text=stats_text, | |
| showarrow=False, font=dict(size=10), | |
| row=2, col=3, align='left') | |
| fig.update_layout(height=800, title_text=f"Univariate Analysis: {selected_col}") | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Outlier detection | |
| Q1 = data.quantile(0.25) | |
| Q3 = data.quantile(0.75) | |
| IQR = Q3 - Q1 | |
| outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)] | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.metric("Outliers Count", len(outliers)) | |
| with col2: | |
| st.metric("Outliers %", f"{len(outliers)/len(data)*100:.2f}%") | |
| if len(outliers) > 0: | |
| with st.expander("View outlier values"): | |
| st.write(outliers.tolist()[:20]) # Show first 20 outliers | |
| if len(outliers) > 20: | |
| st.info(f"... and {len(outliers) - 20} more outliers") | |
| else: | |
| st.warning("β οΈ No numeric columns available for analysis") | |
| elif col_type == "Categorical": | |
| categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() | |
| if categorical_cols: | |
| selected_col = st.selectbox("Select categorical column", categorical_cols, | |
| key="univariate_cat") | |
| # Get value counts | |
| value_counts = df[selected_col].value_counts().reset_index() | |
| value_counts.columns = [selected_col, 'count'] | |
| value_counts['percentage'] = (value_counts['count'] / len(df) * 100).round(2) | |
| if len(value_counts) > 0: | |
| # Create visualizations | |
| fig = make_subplots(rows=2, cols=2, | |
| subplot_titles=("Bar Chart (Top 20)", "Pie Chart (Top 10)", | |
| "Treemap (Top 10)", "Frequency Table"), | |
| specs=[[{"type": "xy"}, {"type": "domain"}], | |
| [{"type": "domain"}, {"type": "table"}]]) | |
| # Bar chart (top 20) | |
| top20 = value_counts.head(20) | |
| fig.add_trace(go.Bar(x=top20[selected_col], | |
| y=top20['count'], | |
| marker_color='#42a5f5', | |
| name="Count"), row=1, col=1) | |
| # Pie chart (top 10) | |
| top10 = value_counts.head(10) | |
| fig.add_trace(go.Pie(labels=top10[selected_col], | |
| values=top10['count'], | |
| hole=0.3, | |
| textinfo='percent+label', | |
| name="Proportion"), row=1, col=2) | |
| # Treemap (top 10) | |
| fig.add_trace(go.Treemap(labels=top10[selected_col], | |
| parents=['']*len(top10), | |
| values=top10['count'], | |
| textinfo='label+value', | |
| name="Treemap"), row=2, col=1) | |
| # Frequency table (top 10) | |
| fig.add_trace(go.Table(header=dict(values=[selected_col, 'Count', 'Percentage']), | |
| cells=dict(values=[top10[selected_col].tolist(), | |
| top10['count'].tolist(), | |
| top10['percentage'].tolist()]), | |
| name="Table"), row=2, col=2) | |
| fig.update_layout(height=800, title_text=f"Categorical Analysis: {selected_col}") | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Summary statistics for categorical | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Unique Values", f"{value_counts.shape[0]:,}") | |
| with col2: | |
| st.metric("Most Frequent", f"{value_counts.iloc[0, 0]}") | |
| with col3: | |
| st.metric("Frequency", f"{value_counts.iloc[0, 1]:,} ({value_counts.iloc[0, 2]}%)") | |
| # Cardinality warning | |
| if value_counts.shape[0] > 50: | |
| st.warning(f"β οΈ High cardinality detected: {value_counts.shape[0]} unique values. Consider grouping rare categories.") | |
| else: | |
| st.warning("β οΈ No categorical columns available for analysis") | |
| elif col_type == "Datetime": | |
| datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist() | |
| if datetime_cols: | |
| selected_col = st.selectbox("Select datetime column", datetime_cols, | |
| key="univariate_datetime") | |
| # Extract temporal features | |
| df_temp = df[selected_col].dropna() | |
| if len(df_temp) > 0: | |
| # Create temporal distributions | |
| fig = make_subplots(rows=2, cols=2, | |
| subplot_titles=("Year Distribution", "Month Distribution", | |
| "Day of Week Distribution", "Hour Distribution"), | |
| specs=[[{"type": "xy"}, {"type": "xy"}], | |
| [{"type": "xy"}, {"type": "xy"}]]) | |
| # Year distribution | |
| years = df_temp.dt.year.value_counts().sort_index() | |
| if len(years) > 0: | |
| fig.add_trace(go.Bar(x=years.index.astype(str), y=years.values, | |
| marker_color='#42a5f5', name="Year"), row=1, col=1) | |
| # Month distribution | |
| months = df_temp.dt.month.value_counts().sort_index() | |
| month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', | |
| 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] | |
| if len(months) > 0: | |
| fig.add_trace(go.Bar(x=[month_names[i-1] for i in months.index], | |
| y=months.values, marker_color='#66bb6a', | |
| name="Month"), row=1, col=2) | |
| # Day of week distribution | |
| days = df_temp.dt.dayofweek.value_counts().sort_index() | |
| day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] | |
| if len(days) > 0: | |
| fig.add_trace(go.Bar(x=[day_names[i] for i in days.index], | |
| y=days.values, marker_color='#ffa726', | |
| name="Day of Week"), row=2, col=1) | |
| # Hour distribution (if time component exists) | |
| if df_temp.dt.hour.nunique() > 1: | |
| hours = df_temp.dt.hour.value_counts().sort_index() | |
| fig.add_trace(go.Bar(x=hours.index.astype(str), y=hours.values, | |
| marker_color='#ab47bc', name="Hour"), row=2, col=2) | |
| fig.update_layout(height=800, title_text=f"Temporal Analysis: {selected_col}") | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Date range information | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Start Date", df_temp.min().strftime('%Y-%m-%d')) | |
| with col2: | |
| st.metric("End Date", df_temp.max().strftime('%Y-%m-%d')) | |
| with col3: | |
| date_range = (df_temp.max() - df_temp.min()).days | |
| st.metric("Date Range", f"{date_range} days") | |
| else: | |
| st.warning("β οΈ No datetime columns available for analysis") | |
| except Exception as e: | |
| st.error(f"β Error in univariate analysis: {str(e)}") | |
| st.info("π‘ Tip: Ensure the selected column contains valid data for analysis") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| with tab4: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("π Bivariate Analysis") | |
| try: | |
| analysis_type = st.radio("Select analysis type", | |
| ["Numeric vs Numeric", "Numeric vs Categorical", | |
| "Categorical vs Categorical"], | |
| horizontal=True, key="bivariate_type") | |
| if analysis_type == "Numeric vs Numeric": | |
| numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() | |
| if len(numeric_cols) >= 2: | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| x_col = st.selectbox("Select X axis", numeric_cols, key="bi_x") | |
| with col2: | |
| y_col = st.selectbox("Select Y axis", [c for c in numeric_cols if c != x_col], | |
| key="bi_y") | |
| # Clean data for analysis | |
| plot_df = df[[x_col, y_col]].dropna() | |
| if len(plot_df) > 0: | |
| # Create comprehensive visualization | |
| fig = make_subplots(rows=2, cols=3, | |
| subplot_titles=("Scatter Plot", "Hexbin Plot", "Density Contour", | |
| "Marginal Distributions", "Residuals", "Statistics"), | |
| specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}], | |
| [{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]]) | |
| # Scatter plot with trendline | |
| fig.add_trace(go.Scatter(x=plot_df[x_col], y=plot_df[y_col], | |
| mode='markers', name="Scatter", | |
| marker=dict(size=5, opacity=0.6, color='#42a5f5')), | |
| row=1, col=1) | |
| # Add trendline | |
| try: | |
| z = np.polyfit(plot_df[x_col], plot_df[y_col], 1) | |
| p = np.poly1d(z) | |
| x_range = np.linspace(plot_df[x_col].min(), plot_df[x_col].max(), 100) | |
| fig.add_trace(go.Scatter(x=x_range, y=p(x_range), | |
| mode='lines', name="Trend", | |
| line=dict(color='red', width=2)), row=1, col=1) | |
| except: | |
| pass | |
| # Hexbin plot | |
| fig.add_trace(go.Histogram2d(x=plot_df[x_col], y=plot_df[y_col], | |
| colorscale='Viridis', | |
| name="Hexbin"), row=1, col=2) | |
| # Density contour | |
| fig.add_trace(go.Histogram2dContour(x=plot_df[x_col], y=plot_df[y_col], | |
| colorscale='Viridis', | |
| name="Contour"), row=1, col=3) | |
| # Marginal distributions | |
| fig.add_trace(go.Histogram(x=plot_df[x_col], name=f"{x_col}", | |
| marker_color='#66bb6a'), row=2, col=1) | |
| fig.add_trace(go.Histogram(y=plot_df[y_col], name=f"{y_col}", | |
| marker_color='#ffa726', orientation='h'), | |
| row=2, col=1) | |
| # Residuals | |
| try: | |
| residuals = plot_df[y_col] - p(plot_df[x_col]) | |
| fig.add_trace(go.Scatter(x=plot_df[x_col], y=residuals, | |
| mode='markers', name="Residuals", | |
| marker=dict(size=3, opacity=0.5, color='#ab47bc')), | |
| row=2, col=2) | |
| fig.add_hline(y=0, line_dash="dash", line_color="red", row=2, col=2) | |
| except: | |
| pass | |
| # Statistics | |
| corr = plot_df[x_col].corr(plot_df[y_col]) | |
| stats_text = f""" | |
| <b>Statistics</b><br> | |
| Correlation: {corr:.4f}<br> | |
| RΒ²: {corr**2:.4f}<br> | |
| Covariance: {plot_df[x_col].cov(plot_df[y_col]):.4f}<br> | |
| Sample Size: {len(plot_df)}<br> | |
| """ | |
| fig.add_annotation(x=0.5, y=0.5, text=stats_text, | |
| showarrow=False, font=dict(size=10), | |
| row=2, col=3, align='left') | |
| fig.update_layout(height=800, title_text=f"Bivariate Analysis: {x_col} vs {y_col}") | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Correlation interpretation | |
| if abs(corr) > 0.7: | |
| st.success(f"β Strong {'positive' if corr > 0 else 'negative'} correlation detected") | |
| elif abs(corr) > 0.3: | |
| st.info(f"βΉοΈ Moderate {'positive' if corr > 0 else 'negative'} correlation detected") | |
| else: | |
| st.warning(f"β οΈ Weak or no correlation detected") | |
| else: | |
| st.warning("β οΈ Need at least 2 numeric columns for this analysis") | |
| elif analysis_type == "Numeric vs Categorical": | |
| numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() | |
| categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() | |
| if numeric_cols and categorical_cols: | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| num_col = st.selectbox("Select numeric column", numeric_cols, key="bi_num") | |
| with col2: | |
| cat_col = st.selectbox("Select categorical column", categorical_cols, key="bi_cat") | |
| # Clean data | |
| plot_df = df[[num_col, cat_col]].dropna() | |
| if len(plot_df) > 0 and plot_df[cat_col].nunique() <= 30: | |
| # Create visualizations | |
| fig = make_subplots(rows=2, cols=2, | |
| subplot_titles=("Box Plot", "Violin Plot", | |
| "Strip Plot", "Bar Chart (Means Β± SD)"), | |
| specs=[[{"type": "xy"}, {"type": "xy"}], | |
| [{"type": "xy"}, {"type": "xy"}]]) | |
| # Box plot | |
| fig.add_trace(go.Box(x=plot_df[cat_col], y=plot_df[num_col], | |
| name="Box Plot", marker_color='#42a5f5'), row=1, col=1) | |
| # Violin plot | |
| fig.add_trace(go.Violin(x=plot_df[cat_col], y=plot_df[num_col], | |
| box_visible=True, line_color='black', | |
| fillcolor='#66bb6a', opacity=0.6, | |
| name="Violin Plot"), row=1, col=2) | |
| # Strip plot | |
| fig.add_trace(go.Scatter(x=plot_df[cat_col], y=plot_df[num_col], | |
| mode='markers', name="Strip Plot", | |
| marker=dict(size=3, opacity=0.3, color='#ffa726')), | |
| row=2, col=1) | |
| # Bar chart with error bars | |
| stats_by_cat = plot_df.groupby(cat_col)[num_col].agg(['mean', 'std', 'count']).reset_index() | |
| stats_by_cat = stats_by_cat.sort_values('mean', ascending=False).head(15) | |
| fig.add_trace(go.Bar(x=stats_by_cat[cat_col], y=stats_by_cat['mean'], | |
| error_y=dict(type='data', array=stats_by_cat['std']), | |
| name="Mean Β± SD", marker_color='#ab47bc'), | |
| row=2, col=2) | |
| fig.update_layout(height=800, title_text=f"{num_col} by {cat_col}") | |
| st.plotly_chart(fig, use_container_width=True) | |
| # ANOVA test for groups with >2 categories | |
| if plot_df[cat_col].nunique() >= 2: | |
| groups = [group[num_col].values for name, group in plot_df.groupby(cat_col)] | |
| if all(len(g) > 0 for g in groups): | |
| f_stat, p_val = stats.f_oneway(*groups) | |
| st.write(f"**One-way ANOVA Results:** F-statistic = {f_stat:.4f}, p-value = {p_val:.4f}") | |
| if p_val < 0.05: | |
| st.success("β Significant differences exist between groups") | |
| else: | |
| st.info("βΉοΈ No significant differences found between groups") | |
| elif plot_df[cat_col].nunique() > 30: | |
| st.warning(f"β οΈ Categorical column has {plot_df[cat_col].nunique()} unique values. Consider grouping or selecting another column.") | |
| else: | |
| st.warning("β οΈ Need both numeric and categorical columns for this analysis") | |
| elif analysis_type == "Categorical vs Categorical": | |
| categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() | |
| if len(categorical_cols) >= 2: | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| cat1 = st.selectbox("Select first categorical column", categorical_cols, key="bi_cat1") | |
| with col2: | |
| cat2 = st.selectbox("Select second categorical column", | |
| [c for c in categorical_cols if c != cat1], key="bi_cat2") | |
| # Create contingency table | |
| contingency = pd.crosstab(df[cat1], df[cat2]) | |
| if contingency.size > 0: | |
| fig = make_subplots(rows=1, cols=2, | |
| subplot_titles=("Stacked Bar Chart", "Heatmap"), | |
| specs=[[{"type": "xy"}, {"type": "heatmap"}]]) | |
| # Stacked bar chart | |
| for col in contingency.columns[:10]: # Limit to 10 categories | |
| fig.add_trace(go.Bar(x=contingency.index[:10], y=contingency[col][:10], | |
| name=str(col)), row=1, col=1) | |
| # Heatmap | |
| fig.add_trace(go.Heatmap(z=contingency.values[:10, :10], | |
| x=contingency.columns[:10].astype(str), | |
| y=contingency.index[:10].astype(str), | |
| colorscale='Viridis', | |
| text=contingency.values[:10, :10], | |
| texttemplate="%{text}"), row=1, col=2) | |
| fig.update_layout(height=600, title_text=f"Relationship: {cat1} vs {cat2}", | |
| barmode='stack') | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Chi-square test | |
| from scipy.stats import chi2_contingency | |
| chi2, p_val, dof, expected = chi2_contingency(contingency) | |
| st.write(f"**Chi-square Test Results:**") | |
| st.write(f"ΟΒ² = {chi2:.4f}, df = {dof}, p-value = {p_val:.4f}") | |
| if p_val < 0.05: | |
| st.success("β Significant association found between variables") | |
| # Cramer's V for effect size | |
| n = contingency.sum().sum() | |
| cramer_v = np.sqrt(chi2 / (n * (min(contingency.shape) - 1))) | |
| st.write(f"**Cramer's V (effect size):** {cramer_v:.4f}") | |
| else: | |
| st.info("βΉοΈ No significant association found") | |
| else: | |
| st.warning("β οΈ Need at least 2 categorical columns for this analysis") | |
| except Exception as e: | |
| st.error(f"β Error in bivariate analysis: {str(e)}") | |
| st.info("π‘ Tip: Check if selected columns have sufficient data for analysis") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| with tab5: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("π Multivariate Analysis") | |
| try: | |
| numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() | |
| if len(numeric_cols) >= 3: | |
| analysis_type = st.radio("Select analysis type", | |
| ["Correlation Matrix", "Parallel Coordinates", | |
| "3D Scatter", "Radar Chart"], | |
| horizontal=True, key="multivariate_type") | |
| if analysis_type == "Correlation Matrix": | |
| corr_matrix = df[numeric_cols].corr() | |
| fig = px.imshow(corr_matrix, | |
| text_auto=True, | |
| aspect="auto", | |
| color_continuous_scale='RdBu_r', | |
| title="Correlation Matrix Heatmap", | |
| zmin=-1, zmax=1) | |
| fig.update_layout(height=700) | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Find highly correlated pairs | |
| high_corr = [] | |
| for i in range(len(numeric_cols)): | |
| for j in range(i+1, len(numeric_cols)): | |
| if abs(corr_matrix.iloc[i, j]) > 0.7: | |
| high_corr.append({ | |
| 'Feature 1': numeric_cols[i], | |
| 'Feature 2': numeric_cols[j], | |
| 'Correlation': corr_matrix.iloc[i, j] | |
| }) | |
| if high_corr: | |
| st.subheader("π Highly Correlated Pairs (|r| > 0.7)") | |
| for item in high_corr: | |
| st.write(f"β’ **{item['Feature 1']}** & **{item['Feature 2']}**: {item['Correlation']:.4f}") | |
| elif analysis_type == "Parallel Coordinates": | |
| # Select dimensions | |
| selected_dims = st.multiselect("Select dimensions (columns)", | |
| numeric_cols, | |
| default=numeric_cols[:min(4, len(numeric_cols))]) | |
| if len(selected_dims) >= 2: | |
| # Optional color dimension | |
| color_dim = st.selectbox("Color by", ["None"] + numeric_cols + | |
| df.select_dtypes(include=['object', 'category']).columns.tolist()) | |
| plot_df = df[selected_dims].dropna() | |
| if len(plot_df) > 0: | |
| if color_dim == "None": | |
| fig = px.parallel_coordinates(plot_df, | |
| dimensions=selected_dims, | |
| title="Parallel Coordinates Plot") | |
| else: | |
| if color_dim in numeric_cols: | |
| fig = px.parallel_coordinates(plot_df, | |
| dimensions=selected_dims, | |
| color=color_dim, | |
| color_continuous_scale=px.colors.diverging.RdBu, | |
| title=f"Parallel Coordinates colored by {color_dim}") | |
| else: | |
| # Categorical color | |
| temp_df = df[selected_dims + [color_dim]].dropna() | |
| fig = px.parallel_coordinates(temp_df, | |
| dimensions=selected_dims, | |
| color=color_dim, | |
| title=f"Parallel Coordinates colored by {color_dim}") | |
| fig.update_layout(height=600) | |
| st.plotly_chart(fig, use_container_width=True) | |
| elif analysis_type == "3D Scatter": | |
| if len(numeric_cols) >= 3: | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| x_3d = st.selectbox("X axis", numeric_cols, key="3d_x") | |
| with col2: | |
| y_3d = st.selectbox("Y axis", [c for c in numeric_cols if c != x_3d], key="3d_y") | |
| with col3: | |
| z_3d = st.selectbox("Z axis", [c for c in numeric_cols if c not in [x_3d, y_3d]], | |
| key="3d_z") | |
| color_3d = st.selectbox("Color by", ["None"] + | |
| df.select_dtypes(include=['object', 'category']).columns.tolist()) | |
| plot_df = df[[x_3d, y_3d, z_3d]].dropna() | |
| if len(plot_df) > 0: | |
| if color_3d == "None": | |
| fig = px.scatter_3d(plot_df, x=x_3d, y=y_3d, z=z_3d, | |
| title=f"3D Scatter Plot", | |
| opacity=0.7) | |
| else: | |
| temp_df = df[[x_3d, y_3d, z_3d, color_3d]].dropna() | |
| fig = px.scatter_3d(temp_df, x=x_3d, y=y_3d, z=z_3d, | |
| color=color_3d, | |
| title=f"3D Scatter colored by {color_3d}", | |
| opacity=0.7) | |
| fig.update_layout(height=700) | |
| st.plotly_chart(fig, use_container_width=True) | |
| elif analysis_type == "Radar Chart": | |
| # Select features for radar | |
| radar_features = st.multiselect("Select features for radar chart", | |
| numeric_cols, | |
| default=numeric_cols[:min(5, len(numeric_cols))]) | |
| if len(radar_features) >= 3: | |
| # Select how many samples to show | |
| n_samples = st.slider("Number of samples to show", 1, min(10, len(df)), 3) | |
| fig = go.Figure() | |
| for i in range(n_samples): | |
| sample = df.iloc[i][radar_features].values | |
| fig.add_trace(go.Scatterpolar( | |
| r=sample, | |
| theta=radar_features, | |
| fill='toself', | |
| name=f'Sample {i}' | |
| )) | |
| fig.update_layout( | |
| polar=dict( | |
| radialaxis=dict( | |
| visible=True, | |
| range=[df[radar_features].min().min(), df[radar_features].max().max()] | |
| )), | |
| title=f"Radar Chart - First {n_samples} Samples", | |
| height=600 | |
| ) | |
| st.plotly_chart(fig, use_container_width=True) | |
| else: | |
| st.warning("β οΈ Need at least 3 numeric columns for multivariate analysis") | |
| except Exception as e: | |
| st.error(f"β Error in multivariate analysis: {str(e)}") | |
| st.info("π‘ Tip: Ensure you have enough numeric columns for multivariate analysis") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
with tab6:
    # Pattern Discovery tab.
    #
    # Offers four views over `df`, chosen with a radio button:
    #   * K-Means clustering on user-selected numeric features
    #   * IsolationForest outlier detection on user-selected numeric features
    #   * Moving-average + linear-regression trend on a datetime/numeric pair
    #   * Year / month / quarter / weekday averages on a datetime/numeric pair
    #
    # Every branch tells the user why nothing is drawn when its prerequisites
    # are not met, instead of silently rendering an empty tab.
    st.markdown('<div class="custom-card">', unsafe_allow_html=True)
    st.subheader("π― Pattern Discovery")
    try:
        analysis_type = st.radio("Select pattern discovery method",
                                 ["Clustering Visualization", "Outlier Detection",
                                  "Trend Detection", "Seasonal Patterns"],
                                 horizontal=True, key="pattern_type")
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

        if analysis_type == "Clustering Visualization":
            if len(numeric_cols) >= 2:
                # Imported lazily so scikit-learn is only required when this
                # view is actually used.
                from sklearn.cluster import KMeans
                from sklearn.preprocessing import StandardScaler

                # Select features for clustering
                cluster_features = st.multiselect("Select features for clustering",
                                                  numeric_cols,
                                                  default=numeric_cols[:min(3, len(numeric_cols))])
                if len(cluster_features) >= 2:
                    n_clusters = st.slider("Number of clusters", 2, 8, 3)
                    # Only rows complete in every selected feature can be clustered.
                    X = df[cluster_features].dropna()
                    if len(X) < n_clusters:
                        # KMeans cannot form more clusters than there are samples.
                        st.warning("Not enough complete rows for the selected number of clusters")
                    else:
                        # Standardise so no single feature dominates the distance metric.
                        scaler = StandardScaler()
                        X_scaled = scaler.fit_transform(X)
                        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                        clusters = kmeans.fit_predict(X_scaled)
                        # String labels make Plotly treat clusters as categories.
                        cluster_labels = clusters.astype(str)

                        # Passing the frame plus column names lets Plotly label
                        # the axes with the feature names in both 2D and 3D.
                        if len(cluster_features) == 2:
                            fig = px.scatter(X,
                                             x=cluster_features[0],
                                             y=cluster_features[1],
                                             color=cluster_labels,
                                             title=f"K-Means Clustering (k={n_clusters})",
                                             labels={'color': 'Cluster'})
                        else:
                            # Three or more features: plot the first three.
                            fig = px.scatter_3d(X,
                                                x=cluster_features[0],
                                                y=cluster_features[1],
                                                z=cluster_features[2],
                                                color=cluster_labels,
                                                title=f"K-Means Clustering (k={n_clusters})",
                                                labels={'color': 'Cluster'})
                        fig.update_layout(height=600)
                        st.plotly_chart(fig, use_container_width=True)

                        # Per-cluster mean of every selected feature (original units).
                        st.subheader("π Cluster Statistics")
                        cluster_stats = (X.assign(Cluster=clusters)
                                          .groupby('Cluster')[cluster_features]
                                          .mean())
                        st.dataframe(cluster_stats.style.format("{:.4f}"))
                else:
                    st.warning("Select at least 2 features for clustering")
            else:
                st.warning("Need at least 2 numeric columns for clustering")

        elif analysis_type == "Outlier Detection":
            if len(numeric_cols) >= 2:
                from sklearn.ensemble import IsolationForest

                # Select features for outlier detection
                outlier_features = st.multiselect("Select features for outlier detection",
                                                  numeric_cols,
                                                  default=numeric_cols[:min(3, len(numeric_cols))])
                if len(outlier_features) >= 2:
                    contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01)
                    X = df[outlier_features].dropna()
                    if len(X) > 0:
                        iso_forest = IsolationForest(contamination=contamination, random_state=42)
                        # fit_predict yields -1 for outliers and 1 for inliers.
                        outliers = iso_forest.fit_predict(X)
                        # Explicit category labels with a fixed colour per label,
                        # so outliers are always the red points regardless of
                        # how many of each class are present.
                        point_labels = np.where(outliers == -1, "Outlier", "Inlier")
                        color_map = {"Inlier": "blue", "Outlier": "red"}

                        if len(outlier_features) == 2:
                            fig = px.scatter(X,
                                             x=outlier_features[0],
                                             y=outlier_features[1],
                                             color=point_labels,
                                             color_discrete_map=color_map,
                                             title=f"Outlier Detection (contamination={contamination})",
                                             labels={'color': 'Status'})
                        else:
                            # Three or more features: plot the first three.
                            fig = px.scatter_3d(X,
                                                x=outlier_features[0],
                                                y=outlier_features[1],
                                                z=outlier_features[2],
                                                color=point_labels,
                                                color_discrete_map=color_map,
                                                title=f"Outlier Detection (contamination={contamination})",
                                                labels={'color': 'Status'})
                        fig.update_layout(height=600)
                        st.plotly_chart(fig, use_container_width=True)

                        # Outlier statistics
                        n_outliers = (outliers == -1).sum()
                        st.write(f"**Outliers detected:** {n_outliers} ({n_outliers/len(X)*100:.2f}%)")
                    else:
                        st.warning("No complete rows available for the selected features")
                else:
                    st.warning("Select at least 2 features for outlier detection")
            else:
                st.warning("Need at least 2 numeric columns for outlier detection")

        elif analysis_type == "Trend Detection":
            datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
            if datetime_cols and numeric_cols:
                date_col = st.selectbox("Select date column", datetime_cols)
                value_col = st.selectbox("Select value column", numeric_cols)
                # Chronologically ordered series with incomplete rows removed.
                ts_df = df[[date_col, value_col]].dropna().sort_values(date_col)
                if len(ts_df) > 10:
                    # Calculate moving averages
                    window = st.slider("Moving average window", 2, 30, 7)
                    ts_df['MA'] = ts_df[value_col].rolling(window=window).mean()

                    # Linear trend fitted against the row position (0, 1, 2, ...),
                    # so the slope is expressed per observation, not per calendar unit.
                    from sklearn.linear_model import LinearRegression
                    X = np.arange(len(ts_df)).reshape(-1, 1)
                    y = ts_df[value_col].values
                    model = LinearRegression()
                    model.fit(X, y)
                    trend = model.predict(X)

                    # Create visualization
                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df[value_col],
                                             mode='lines', name='Original'))
                    fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df['MA'],
                                             mode='lines', name=f'{window}-period MA',
                                             line=dict(color='orange')))
                    fig.add_trace(go.Scatter(x=ts_df[date_col], y=trend,
                                             mode='lines', name='Linear Trend',
                                             line=dict(color='red', dash='dash')))
                    fig.update_layout(title="Trend Detection",
                                      xaxis_title="Date",
                                      yaxis_title=value_col,
                                      height=500)
                    st.plotly_chart(fig, use_container_width=True)

                    # Trend statistics
                    slope = model.coef_[0]
                    st.write(f"**Trend slope:** {slope:.4f} units per time step")
                    if slope > 0:
                        st.success("β Upward trend detected")
                    elif slope < 0:
                        st.warning("β οΈ Downward trend detected")
                    else:
                        st.info("βΉοΈ No clear trend detected")
                else:
                    st.warning("Need more than 10 data points for trend detection")
            else:
                st.warning("Trend detection needs at least one datetime column and one numeric column")

        elif analysis_type == "Seasonal Patterns":
            datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
            if datetime_cols and numeric_cols:
                date_col = st.selectbox("Select date column", datetime_cols, key="seasonal_date")
                value_col = st.selectbox("Select value column", numeric_cols, key="seasonal_value")

                # Derive the calendar components that the four panels group by.
                df_temp = df[[date_col, value_col]].dropna()
                date_index = pd.DatetimeIndex(df_temp[date_col])
                df_temp['year'] = date_index.year
                df_temp['month'] = date_index.month
                df_temp['quarter'] = date_index.quarter
                df_temp['dayofweek'] = date_index.dayofweek

                # Create seasonal visualizations
                fig = make_subplots(rows=2, cols=2,
                                    subplot_titles=("Year-over-Year", "Monthly Pattern",
                                                    "Quarterly Pattern", "Day of Week Pattern"),
                                    specs=[[{"type": "xy"}, {"type": "xy"}],
                                           [{"type": "xy"}, {"type": "xy"}]])

                # Year-over-Year
                yearly_avg = df_temp.groupby('year')[value_col].mean().reset_index()
                fig.add_trace(go.Scatter(x=yearly_avg['year'], y=yearly_avg[value_col],
                                         mode='lines+markers', name="Yearly Avg"), row=1, col=1)

                # Monthly pattern (month numbers are 1-based)
                monthly_avg = df_temp.groupby('month')[value_col].mean().reset_index()
                month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
                fig.add_trace(go.Bar(x=[month_names[m-1] for m in monthly_avg['month']],
                                     y=monthly_avg[value_col], name="Monthly Avg"), row=1, col=2)

                # Quarterly pattern (quarter numbers are 1-based)
                quarterly_avg = df_temp.groupby('quarter')[value_col].mean().reset_index()
                quarter_names = ['Q1', 'Q2', 'Q3', 'Q4']
                fig.add_trace(go.Bar(x=[quarter_names[q-1] for q in quarterly_avg['quarter']],
                                     y=quarterly_avg[value_col], name="Quarterly Avg"), row=2, col=1)

                # Day of week pattern (dayofweek is 0-based, Monday first)
                dow_avg = df_temp.groupby('dayofweek')[value_col].mean().reset_index()
                day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
                fig.add_trace(go.Bar(x=[day_names[d] for d in dow_avg['dayofweek']],
                                     y=dow_avg[value_col], name="Day of Week Avg"), row=2, col=2)

                fig.update_layout(height=800, title_text="Seasonal Pattern Analysis")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Seasonal analysis needs at least one datetime column and one numeric column")
    except Exception as e:
        # UI boundary: report the failure in the tab instead of crashing the app.
        st.error(f"β Error in pattern discovery: {str(e)}")
        st.info("π‘ Tip: Ensure you have sufficient data for pattern detection")
    st.markdown('</div>', unsafe_allow_html=True)
| except Exception as e: | |
| st.error(f"β Critical error in EDA: {str(e)}") | |
| st.info("π‘ Please check your dataset and try again") | |
# Export section: offer a plain-text summary of the dataset as a download.
st.markdown("---")
st.markdown("### π₯ Export EDA Report")
try:
    # Gather every figure first so the report template below stays readable.
    row_count, column_count = df.shape
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    numeric_count = len(df.select_dtypes(include=[np.number]).columns.tolist())
    categorical_count = len(df.select_dtypes(include=['object', 'category']).columns)
    datetime_count = len(df.select_dtypes(include=['datetime64']).columns)
    missing_count = df.isnull().sum().sum()
    complete_count = df.dropna().shape[0]
    duplicate_count = df.duplicated().sum()

    # The leading and trailing empty entries reproduce the newline that
    # opens and closes the report text.
    report_lines = [
        "",
        "EXPLORATORY DATA ANALYSIS REPORT",
        "=================================",
        "Dataset Information:",
        f"β’ Total Rows: {row_count:,}",
        f"β’ Total Columns: {column_count}",
        f"β’ Memory Usage: {memory_mb:.2f} MB",
        "Column Types:",
        f"β’ Numeric: {numeric_count}",
        f"β’ Categorical: {categorical_count}",
        f"β’ Datetime: {datetime_count}",
        "Data Quality:",
        f"β’ Missing Values: {missing_count:,}",
        f"β’ Complete Cases: {complete_count:,}",
        f"β’ Duplicate Rows: {duplicate_count:,}",
        "Analysis Performed:",
        "β’ Data Overview",
        "β’ Missing Data Analysis",
        "β’ Univariate Analysis",
        "β’ Bivariate Analysis",
        "β’ Multivariate Analysis",
        "β’ Pattern Discovery",
        "",
    ]
    report_text = "\n".join(report_lines)

    st.download_button(
        label="π₯ Download EDA Report",
        data=report_text,
        file_name="eda_report.txt",
        mime="text/plain",
        use_container_width=True,
    )
except Exception as err:
    # Any failure while summarising the frame is shown in the UI.
    st.error(f"β Error generating report: {str(err)}")