Spaces:
No application file
No application file
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from scipy import stats | |
| def generate_business_insights(df): | |
| st.markdown(""" | |
| <div style='text-align: center; margin-bottom: 2rem;'> | |
| <h2>π‘ Automated Business Insights</h2> | |
| <p style='color: gray;'>AI-powered analysis to uncover hidden patterns and opportunities</p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # Get column types | |
| numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() | |
| categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() | |
| datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist() | |
| # Create tabs for different insight categories | |
| tab1, tab2, tab3, tab4, tab5 = st.tabs([ | |
| "π Overview", "π Trends & Patterns", "π― Key Drivers", | |
| "β οΈ Anomalies", "π‘ Recommendations" | |
| ]) | |
| with tab1: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("π Dataset Overview") | |
| # Key metrics | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric("Total Records", f"{df.shape[0]:,}") | |
| with col2: | |
| st.metric("Total Features", df.shape[1]) | |
| with col3: | |
| completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100 | |
| st.metric("Data Completeness", f"{completeness:.1f}%") | |
| with col4: | |
| if numeric_cols: | |
| total_value = df[numeric_cols].sum().sum() | |
| st.metric("Total Value", f"{total_value:,.0f}" if total_value < 1e6 else f"{total_value/1e6:,.1f}M") | |
| # Column composition | |
| st.markdown("### π Column Composition") | |
| comp_data = { | |
| 'Type': ['Numeric', 'Categorical', 'Datetime'], | |
| 'Count': [len(numeric_cols), len(categorical_cols), len(datetime_cols)] | |
| } | |
| fig = px.pie(comp_data, values='Count', names='Type', | |
| title="Column Type Distribution", | |
| color_discrete_sequence=px.colors.qualitative.Set3) | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Data quality score | |
| st.markdown("### π Data Quality Score") | |
| quality_score = 0 | |
| quality_metrics = [] | |
| # Completeness score | |
| completeness_score = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100 | |
| quality_metrics.append(completeness_score) | |
| # Uniqueness score (avoid duplicates) | |
| duplicate_pct = (df.duplicated().sum() / df.shape[0]) * 100 | |
| uniqueness_score = 100 - duplicate_pct | |
| quality_metrics.append(uniqueness_score) | |
| # Consistency score (data type consistency) | |
| type_consistency = 100 # Default high | |
| for col in df.columns: | |
| if df[col].dtype == 'object': | |
| # Check if column has mixed types | |
| try: | |
| pd.to_numeric(df[col], errors='raise') | |
| # If convertible to numeric, it's consistent | |
| except: | |
| pass # Object type is fine | |
| else: | |
| # Numeric columns are consistent | |
| pass | |
| quality_metrics.append(type_consistency) | |
| # Average quality score | |
| avg_quality = np.mean(quality_metrics) | |
| # Display gauge chart | |
| fig = go.Figure(go.Indicator( | |
| mode = "gauge+number", | |
| value = avg_quality, | |
| domain = {'x': [0, 1], 'y': [0, 1]}, | |
| title = {'text': "Overall Data Quality"}, | |
| gauge = { | |
| 'axis': {'range': [None, 100]}, | |
| 'bar': {'color': "darkblue"}, | |
| 'steps': [ | |
| {'range': [0, 50], 'color': "lightgray"}, | |
| {'range': [50, 80], 'color': "gray"}, | |
| {'range': [80, 100], 'color': "lightgreen"}], | |
| 'threshold': { | |
| 'line': {'color': "red", 'width': 4}, | |
| 'thickness': 0.75, | |
| 'value': 90}})) | |
| st.plotly_chart(fig, use_container_width=True) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| with tab2: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("π Trends & Patterns") | |
| if len(numeric_cols) > 0: | |
| # Correlation analysis | |
| if len(numeric_cols) >= 2: | |
| st.markdown("### π Key Relationships") | |
| corr_matrix = df[numeric_cols].corr() | |
| # Find strongest correlations | |
| corr_pairs = [] | |
| for i in range(len(numeric_cols)): | |
| for j in range(i+1, len(numeric_cols)): | |
| corr_pairs.append({ | |
| 'feature1': numeric_cols[i], | |
| 'feature2': numeric_cols[j], | |
| 'correlation': corr_matrix.iloc[i, j] | |
| }) | |
| corr_pairs.sort(key=lambda x: abs(x['correlation']), reverse=True) | |
| # Display top 5 correlations | |
| for pair in corr_pairs[:5]: | |
| strength = abs(pair['correlation']) | |
| if strength > 0.7: | |
| emoji = "π’" | |
| desc = "Strong" | |
| elif strength > 0.3: | |
| emoji = "π‘" | |
| desc = "Moderate" | |
| else: | |
| emoji = "π΄" | |
| desc = "Weak" | |
| direction = "positive" if pair['correlation'] > 0 else "negative" | |
| st.markdown( | |
| f"{emoji} **{pair['feature1']}** & **{pair['feature2']}**: " | |
| f"{pair['correlation']:.3f} ({desc} {direction} correlation)" | |
| ) | |
| # Insight | |
| if corr_pairs: | |
| st.info(f"π‘ **Insight**: {corr_pairs[0]['feature1']} and {corr_pairs[0]['feature2']} " | |
| f"have the strongest {'positive' if corr_pairs[0]['correlation'] > 0 else 'negative'} " | |
| f"relationship in the dataset.") | |
| # Distribution insights | |
| st.markdown("### π Distribution Analysis") | |
| skewness = df[numeric_cols].skew() | |
| skewed_cols = skewness[abs(skewness) > 1].index.tolist() | |
| if skewed_cols: | |
| st.warning(f"β οΈ **Skewed Features**: {', '.join(skewed_cols[:3])}" + | |
| (" and more" if len(skewed_cols) > 3 else "")) | |
| st.markdown("π‘ These features might benefit from transformation for better model performance.") | |
| # Show distribution of most skewed feature | |
| if skewed_cols: | |
| col_to_show = skewed_cols[0] | |
| fig = px.histogram(df, x=col_to_show, nbins=30, | |
| title=f"Distribution of {col_to_show} (Most Skewed)", | |
| marginal="box") | |
| st.plotly_chart(fig, use_container_width=True) | |
| else: | |
| st.info("No numeric columns available for trend analysis") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| with tab3: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("π― Key Business Drivers") | |
| if len(numeric_cols) > 0: | |
| # Find features with highest variance (potential impact) | |
| variances = df[numeric_cols].var().sort_values(ascending=False) | |
| st.markdown("### π High Variance Features") | |
| st.markdown("Features with high variance often indicate key business drivers") | |
| fig = px.bar(x=variances.index[:10], y=variances.values[:10], | |
| title="Top 10 Features by Variance", | |
| labels={'x': 'Feature', 'y': 'Variance'}) | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Feature importance based on mutual information | |
| if len(numeric_cols) >= 2: | |
| st.markdown("### π Predictive Power") | |
| # Use last numeric column as potential target | |
| target = numeric_cols[-1] | |
| features = numeric_cols[:-1] | |
| if len(features) > 0: | |
| from sklearn.feature_selection import mutual_info_regression | |
| mi_scores = mutual_info_regression(df[features].fillna(0), df[target].fillna(0)) | |
| mi_df = pd.DataFrame({ | |
| 'feature': features, | |
| 'importance': mi_scores | |
| }).sort_values('importance', ascending=False) | |
| fig = px.bar(mi_df.head(10), x='importance', y='feature', | |
| orientation='h', | |
| title=f"Feature Importance for Predicting {target}") | |
| st.plotly_chart(fig, use_container_width=True) | |
| st.info(f"π‘ **Key Driver**: {mi_df.iloc[0]['feature']} appears to be the most " | |
| f"important factor for predicting {target}") | |
| else: | |
| st.info("No numeric columns available for driver analysis") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| with tab4: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("β οΈ Anomaly Detection") | |
| if len(numeric_cols) > 0: | |
| # Outlier detection using IQR | |
| outlier_report = [] | |
| for col in numeric_cols: | |
| Q1 = df[col].quantile(0.25) | |
| Q3 = df[col].quantile(0.75) | |
| IQR = Q3 - Q1 | |
| outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)] | |
| outlier_pct = (len(outliers) / len(df)) * 100 | |
| if outlier_pct > 5: | |
| outlier_report.append({ | |
| 'column': col, | |
| 'outlier_pct': outlier_pct, | |
| 'lower_bound': Q1 - 1.5 * IQR, | |
| 'upper_bound': Q3 + 1.5 * IQR | |
| }) | |
| if outlier_report: | |
| st.warning(f"β οΈ Found {len(outlier_report)} columns with significant outliers") | |
| for item in outlier_report[:5]: | |
| st.markdown(f"**{item['column']}**: {item['outlier_pct']:.1f}% outliers " | |
| f"(outside [{item['lower_bound']:.2f}, {item['upper_bound']:.2f}])") | |
| # Visualize outliers for first column | |
| col_to_show = outlier_report[0]['column'] | |
| fig = px.box(df, y=col_to_show, title=f"Outliers in {col_to_show}") | |
| st.plotly_chart(fig, use_container_width=True) | |
| st.markdown("π‘ **Recommendation**: Investigate these outliers - they may represent " | |
| "unusual but important business events or data quality issues.") | |
| else: | |
| st.success("β No significant outliers detected in numeric columns") | |
| else: | |
| st.info("No numeric columns available for outlier detection") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| with tab5: | |
| st.markdown('<div class="custom-card">', unsafe_allow_html=True) | |
| st.subheader("π‘ Strategic Recommendations") | |
| # Generate business recommendations based on data insights | |
| recommendations = [] | |
| if len(numeric_cols) > 0: | |
| # Check for growth opportunities | |
| growth_cols = [] | |
| for col in numeric_cols: | |
| if df[col].min() >= 0 and df[col].max() > df[col].min() * 10: | |
| growth_cols.append(col) | |
| if growth_cols: | |
| recommendations.append({ | |
| 'area': 'Growth Opportunity', | |
| 'recommendation': f"Focus on {growth_cols[0]} which shows high variability " | |
| f"(range: {df[growth_cols[0]].min():.2f} to {df[growth_cols[0]].max():.2f})", | |
| 'priority': 'High' | |
| }) | |
| # Check for efficiency opportunities | |
| if len(numeric_cols) >= 2: | |
| # Find features with high correlation - potential redundancy | |
| corr_matrix = df[numeric_cols].corr() | |
| for i in range(len(numeric_cols)): | |
| for j in range(i+1, len(numeric_cols)): | |
| if abs(corr_matrix.iloc[i, j]) > 0.9: | |
| recommendations.append({ | |
| 'area': 'Efficiency', | |
| 'recommendation': f"Consider consolidating {numeric_cols[i]} and {numeric_cols[j]} " | |
| f"as they are highly correlated ({corr_matrix.iloc[i, j]:.2f})", | |
| 'priority': 'Medium' | |
| }) | |
| break | |
| if len(recommendations) > 3: | |
| break | |
| if categorical_cols: | |
| # Check for customer/market segments | |
| for col in categorical_cols[:2]: | |
| if df[col].nunique() > 1 and df[col].nunique() <= 10: | |
| top_segment = df[col].value_counts().index[0] | |
| recommendations.append({ | |
| 'area': 'Segmentation', | |
| 'recommendation': f"Target the dominant segment in {col}: '{top_segment}' " | |
| f"({df[col].value_counts().iloc[0]:,} records)", | |
| 'priority': 'Medium' | |
| }) | |
| # Display recommendations | |
| if recommendations: | |
| for rec in recommendations: | |
| priority_color = "π΄" if rec['priority'] == 'High' else "π‘" if rec['priority'] == 'Medium' else "π’" | |
| st.markdown(f"{priority_color} **{rec['area']}**: {rec['recommendation']}") | |
| else: | |
| st.info("No specific recommendations generated. Try uploading a dataset with more variety.") | |
| # Add download insights option | |
| st.markdown("---") | |
| st.markdown("### π₯ Export Insights") | |
| insight_text = f""" | |
| BUSINESS INSIGHTS REPORT | |
| ======================= | |
| Dataset: {df.shape[0]} rows Γ {df.shape[1]} columns | |
| KEY METRICS: | |
| β’ Total Records: {df.shape[0]:,} | |
| β’ Total Features: {df.shape[1]} | |
| β’ Data Completeness: {completeness:.1f}% | |
| COLUMN COMPOSITION: | |
| β’ Numeric: {len(numeric_cols)} | |
| β’ Categorical: {len(categorical_cols)} | |
| β’ Datetime: {len(datetime_cols)} | |
| RECOMMENDATIONS: | |
| """ | |
| for rec in recommendations: | |
| insight_text += f"\nβ’ {rec['area']}: {rec['recommendation']} (Priority: {rec['priority']})" | |
| st.download_button( | |
| label="π₯ Download Insights Report", | |
| data=insight_text, | |
| file_name="business_insights.txt", | |
| mime="text/plain", | |
| use_container_width=True | |
| ) | |
| st.markdown('</div>', unsafe_allow_html=True) |