Spaces:

entropy25
/

data-analysis-platform

Sleeping

App Files Community

entropy25 committed on Aug 9, 2025

Commit

e879f17

verified ·

1 Parent(s): 9dd7d77

Update analyzer.py

Browse files

Files changed (1) hide show

analyzer.py +277 -1115

analyzer.py CHANGED Viewed

@@ -1,1166 +1,328 @@
-import streamlit as st
 import pandas as pd
 import numpy as np
-import plotly.express as px
-import plotly.graph_objects as go
-import plotly.figure_factory as ff
-from plotly.subplots import make_subplots
-from typing import Dict, List, Any, Optional
-import os
-from dotenv import load_dotenv
-from data_handler import *
-# ML imports
 try:
     from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
-    from sklearn.linear_model import LogisticRegression, LinearRegression
-    from sklearn.model_selection import train_test_split
-    from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
-    from sklearn.preprocessing import LabelEncoder
     ML_AVAILABLE = True
 except ImportError:
     ML_AVAILABLE = False
-# Load environment variables
-load_dotenv()
-class AIAssistant:
-    """AI-powered analysis assistant"""
-    def __init__(self):
-        self.openai_key = os.getenv('OPENAI_API_KEY')
-        self.gemini_key = os.getenv('GOOGLE_API_KEY')
-        try:
-            import google.generativeai as genai
-            if self.gemini_key:
-                genai.configure(api_key=self.gemini_key)
-                self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
-        except ImportError:
-            pass
-    def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
-        """Get AI analysis of insights"""
-        summary = f"""
-        Dataset Summary:
-        - Shape: {df.shape}
-        - Columns: {list(df.columns)}
-        Key Insights:
-        """
-        for insight in insights:
-            summary += f"\n- {insight['insight']}"
-        prompt = f"""
-        As a senior data scientist, analyze this dataset and provide:
-        1. Business implications
-        2. Key opportunities and risks
-        3. Actionable recommendations
-        4. Suggestions for further analysis
-        {summary}
-        """
         try:
-            if hasattr(self, 'gemini_model'):
-                response = self.gemini_model.generate_content(prompt)
-                return response.text
-            else:
-                return "AI analysis not available. Please configure API keys in .env file."
         except Exception as e:
-            return f"AI Analysis Error: {str(e)}"
-class DataAnalysisWorkflow:
-    """Enhanced data analysis workflow with ML capabilities"""
-    def __init__(self, df: pd.DataFrame):
-        self.df = df
-        self.original_df = df.copy()  # Keep original for reference
-        self.stats = calculate_basic_stats(df)
-        self.column_types = get_column_types(df)
-        self.insights = []
-        self.ml_results = {}
-    def add_insight(self, insight: str, stage: int):
-        """Add insight to analysis report"""
-        self.insights.append({
-            'stage': stage,
-            'insight': insight,
-            'timestamp': pd.Timestamp.now()
-        })
-    def stage_1_overview(self):
-        """Stage 1: Enhanced Data Overview"""
-        st.subheader("📊 Data Overview")
-        # Key metrics with better formatting
-        col1, col2, col3, col4 = st.columns(4)
-        with col1:
-            st.metric("Total Rows", f"{self.stats['shape'][0]:,}")
-        with col2:
-            st.metric("Total Columns", f"{self.stats['shape'][1]:,}")
-        with col3:
-            missing_pct = (self.stats['missing_values'] / (self.stats['shape'][0] * self.stats['shape'][1])) * 100
-            st.metric("Missing Values", f"{self.stats['missing_values']:,}", f"{missing_pct:.1f}%")
-        with col4:
-            st.metric("Memory Usage", f"{self.stats['memory_usage']:.1f} MB")
-        # Enhanced data types visualization
-        if self.stats['dtypes']:
-            col1, col2 = st.columns(2)
-            with col1:
-                fig = px.pie(
-                    values=list(self.stats['dtypes'].values()),
-                    names=list(self.stats['dtypes'].keys()),
-                    title="Data Types Distribution",
-                    color_discrete_sequence=px.colors.qualitative.Set3
-                )
-                st.plotly_chart(fig, use_container_width=True)
-            with col2:
-                # Column overview table
-                column_info = []
-                for col in self.df.columns:
-                    column_info.append({
-                        'Column': col,
-                        'Type': str(self.df[col].dtype),
-                        'Non-Null': self.df[col].notna().sum(),
-                        'Unique': self.df[col].nunique()
-                    })
-                info_df = pd.DataFrame(column_info)
-                st.subheader("Column Details")
-                st.dataframe(info_df, use_container_width=True, height=300)
-        # Enhanced data preview
-        st.subheader("Data Preview")
-        preview_option = st.radio(
-            "Preview type:",
-            ["First 10 rows", "Last 10 rows", "Random sample", "Custom range"],
-            horizontal=True
-        )
-        if preview_option == "First 10 rows":
-            st.dataframe(self.df.head(10), use_container_width=True)
-        elif preview_option == "Last 10 rows":
-            st.dataframe(self.df.tail(10), use_container_width=True)
-        elif preview_option == "Random sample":
-            sample_size = min(10, len(self.df))
-            st.dataframe(self.df.sample(n=sample_size), use_container_width=True)
-        else:
-            col1, col2 = st.columns(2)
-            with col1:
-                start_row = st.number_input("Start row", 0, len(self.df)-1, 0)
-            with col2:
-                end_row = st.number_input("End row", start_row+1, len(self.df), min(start_row+10, len(self.df)))
-            st.dataframe(self.df.iloc[start_row:end_row], use_container_width=True)
-        # Missing values analysis
-        missing_df = calculate_missing_data(self.df)
-        if not missing_df.empty:
-            st.subheader("Missing Values Analysis")
-            # Visualize missing values
-            fig = px.bar(
-                missing_df,
-                x='Column',
-                y='Missing %',
-                title="Missing Values by Column",
-                color='Missing %',
-                color_continuous_scale='Reds'
-            )
-            st.plotly_chart(fig, use_container_width=True)
-            st.dataframe(missing_df, use_container_width=True)
-            worst_column = missing_df.iloc[0]['Column']
-            worst_percentage = missing_df.iloc[0]['Missing %']
-            self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
-        else:
-            st.success("✅ No missing values found - Excellent data quality!")
-            self.add_insight("Dataset has perfect completeness with no missing values", 1)
-    def stage_2_exploration(self):
-        """Stage 2: Enhanced Exploratory Data Analysis"""
-        st.subheader("🔍 Exploratory Data Analysis")
-        numeric_cols = self.column_types['numeric']
-        categorical_cols = self.column_types['categorical']
-        # Numeric analysis with enhanced visualizations
-        if numeric_cols:
-            st.subheader("Numeric Variables Analysis")
-            # Multi-column selection
-            selected_numerics = st.multiselect(
-                "Select numeric columns for analysis:",
-                numeric_cols,
-                default=numeric_cols[:3] if len(numeric_cols) >= 3 else numeric_cols
-            )
-            if selected_numerics:
-                # Distribution plots
-                st.subheader("Distribution Analysis")
-                if len(selected_numerics) == 1:
-                    col = selected_numerics[0]
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        fig = px.histogram(
-                            self.df,
-                            x=col,
-                            marginal="box",
-                            title=f"Distribution of {col}",
-                            nbins=50
-                        )
-                        st.plotly_chart(fig, use_container_width=True)
-                    with col2:
-                        # Q-Q plot
-                        from scipy import stats
-                        fig = go.Figure()
-                        # Remove NaN values for Q-Q plot
-                        clean_data = self.df[col].dropna()
-                        if len(clean_data) > 0:
-                            qq = stats.probplot(clean_data, dist="norm")
-                            fig.add_trace(go.Scatter(
-                                x=qq[0][0],
-                                y=qq[0][1],
-                                mode='markers',
-                                name='Data points'
-                            ))
-                            fig.add_trace(go.Scatter(
-                                x=qq[0][0],
-                                y=qq[1][1] + qq[1][0] * qq[0][0],
-                                mode='lines',
-                                name='Normal distribution line',
-                                line=dict(color='red')
-                            ))
-                            fig.update_layout(
-                                title=f"Q-Q Plot: {col}",
-                                xaxis_title="Theoretical Quantiles",
-                                yaxis_title="Sample Quantiles"
-                            )
-                            st.plotly_chart(fig, use_container_width=True)
-                else:
-                    # Multiple distributions
-                    fig = make_subplots(
-                        rows=len(selected_numerics),
-                        cols=1,
-                        subplot_titles=selected_numerics,
-                        vertical_spacing=0.05
-                    )
-                    for i, col in enumerate(selected_numerics, 1):
-                        fig.add_trace(
-                            go.Histogram(x=self.df[col], name=col, nbinsx=30),
-                            row=i, col=1
-                        )
-                    fig.update_layout(height=200 * len(selected_numerics), showlegend=False)
-                    st.plotly_chart(fig, use_container_width=True)
-                # Statistical summary
-                st.subheader("Statistical Summary")
-                summary_stats = self.df[selected_numerics].describe()
-                st.dataframe(summary_stats, use_container_width=True)
-                # Correlation analysis
-                if len(selected_numerics) > 1:
-                    st.subheader("Correlation Analysis")
-                    corr_matrix = self.df[selected_numerics].corr()
-                    # Enhanced correlation heatmap
-                    fig = px.imshow(
-                        corr_matrix,
-                        text_auto=True,
-                        aspect="auto",
-                        title="Correlation Matrix",
-                        color_continuous_scale='RdBu',
-                        zmin=-1, zmax=1
-                    )
-                    fig.update_layout(height=500)
-                    st.plotly_chart(fig, use_container_width=True)
-                    # Find strongest correlations
-                    corr_pairs = []
-                    for i in range(len(corr_matrix.columns)):
-                        for j in range(i+1, len(corr_matrix.columns)):
-                            corr_val = corr_matrix.iloc[i, j]
-                            if abs(corr_val) > 0.1:  # Only show meaningful correlations
-                                corr_pairs.append({
-                                    'Variable 1': corr_matrix.columns[i],
-                                    'Variable 2': corr_matrix.columns[j],
-                                    'Correlation': corr_val,
-                                    'Strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate' if abs(corr_val) > 0.3 else 'Weak'
-                                })
-                    if corr_pairs:
-                        corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
-                        st.subheader("Top Correlations")
-                        st.dataframe(corr_df, use_container_width=True)
-                        strongest = corr_df.iloc[0]
-                        self.add_insight(
-                            f"Strongest correlation: {strongest['Variable 1']} vs {strongest['Variable 2']} ({strongest['Correlation']:.3f})",
-                            2
-                        )
-        # Enhanced categorical analysis
-        if categorical_cols:
-            st.subheader("Categorical Variables Analysis")
-            selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
-            value_counts = get_value_counts(self.df, selected_categorical, 15)  # Top 15
-            col1, col2 = st.columns(2)
-            with col1:
-                # Bar chart
-                fig = px.bar(
-                    x=value_counts.values,
-                    y=value_counts.index,
-                    orientation='h',
-                    title=f"Top Categories in {selected_categorical}",
-                    color=value_counts.values,
-                    color_continuous_scale='viridis'
-                )
-                fig.update_layout(height=400, yaxis={'categoryorder':'total ascending'})
-                st.plotly_chart(fig, use_container_width=True)
-            with col2:
-                # Pie chart for top categories
-                top_5 = value_counts.head(5)
-                others = value_counts.iloc[5:].sum() if len(value_counts) > 5 else 0
-                if others > 0:
-                    pie_data = list(top_5.values) + [others]
-                    pie_labels = list(top_5.index) + ['Others']
-                else:
-                    pie_data = list(top_5.values)
-                    pie_labels = list(top_5.index)
-                fig = px.pie(
-                    values=pie_data,
-                    names=pie_labels,
-                    title=f"Distribution of {selected_categorical}",
-                    color_discrete_sequence=px.colors.qualitative.Set3
-                )
-                st.plotly_chart(fig, use_container_width=True)
-            # Category statistics
-            total_categories = self.df[selected_categorical].nunique()
-            most_common = value_counts.index[0]
-            most_common_pct = (value_counts.iloc[0] / len(self.df)) * 100
-            st.metric("Total Unique Categories", total_categories)
-            st.metric("Most Common Category", f"{most_common} ({most_common_pct:.1f}%)")
-            self.add_insight(f"Column '{selected_categorical}' has {total_categories} categories, dominated by '{most_common}' ({most_common_pct:.1f}%)", 2)
-    def stage_3_quality_check(self):
-        """Stage 3: Enhanced Data Quality Assessment"""
-        st.subheader("🧹 Data Quality Assessment")
-        quality_score = 100
-        issues = []
-        # Missing values check
-        if self.stats['missing_values'] > 0:
-            missing_pct = (self.stats['missing_values'] / (self.stats['shape'][0] * self.stats['shape'][1])) * 100
-            st.warning(f"⚠️ Found {self.stats['missing_values']:,} missing values ({missing_pct:.2f}%)")
-            quality_score -= min(missing_pct * 2, 30)
-            issues.append("Missing values detected")
-        else:
-            st.success("✅ No missing values")
-        # Duplicates check
-        if self.stats['duplicates'] > 0:
-            dup_pct = (self.stats['duplicates'] / self.stats['shape'][0]) * 100
-            st.warning(f"⚠️ Found {self.stats['duplicates']:,} duplicate rows ({dup_pct:.2f}%)")
-            quality_score -= min(dup_pct * 3, 25)
-            issues.append("Duplicate rows found")
-        else:
-            st.success("✅ No duplicate rows")
-        # Outlier detection with enhanced visualization
-        numeric_cols = self.column_types['numeric']
-        if numeric_cols:
-            st.subheader("Outlier Detection")
-            outlier_summary = []
-            for col in numeric_cols:
-                outliers = calculate_outliers(self.df, col)
-                outlier_pct = (len(outliers) / len(self.df)) * 100
-                outlier_summary.append({
-                    'Column': col,
-                    'Outliers': len(outliers),
-                    'Percentage': outlier_pct,
-                    'Status': '⚠️ High' if outlier_pct > 10 else '⚡ Medium' if outlier_pct > 5 else '✅ Low'
-                })
-            outlier_df = pd.DataFrame(outlier_summary)
-            st.dataframe(outlier_df, use_container_width=True)
-            # Visualize outliers
-            selected_col = st.selectbox("Select column for detailed outlier analysis:", numeric_cols)
-            col1, col2 = st.columns(2)
-            with col1:
-                fig = px.box(
-                    self.df,
-                    y=selected_col,
-                    title=f"Box Plot: {selected_col}",
-                    points="outliers"
-                )
-                st.plotly_chart(fig, use_container_width=True)
-            with col2:
-                # Outlier details
-                outliers = calculate_outliers(self.df, selected_col)
-                if len(outliers) > 0:
-                    st.metric("Outliers Found", len(outliers))
-                    st.metric("Outlier Percentage", f"{len(outliers)/len(self.df)*100:.2f}%")
-                    if len(outliers) <= 100:  # Show outlier values if not too many
-                        st.subheader("Outlier Values")
-                        st.dataframe(outliers[[selected_col]].head(20), use_container_width=True)
                 else:
-                    st.success("✅ No outliers detected")
-            # Adjust quality score based on outliers
-            total_outlier_pct = sum([row['Percentage'] for row in outlier_summary]) / len(outlier_summary)
-            quality_score -= min(total_outlier_pct, 20)
-        # Data consistency checks
-        st.subheader("Data Consistency Analysis")
-        consistency_issues = []
-        # Check for mixed data types in object columns
-        for col in self.column_types['categorical']:
-            unique_types = set(type(x).__name__ for x in self.df[col].dropna().head(100))
-            if len(unique_types) > 1:
-                consistency_issues.append(f"Mixed data types in column '{col}': {unique_types}")
-        # Check for unusual string patterns
-        for col in self.column_types['categorical']:
-            sample_values = self.df[col].dropna().head(50).astype(str)
-            if sample_values.str.contains(r'^[0-9]+$').any() and sample_values.str.contains(r'[a-zA-Z]').any():
-                consistency_issues.append(f"Mixed numeric/text patterns in column '{col}'")
-        if consistency_issues:
-            for issue in consistency_issues:
-                st.warning(f"⚠️ {issue}")
-            quality_score -= len(consistency_issues) * 5
-        else:
-            st.success("✅ Data types are consistent")
-        # Overall quality score
-        st.subheader("Overall Data Quality Score")
-        quality_score = max(0, min(100, quality_score))  # Ensure 0-100 range
-        col1, col2, col3 = st.columns(3)
-        with col2:
-            if quality_score >= 90:
-                st.success(f"🏆 Excellent Quality: {quality_score:.0f}/100")
-                quality_level = "Excellent"
-            elif quality_score >= 75:
-                st.info(f"👍 Good Quality: {quality_score:.0f}/100")
-                quality_level = "Good"
-            elif quality_score >= 60:
-                st.warning(f"⚠️ Fair Quality: {quality_score:.0f}/100")
-                quality_level = "Fair"
-            else:
-                st.error(f"❌ Poor Quality: {quality_score:.0f}/100")
-                quality_level = "Poor"
-        # Action recommendations
-        if issues:
-            st.subheader("📋 Recommended Actions")
-            for i, issue in enumerate(issues, 1):
-                st.write(f"{i}. Address {issue}")
-            self.add_insight(f"Data quality: {quality_level} ({quality_score:.0f}/100) - {len(issues)} issues identified", 3)
-        else:
-            st.success("🎉 No major data quality issues found!")
-            self.add_insight(f"Excellent data quality ({quality_score:.0f}/100) with no major issues", 3)
-    def stage_4_advanced_analysis(self):
-        """Stage 4: Advanced Statistical Analysis"""
-        st.subheader("🔬 Advanced Analysis")
-        numeric_cols = self.column_types['numeric']
-        categorical_cols = self.column_types['categorical']
-        # Advanced relationship analysis
-        if len(numeric_cols) >= 2:
-            st.subheader("🔗 Advanced Relationship Analysis")
-            # Scatter plot matrix for multiple variables
-            if len(numeric_cols) >= 3:
-                st.subheader("Scatter Plot Matrix")
-                selected_vars = st.multiselect(
-                    "Select variables for scatter plot matrix:",
-                    numeric_cols,
-                    default=numeric_cols[:4] if len(numeric_cols) >= 4 else numeric_cols
-                )
-                if len(selected_vars) >= 2:
-                    # Sample data for performance
-                    sample_size = min(1000, len(self.df))
-                    sample_df = self.df[selected_vars].sample(n=sample_size) if len(self.df) > sample_size else self.df[selected_vars]
-                    fig = px.scatter_matrix(
-                        sample_df,
-                        dimensions=selected_vars,
-                        title="Scatter Plot Matrix"
-                    )
-                    fig.update_layout(height=600)
-                    st.plotly_chart(fig, use_container_width=True)
-            # Pairwise analysis
-            st.subheader("Detailed Pairwise Analysis")
-            col1, col2 = st.columns(2)
-            with col1:
-                x_var = st.selectbox("X Variable:", numeric_cols, key="x_var_advanced")
-            with col2:
-                y_var = st.selectbox("Y Variable:", [col for col in numeric_cols if col != x_var], key="y_var_advanced")
-            # Color by categorical variable option
-            color_var = None
-            if categorical_cols:
-                use_color = st.checkbox("Color by categorical variable")
-                if use_color:
-                    color_var = st.selectbox("Color variable:", categorical_cols)
-            # Create enhanced scatter plot
-            sample_size = min(5000, len(self.df))
-            plot_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
-            fig = px.scatter(
-                plot_df,
-                x=x_var,
-                y=y_var,
-                color=color_var,
-                title=f"Advanced Analysis: {x_var} vs {y_var}",
-                trendline="ols",
-                marginal_x="histogram",
-                marginal_y="histogram"
-            )
-            st.plotly_chart(fig, use_container_width=True)
-            # Statistical analysis
-            correlation = self.df[x_var].corr(self.df[y_var])
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("Correlation", f"{correlation:.3f}")
-            with col2:
-                r_squared = correlation ** 2
-                st.metric("R²", f"{r_squared:.3f}")
-            with col3:
-                if abs(correlation) > 0.7:
-                    strength = "Strong"
-                elif abs(correlation) > 0.3:
-                    strength = "Moderate"
-                else:
-                    strength = "Weak"
-                st.metric("Relationship", strength)
-            self.add_insight(f"Advanced analysis: {strength} relationship between {x_var} and {y_var} (r={correlation:.3f})", 4)
-        # Group comparison analysis
-        if categorical_cols and numeric_cols:
-            st.subheader("📊 Group Comparison Analysis")
-            col1, col2 = st.columns(2)
-            with col1:
-                group_var = st.selectbox("Group by:", categorical_cols, key="group_var_advanced")
-            with col2:
-                metric_var = st.selectbox("Analyze metric:", numeric_cols, key="metric_var_advanced")
-            # Calculate group statistics
-            group_stats = calculate_group_stats(self.df, group_var, metric_var)
-            # Enhanced group visualization
-            unique_groups = self.df[group_var].nunique()
-            if unique_groups <= 20:
-                col1, col2 = st.columns(2)
-                with col1:
-                    # Box plot
-                    fig = px.box(
-                        self.df,
-                        x=group_var,
-                        y=metric_var,
-                        title=f"{metric_var} Distribution by {group_var}",
-                        points="outliers"
-                    )
-                    fig.update_xaxes(tickangle=45)
-                    st.plotly_chart(fig, use_container_width=True)
-                with col2:
-                    # Violin plot
-                    fig = px.violin(
-                        self.df,
-                        x=group_var,
-                        y=metric_var,
-                        title=f"{metric_var} Density by {group_var}",
-                        box=True
-                    )
-                    fig.update_xaxes(tickangle=45)
-                    st.plotly_chart(fig, use_container_width=True)
-                # Statistical comparison
-                st.subheader("Statistical Comparison")
-                st.dataframe(group_stats, use_container_width=True)
-                # Identify best performing group
-                best_group = group_stats['mean'].idxmax()
-                best_value = group_stats.loc[best_group, 'mean']
-                worst_group = group_stats['mean'].idxmin()
-                worst_value = group_stats.loc[worst_group, 'mean']
-                col1, col2 = st.columns(2)
-                with col1:
-                    st.metric("Best Performing Group", best_group, f"Avg: {best_value:.2f}")
-                with col2:
-                    st.metric("Lowest Performing Group", worst_group, f"Avg: {worst_value:.2f}")
-                self.add_insight(f"Group analysis: '{best_group}' performs best with average {metric_var} of {best_value:.2f}", 4)
-            else:
-                st.info(f"Too many groups ({unique_groups}) for detailed visualization. Showing summary statistics only.")
-                st.dataframe(group_stats.head(15), use_container_width=True)
-    def stage_5_ml_modeling(self):
-        """Stage 5: Machine Learning Modeling"""
-        st.subheader("🤖 Machine Learning Modeling")
-        if not ML_AVAILABLE:
-            st.warning("⚠️ Machine Learning libraries not available. Please install scikit-learn to use this feature.")
-            st.code("pip install scikit-learn")
-            return
-        numeric_cols = self.column_types['numeric']
-        categorical_cols = self.column_types['categorical']
-        if len(numeric_cols) < 2:
-            st.warning("⚠️ Need at least 2 numeric columns for ML modeling.")
-            return
-        st.info("🎯 Automated machine learning model training and evaluation")
-        # Model configuration
-        st.subheader("Model Configuration")
-        col1, col2 = st.columns(2)
-        with col1:
-            target_column = st.selectbox(
-                "Select target variable (what to predict):",
-                numeric_cols + categorical_cols
-            )
-        with col2:
-            model_type = st.radio(
-                "Problem type:",
-                ["Auto-detect", "Regression", "Classification"]
-            )
-        # Feature selection
-        available_features = [col for col in numeric_cols if col != target_column]
-        if len(available_features) == 0:
-            st.error("❌ No suitable features available for modeling.")
-            return
-        selected_features = st.multiselect(
-            "Select features (leave empty for auto-selection):",
-            available_features,
-            default=available_features[:5] if len(available_features) >= 5 else available_features
-        )
-        if not selected_features:
-            selected_features = available_features[:10]  # Auto-select top 10
-        if st.button("🚀 Train Models", type="primary"):
-            try:
-                with st.spinner("Training machine learning models..."):
-                    self._train_ml_models(target_column, selected_features, model_type)
-                st.success("✅ Models trained successfully!")
-            except Exception as e:
-                st.error(f"❌ Model training failed: {str(e)}")
-        # Display results if available
-        if hasattr(self, 'ml_results') and self.ml_results:
-            self._display_ml_results()
-    def _train_ml_models(self, target_col: str, feature_cols: List[str], model_type: str):
-        """Train ML models"""
-        # Prepare data
-        X = self.df[feature_cols].copy()
-        y = self.df[target_col].copy()
-        # Handle missing values
-        X = X.fillna(X.mean())
-        y = y.fillna(y.mean() if y.dtype in ['int64', 'float64'] else y.mode()[0])
-        # Auto-detect problem type
-        if model_type == "Auto-detect":
-            if y.dtype == 'object' or y.nunique() < 10:
-                model_type = "Classification"
-            else:
-                model_type = "Regression"
-        # Encode categorical target if needed
-        label_encoder = None
-        if model_type == "Classification" and y.dtype == 'object':
-            label_encoder = LabelEncoder()
-            y = label_encoder.fit_transform(y)
-        # Split data
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.2, random_state=42, stratify=y if model_type == "Classification" else None
-        )
-        # Train models
-        models = {}
-        results = {}
-        if model_type == "Regression":
-            models = {
-                "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
-                "Linear Regression": LinearRegression()
-            }
-        else:
-            models = {
-                "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
-                "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000)
             }
-        for name, model in models.items():
-            # Train model
-            model.fit(X_train, y_train)
-            # Make predictions
-            y_pred = model.predict(X_test)
-            # Calculate metrics
-            if model_type == "Regression":
-                r2 = r2_score(y_test, y_pred)
-                mse = mean_squared_error(y_test, y_pred)
-                results[name] = {
-                    "R² Score": r2,
-                    "MSE": mse,
-                    "RMSE": np.sqrt(mse)
-                }
-            else:
-                accuracy = accuracy_score(y_test, y_pred)
-                results[name] = {
-                    "Accuracy": accuracy
-                }
-            # Feature importance
-            if hasattr(model, 'feature_importances_'):
-                feature_importance = pd.DataFrame({
-                    'feature': feature_cols,
-                    'importance': model.feature_importances_
-                }).sort_values('importance', ascending=False)
-                results[name]['feature_importance'] = feature_importance
-        # Store results
-        self.ml_results = {
-            'model_type': model_type,
-            'target_column': target_col,
-            'feature_columns': feature_cols,
-            'results': results,
-            'label_encoder': label_encoder,
-            'test_size': len(X_test)
-        }
-        # Add insight
-        best_model = max(results.keys(), key=lambda x:
-            results[x]['R² Score'] if model_type == "Regression" else results[x]['Accuracy']
-        )
-        best_score = (results[best_model]['R² Score'] if model_type == "Regression"
-                     else results[best_model]['Accuracy'])
-        self.add_insight(f"ML modeling: Best {model_type.lower()} model is {best_model} with score {best_score:.3f}", 5)
-    def _display_ml_results(self):
-        """Display ML modeling results"""
-        st.subheader("🎯 Model Performance Results")
-        results = self.ml_results['results']
-        model_type = self.ml_results['model_type']
-        # Performance comparison
-        performance_data = []
-        for model_name, metrics in results.items():
-            row = {'Model': model_name}
-            for metric, value in metrics.items():
-                if metric != 'feature_importance':
-                    row[metric] = value
-            performance_data.append(row)
-        performance_df = pd.DataFrame(performance_data)
-        st.dataframe(performance_df, use_container_width=True)
-        # Visualize performance
-        if model_type == "Regression":
-            metric_to_plot = "R² Score"
-        else:
-            metric_to_plot = "Accuracy"
-        fig = px.bar(
-            performance_df,
-            x='Model',
-            y=metric_to_plot,
-            title=f"Model Performance Comparison ({metric_to_plot})",
-            color=metric_to_plot,
-            color_continuous_scale='viridis'
-        )
-        st.plotly_chart(fig, use_container_width=True)
-        # Feature importance analysis
-        st.subheader("🔍 Feature Importance Analysis")
-        # Get feature importance from best model
-        best_model = max(results.keys(), key=lambda x:
-            results[x][metric_to_plot]
-        )
-        if 'feature_importance' in results[best_model]:
-            importance_df = results[best_model]['feature_importance']
-            col1, col2 = st.columns(2)
-            with col1:
-                # Bar plot
-                fig = px.bar(
-                    importance_df.head(10),
-                    x='importance',
-                    y='feature',
-                    orientation='h',
-                    title=f"Top 10 Feature Importance ({best_model})",
-                    color='importance',
-                    color_continuous_scale='plasma'
-                )
-                fig.update_layout(yaxis={'categoryorder':'total ascending'})
-                st.plotly_chart(fig, use_container_width=True)
-            with col2:
-                # Show importance table
-                st.subheader("Feature Rankings")
-                st.dataframe(importance_df.head(10), use_container_width=True)
-            # Top features insight
-            top_feature = importance_df.iloc[0]['feature']
-            top_importance = importance_df.iloc[0]['importance']
-            self.add_insight(f"Most important feature: '{top_feature}' (importance: {top_importance:.3f})", 5)
-        # Model recommendations
-        st.subheader("📋 Model Recommendations")
-        best_score = results[best_model][metric_to_plot]
-        if model_type == "Regression":
-            if best_score > 0.8:
-                st.success(f"🏆 Excellent model performance! {best_model} explains {best_score*100:.1f}% of the variance.")
-            elif best_score > 0.6:
-                st.info(f"👍 Good model performance. {best_model} explains {best_score*100:.1f}% of the variance.")
-            else:
-                st.warning(f"⚠️ Model performance could be improved. Consider feature engineering or more advanced models.")
-        else:
-            if best_score > 0.9:
-                st.success(f"🏆 Excellent classification accuracy: {best_score*100:.1f}%")
-            elif best_score > 0.8:
-                st.info(f"👍 Good classification accuracy: {best_score*100:.1f}%")
-            else:
-                st.warning(f"⚠️ Classification accuracy could be improved: {best_score*100:.1f}%")
-    def stage_6_summary(self):
-        """Stage 6: Enhanced Summary and Export"""
-        st.subheader("📈 Analysis Summary & Export")
-        # Key metrics overview
-        col1, col2, col3, col4 = st.columns(4)
-        with col1:
-            st.metric("Total Insights Generated", len(self.insights))
-        with col2:
-            quality = "High" if self.stats['missing_values'] == 0 and self.stats['duplicates'] == 0 else "Medium"
-            st.metric("Data Quality", quality)
-        with col3:
-            analysis_completeness = "100%" if len(self.insights) >= 5 else f"{len(self.insights)*20}%"
-            st.metric("Analysis Complete", analysis_completeness)
-        with col4:
-            ml_status = "✅" if hasattr(self, 'ml_results') and self.ml_results else "➖"
-            st.metric("ML Models", ml_status)
-        # Insights timeline
-        st.subheader("🔍 Key Insights Timeline")
-        insights_by_stage = {}
-        for insight in self.insights:
-            stage = insight['stage']
-            if stage not in insights_by_stage:
-                insights_by_stage[stage] = []
-            insights_by_stage[stage].append(insight)
-        stage_names = {
-            1: "📊 Data Overview",
-            2: "🔍 Exploration",
-            3: "🧹 Quality Check",
-            4: "🔬 Advanced Analysis",
-            5: "🤖 ML Modeling",
-            6: "📈 Summary"
-        }
-        for stage_num in sorted(insights_by_stage.keys()):
-            with st.expander(f"{stage_names.get(stage_num, f'Stage {stage_num}')} - {len(insights_by_stage[stage_num])} insights"):
-                for i, insight in enumerate(insights_by_stage[stage_num], 1):
-                    st.write(f"{i}. {insight['insight']}")
-                    st.caption(f"Generated: {insight['timestamp'].strftime('%H:%M:%S')}")
-        # Executive summary with AI
-        st.subheader("🤖 AI-Powered Executive Summary")
-        ai_assistant = AIAssistant()
-        if st.button("Generate AI Summary", type="primary"):
-            with st.spinner("Generating AI-powered analysis summary..."):
-                ai_summary = ai_assistant.analyze_insights(self.df, self.insights)
-                st.markdown("### 📋 Executive Summary")
-                st.markdown(ai_summary)
-                # Store AI summary for export
-                self.ai_summary = ai_summary
-        # Export options
-        st.subheader("📥 Export Results")
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            if st.button("📄 Generate Report"):
-                report = self._generate_comprehensive_report()
-                st.download_button(
-                    label="📥 Download Analysis Report",
-                    data=report,
-                    file_name=f"analysis_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.txt",
-                    mime="text/plain"
-                )
-        with col2:
-            if st.button("📊 Export Data Summary"):
-                summary_data = self._generate_data_summary()
-                st.download_button(
-                    label="📥 Download Data Summary (CSV)",
-                    data=summary_data.to_csv(index=False),
-                    file_name=f"data_summary_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
-                    mime="text/csv"
-                )
-        with col3:
-            if hasattr(self, 'ml_results') and self.ml_results:
-                if st.button("🤖 Export ML Results"):
-                    ml_report = self._generate_ml_report()
-                    st.download_button(
-                        label="📥 Download ML Report",
-                        data=ml_report,
-                        file_name=f"ml_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.txt",
-                        mime="text/plain"
-                    )
-        # Analysis completion celebration
-        if len(self.insights) >= 5:
-            st.balloons()
-            st.success("🎉 Comprehensive analysis completed successfully!")
-    def _generate_comprehensive_report(self) -> str:
-        """Generate comprehensive analysis report"""
-        report = f"""
-COMPREHENSIVE DATA ANALYSIS REPORT
-{'='*50}
-DATASET OVERVIEW
-{'-'*20}
-• Dataset Shape: {self.stats['shape'][0]:,} rows × {self.stats['shape'][1]:,} columns
-• Memory Usage: {self.stats['memory_usage']:.2f} MB
-• Missing Values: {self.stats['missing_values']:,} ({self.stats['missing_values']/(self.stats['shape'][0]*self.stats['shape'][1])*100:.2f}%)
-• Duplicate Rows: {self.stats['duplicates']:,}
-DATA TYPES DISTRIBUTION
-{'-'*25}
-"""
-        for dtype, count in self.stats['dtypes'].items():
-            report += f"• {dtype}: {count} columns\n"
-        report += f"""
-KEY INSIGHTS BY ANALYSIS STAGE
-{'-'*35}
-"""
-        stage_names = {
-            1: "Data Overview",
-            2: "Exploratory Analysis",
-            3: "Quality Assessment",
-            4: "Advanced Analysis",
-            5: "Machine Learning",
-            6: "Summary"
-        }
-        for i, insight in enumerate(self.insights, 1):
-            stage_name = stage_names.get(insight['stage'], f"Stage {insight['stage']}")
-            report += f"\n{i}. [{stage_name}] {insight['insight']}"
-        # Add ML results if available
-        if hasattr(self, 'ml_results') and self.ml_results:
-            report += f"""
-MACHINE LEARNING RESULTS
-{'-'*25}
-• Problem Type: {self.ml_results['model_type']}
-• Target Variable: {self.ml_results['target_column']}
-• Features Used: {len(self.ml_results['feature_columns'])}
-• Test Set Size: {self.ml_results['test_size']} samples
-Model Performance:
-"""
-            for model_name, metrics in self.ml_results['results'].items():
-                report += f"\n{model_name}:\n"
-                for metric, value in metrics.items():
-                    if metric != 'feature_importance':
-                        report += f"  • {metric}: {value:.4f}\n"
-        # Add AI summary if available
-        if hasattr(self, 'ai_summary'):
-            report += f"""
-AI-POWERED EXECUTIVE SUMMARY
-{'-'*30}
-{self.ai_summary}
-"""
-        report += f"""
-ANALYSIS METADATA
-{'-'*18}
-• Total Insights Generated: {len(self.insights)}
-• Analysis Completion Time: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
-• Platform: Enhanced Data Analysis Platform v2.0
-{'-'*50}
-Report generated automatically by Enhanced Data Analysis Platform
-"""
-        return report
-    def _generate_data_summary(self) -> pd.DataFrame:
-        """Generate data summary for export"""
-        summary_data = []
-        for col in self.df.columns:
-            col_info = {
-                'Column': col,
-                'Data_Type': str(self.df[col].dtype),
-                'Non_Null_Count': self.df[col].notna().sum(),
-                'Missing_Count': self.df[col].isna().sum(),
-                'Missing_Percentage': (self.df[col].isna().sum() / len(self.df)) * 100,
-                'Unique_Values': self.df[col].nunique(),
-                'Most_Common_Value': str(self.df[col].mode().iloc[0]) if not self.df[col].mode().empty else 'N/A'
-            }
-            if self.df[col].dtype in ['int64', 'float64']:
-                col_info.update({
-                    'Mean': self.df[col].mean(),
-                    'Median': self.df[col].median(),
-                    'Std_Dev': self.df[col].std(),
-                    'Min_Value': self.df[col].min(),
-                    'Max_Value': self.df[col].max()
-                })
-            summary_data.append(col_info)
-        return pd.DataFrame(summary_data)
-    def _generate_ml_report(self) -> str:
-        """Generate ML-specific report"""
-        if not hasattr(self, 'ml_results') or not self.ml_results:
-            return "No ML results available."
-        ml_report = f"""
-MACHINE LEARNING ANALYSIS REPORT
-{'='*40}
-MODEL CONFIGURATION
-{'-'*20}
-• Problem Type: {self.ml_results['model_type']}
-• Target Variable: {self.ml_results['target_column']}
-• Number of Features: {len(self.ml_results['feature_columns'])}
-• Features Used: {', '.join(self.ml_results['feature_columns'])}
-• Test Set Size: {self.ml_results['test_size']} samples
-MODEL PERFORMANCE RESULTS
-{'-'*27}
-"""
-        for model_name, metrics in self.ml_results['results'].items():
-            ml_report += f"\n{model_name}:\n"
-            for metric, value in metrics.items():
-                if metric != 'feature_importance':
-                    ml_report += f"  • {metric}: {value:.6f}\n"
-        # Add feature importance for best model
-        best_model = max(self.ml_results['results'].keys(), key=lambda x:
-            list(self.ml_results['results'][x].values())[0] if isinstance(list(self.ml_results['results'][x].values())[0], (int, float)) else 0
-        )
-        if 'feature_importance' in self.ml_results['results'][best_model]:
-            ml_report += f"""
-FEATURE IMPORTANCE ANALYSIS ({best_model})
-{'-'*35}
-"""
-            importance_df = self.ml_results['results'][best_model]['feature_importance']
-            for _, row in importance_df.head(10).iterrows():
-                ml_report += f"• {row['feature']}: {row['importance']:.6f}\n"
-        ml_report += f"""
-RECOMMENDATIONS
-{'-'*15}
-"""
-        if self.ml_results['model_type'] == "Regression":
-            best_score = max([metrics.get('R² Score', 0) for metrics in self.ml_results['results'].values()])
-            if best_score > 0.8:
-                ml_report += "• Excellent model performance - ready for production use\n"
-            elif best_score > 0.6:
-                ml_report += "• Good model performance - consider feature engineering for improvement\n"
-            else:
-                ml_report += "• Model performance needs improvement - try advanced algorithms or more features\n"
-        else:
-            best_score = max([metrics.get('Accuracy', 0) for metrics in self.ml_results['results'].values()])
-            if best_score > 0.9:
-                ml_report += "• Excellent classification accuracy - model ready for deployment\n"
-            elif best_score > 0.8:
-                ml_report += "• Good classification performance - minor optimizations recommended\n"
-            else:
-                ml_report += "• Classification accuracy needs improvement - consider ensemble methods\n"
-        ml_report += f"""
-{'-'*40}
-ML Report generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
-"""
-        return ml_report

 import pandas as pd
 import numpy as np
+import streamlit as st
+from typing import Dict, List, Any, Optional, Tuple
+import warnings
+warnings.filterwarnings('ignore')
+# Machine Learning imports
 try:
     from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+    from sklearn.linear_model import LinearRegression, LogisticRegression
+    from sklearn.model_selection import train_test_split, cross_val_score
+    from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
+    from sklearn.preprocessing import StandardScaler, LabelEncoder
     ML_AVAILABLE = True
 except ImportError:
     ML_AVAILABLE = False
+    st.warning("⚠️ Machine Learning libraries not available. Please install scikit-learn for ML features.")
class DataAnalyzer:
    """Descriptive statistics, correlation screening and baseline ML for one DataFrame.

    Every ``run_*`` method returns a plain ``dict``; the basic, correlation
    and ML analyses also cache their result in ``self.results`` under
    ``'basic_analysis'``, ``'correlation_analysis'`` and ``'ml_analysis'``.
    Unexpected failures are reported through ``st.error`` and turned into an
    empty dict or an ``{'error': ...}`` dict instead of propagating.
    """

    def __init__(self, df: pd.DataFrame):
        """Store a copy of ``df`` and split its columns by dtype.

        Args:
            df: Data to analyse. A copy is kept, so the caller's frame is
                never modified by this object.
        """
        self.df = df.copy()
        self.numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
        self.results = {}

    def run_basic_analysis(self) -> Dict[str, Any]:
        """Compute dataset-level and per-column descriptive statistics.

        Returns:
            A dict with ``'dataset_info'``, ``'missing_data'`` and
            ``'data_types'``, plus ``'numeric_analysis'`` and
            ``'categorical_analysis'`` when columns of that kind exist.
            A column whose statistics cannot be computed maps to
            ``{'error': ...}``. Returns ``{}`` if the analysis as a whole
            fails.
        """
        try:
            n_rows, n_cols = self.df.shape
            missing_per_column = self.df.isnull().sum()
            total_missing = int(missing_per_column.sum())
            total_cells = n_rows * n_cols
            # An empty frame has no cells; report 0% rather than NaN.
            missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0

            analysis: Dict[str, Any] = {
                'dataset_info': {
                    'rows': n_rows,
                    'columns': n_cols,
                    'memory_usage_mb': self.df.memory_usage(deep=True).sum() / (1024**2),
                },
                'missing_data': {
                    'total_missing': total_missing,
                    'missing_percentage': float(missing_pct),
                    'columns_with_missing': missing_per_column[missing_per_column > 0].to_dict(),
                },
                'data_types': {
                    str(dtype): int(count)
                    for dtype, count in self.df.dtypes.value_counts().items()
                },
            }

            if self.numeric_cols:
                numeric_analysis = {}
                for col in self.numeric_cols:
                    series = self.df[col]
                    try:
                        numeric_analysis[col] = {
                            'mean': float(series.mean()),
                            'median': float(series.median()),
                            'std': float(series.std()),
                            'min': float(series.min()),
                            'max': float(series.max()),
                            'skewness': float(series.skew()),
                            'kurtosis': float(series.kurtosis()),
                        }
                    except Exception:
                        # Best effort per column: one bad column must not
                        # abort the whole report.
                        numeric_analysis[col] = {'error': 'Could not calculate statistics'}
                analysis['numeric_analysis'] = numeric_analysis

            if self.categorical_cols:
                categorical_analysis = {}
                for col in self.categorical_cols:
                    series = self.df[col]
                    try:
                        modes = series.mode()
                        counts = series.value_counts()
                        categorical_analysis[col] = {
                            'unique_values': int(series.nunique()),
                            'most_frequent': str(modes.iloc[0]) if not modes.empty else 'None',
                            'most_frequent_count': int(counts.iloc[0]) if len(counts) > 0 else 0,
                        }
                    except Exception:
                        categorical_analysis[col] = {'error': 'Could not calculate statistics'}
                analysis['categorical_analysis'] = categorical_analysis

            self.results['basic_analysis'] = analysis
            return analysis
        except Exception as e:
            st.error(f"Error in basic analysis: {str(e)}")
            return {}

    def run_correlation_analysis(self, threshold: float = 0.7) -> Dict[str, Any]:
        """Correlate the numeric columns and list the strongly related pairs.

        Args:
            threshold: A pair is reported when the absolute value of its
                correlation coefficient is strictly greater than this.

        Returns:
            ``{'message': ...}`` when fewer than two numeric columns exist.
            Otherwise a dict with ``'correlation_matrix'`` (nested dict),
            ``'strong_correlations'`` (one dict per reported pair) and
            ``'total_pairs'`` (the number of reported pairs). Returns ``{}``
            on unexpected failure.
        """
        try:
            if len(self.numeric_cols) < 2:
                return {'message': 'Need at least 2 numeric columns for correlation analysis'}

            correlation_matrix = self.df[self.numeric_cols].corr()
            columns = list(correlation_matrix.columns)

            strong_correlations = []
            # Visit only the upper triangle so each pair is reported once.
            for i, first in enumerate(columns):
                for j in range(i + 1, len(columns)):
                    corr_value = correlation_matrix.iloc[i, j]
                    if pd.isna(corr_value) or abs(corr_value) <= threshold:
                        continue
                    strong_correlations.append({
                        'variable_1': first,
                        'variable_2': columns[j],
                        'correlation': float(corr_value),
                        'strength': 'Strong Positive' if corr_value > 0 else 'Strong Negative',
                    })

            analysis = {
                'correlation_matrix': correlation_matrix.to_dict(),
                'strong_correlations': strong_correlations,
                'total_pairs': len(strong_correlations),
            }
            self.results['correlation_analysis'] = analysis
            return analysis
        except Exception as e:
            st.error(f"Error in correlation analysis: {str(e)}")
            return {}

    @staticmethod
    def _score_predictions(y_true, y_pred, is_classification: bool) -> Dict[str, Any]:
        """Build the metric dict for one fitted model's test-set predictions."""
        if is_classification:
            return {
                'accuracy': float(accuracy_score(y_true, y_pred)),
                'type': 'classification',
            }
        mse = mean_squared_error(y_true, y_pred)
        return {
            'r2_score': float(r2_score(y_true, y_pred)),
            'mse': float(mse),
            'rmse': float(np.sqrt(mse)),
            'type': 'regression',
        }

    def run_ml_analysis(self, target_column: str) -> Dict[str, Any]:
        """Fit two baseline models that predict ``target_column``.

        The numeric columns other than the target are used as features, rows
        with missing values are dropped, and 20% of the remaining rows are
        held out for scoring. The task is treated as classification when the
        target has fewer than 10 distinct values and is either of object
        dtype or has at most 5 distinct values; otherwise as regression.

        Args:
            target_column: Name of the column to predict.

        Returns:
            A dict mapping model name to its metrics (``'accuracy'`` for
            classification; ``'r2_score'``, ``'mse'``, ``'rmse'`` for
            regression; always ``'type'``), or to ``{'error': ...}`` when
            that model failed. Returns a top-level ``{'error': ...}`` when
            the analysis cannot be run at all.
        """
        if not ML_AVAILABLE:
            return {'error': 'Machine learning libraries not available'}
        try:
            if target_column not in self.df.columns:
                return {'error': f"Target column '{target_column}' not found in dataset"}

            features = [col for col in self.numeric_cols if col != target_column]
            if not features:
                return {'error': 'Not enough features for ML analysis'}

            ml_data = self.df[features + [target_column]].dropna()
            if len(ml_data) < 10:
                return {'error': 'Not enough data points for ML analysis'}

            X = ml_data[features]
            y = ml_data[target_column]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Fit the scaler on the training split only, so no test-set
            # statistics leak into training.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            n_unique = y.nunique()
            is_classification = n_unique < 10 and (y.dtype == 'object' or n_unique <= 5)

            # (name, estimator, needs_scaling): the linear models get the
            # standardised features, the tree ensembles the raw ones.
            if is_classification:
                candidates = [
                    ('Random Forest Classifier',
                     RandomForestClassifier(n_estimators=100, random_state=42), False),
                    ('Logistic Regression',
                     LogisticRegression(random_state=42, max_iter=1000), True),
                ]
            else:
                candidates = [
                    ('Random Forest Regressor',
                     RandomForestRegressor(n_estimators=100, random_state=42), False),
                    ('Linear Regression', LinearRegression(), True),
                ]

            results = {}
            for name, model, needs_scaling in candidates:
                if needs_scaling:
                    train_X, test_X = X_train_scaled, X_test_scaled
                else:
                    train_X, test_X = X_train, X_test
                try:
                    model.fit(train_X, y_train)
                    y_pred = model.predict(test_X)
                    results[name] = self._score_predictions(y_test, y_pred, is_classification)
                except Exception as e:
                    # One failing model must not hide the other's result.
                    results[name] = {'error': str(e)}

            self.results['ml_analysis'] = results
            return results
        except Exception as e:
            st.error(f"Error in ML analysis: {str(e)}")
            return {'error': str(e)}

    def generate_insights(self) -> Dict[str, Any]:
        """Turn the basic and correlation analyses into readable findings.

        Re-runs ``run_basic_analysis`` and ``run_correlation_analysis``.

        Returns:
            A dict of string lists: ``'data_summary'`` (when the basic
            analysis succeeded), ``'correlation_insights'`` (when a
            correlation matrix could be computed), ``'data_quality'`` and
            ``'recommendations'``. Returns ``{'error': ...}`` on unexpected
            failure.
        """
        try:
            insights = {}

            basic = self.run_basic_analysis()
            if basic:
                info = basic['dataset_info']
                insights['data_summary'] = [
                    f"Dataset contains {info['rows']:,} rows and {info['columns']} columns",
                    f"Memory usage: {info['memory_usage_mb']:.1f} MB",
                    f"Missing data: {basic['missing_data']['missing_percentage']:.1f}% of total cells",
                ]

            correlation = self.run_correlation_analysis()
            if correlation and 'strong_correlations' in correlation:
                strong = correlation['strong_correlations']
                if strong:
                    # Only the first five pairs are surfaced.
                    insights['correlation_insights'] = [
                        f"{corr['variable_1']} and {corr['variable_2']} are strongly correlated (r={corr['correlation']:.3f})"
                        for corr in strong[:5]
                    ]
                else:
                    insights['correlation_insights'] = ["No strong correlations found between numeric variables"]

            quality_insights = []
            if basic and basic['missing_data']['total_missing'] > 0:
                quality_insights.append(f"Found {basic['missing_data']['total_missing']} missing values")
                if basic['missing_data']['missing_percentage'] > 10:
                    quality_insights.append("⚠️ High percentage of missing data may affect analysis quality")
            duplicates = int(self.df.duplicated().sum())
            if duplicates > 0:
                quality_insights.append(f"Found {duplicates} duplicate rows")
            if not quality_insights:
                quality_insights.append("✅ Data quality looks good - no major issues detected")
            insights['data_quality'] = quality_insights

            recommendations = []
            if basic and basic['missing_data']['missing_percentage'] > 5:
                recommendations.append("Consider handling missing values before analysis")
            if len(self.numeric_cols) < 2:
                recommendations.append("Add more numeric columns for better analysis capabilities")
            if self.df.shape[0] < 100:
                recommendations.append("Consider collecting more data points for robust analysis")
            if not recommendations:
                recommendations.append("Dataset is ready for comprehensive analysis")
            insights['recommendations'] = recommendations

            return insights
        except Exception as e:
            st.error(f"Error generating insights: {str(e)}")
            return {'error': str(e)}

    def get_summary_statistics(self) -> Dict[str, Any]:
        """Collect shape, schema, missing counts and per-kind column statistics.

        Returns:
            A dict with ``'shape'``, ``'columns'``, ``'dtypes'``,
            ``'missing_values'`` and ``'memory_usage'`` (in MB), plus
            ``'numeric_stats'`` (the ``describe()`` table as a dict) and
            ``'categorical_stats'`` (unique count and five most common
            values per column) when columns of that kind exist. Returns
            ``{}`` on unexpected failure.
        """
        try:
            summary = {
                'shape': self.df.shape,
                'columns': self.df.columns.tolist(),
                'dtypes': self.df.dtypes.to_dict(),
                'missing_values': self.df.isnull().sum().to_dict(),
                'memory_usage': self.df.memory_usage(deep=True).sum() / (1024**2),  # MB
            }

            if self.numeric_cols:
                summary['numeric_stats'] = self.df[self.numeric_cols].describe().to_dict()

            if self.categorical_cols:
                summary['categorical_stats'] = {
                    col: {
                        'unique_count': self.df[col].nunique(),
                        'top_values': self.df[col].value_counts().head(5).to_dict(),
                    }
                    for col in self.categorical_cols
                }

            return summary
        except Exception as e:
            st.error(f"Error getting summary statistics: {str(e)}")
            return {}