Spaces:

entropy25
/

data-analysis-platform

Sleeping

App Files Files Community

entropy25 committed on Aug 9, 2025

Commit

c50f214

verified ·

1 Parent(s): aa64ef2

Update analyzer.py

Browse files

Files changed (1) hide show

analyzer.py +483 -1261

analyzer.py CHANGED Viewed

@@ -5,7 +5,6 @@ import plotly.express as px
 import plotly.graph_objects as go
 from typing import Dict, List, Any, Optional
 import os
-import logging
 from dotenv import load_dotenv
 from data_handler import *
 from io import BytesIO
@@ -13,42 +12,29 @@ from io import BytesIO
 # Load environment variables
 load_dotenv()
-# Configure logging
-logger = logging.getLogger(__name__)
-# Optional AI Integration with enhanced error handling
 try:
     import openai
     OPENAI_AVAILABLE = True
 except ImportError:
     OPENAI_AVAILABLE = False
-    logger.info("OpenAI not available - install openai package for AI features")
 try:
     import google.generativeai as genai
     GEMINI_AVAILABLE = True
 except ImportError:
     GEMINI_AVAILABLE = False
-    logger.info("Gemini not available - install google-generativeai package for AI features")
 class AIAssistant:
-    """Enhanced AI-powered analysis assistant with better error handling"""
     def __init__(self):
         self.openai_key = os.getenv('OPENAI_API_KEY')
         self.gemini_key = os.getenv('GOOGLE_API_KEY')
-        self.setup_models()
-    def setup_models(self):
-        """Initialize AI models with error handling"""
-        try:
-            if self.gemini_key and GEMINI_AVAILABLE:
-                genai.configure(api_key=self.gemini_key)
-                self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
-                logger.info("Gemini model initialized successfully")
-        except Exception as e:
-            logger.error(f"Failed to initialize Gemini: {str(e)}")
-            self.gemini_key = None
     def get_available_models(self) -> List[str]:
         """Get list of available AI models"""
@@ -60,1412 +46,648 @@ class AIAssistant:
         return models
     def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
-        """Get AI analysis with enhanced error handling and rate limiting"""
-        if not insights:
-            return "No insights available for analysis. Please complete the data analysis stages first."
         try:
-            # Prepare concise data summary
-            summary = self._prepare_data_summary(df, insights)
-            prompt = self._create_analysis_prompt(summary)
             if model == "Google Gemini" and hasattr(self, 'gemini_model'):
                 response = self.gemini_model.generate_content(prompt)
-                return self._format_ai_response(response.text)
-            elif model == "OpenAI GPT" and self.openai_key and OPENAI_AVAILABLE:
                 client = openai.OpenAI(api_key=self.openai_key)
                 response = client.chat.completions.create(
                     model="gpt-3.5-turbo",
-                    messages=[{"role": "user", "content": prompt}],
-                    max_tokens=800,
-                    temperature=0.7
                 )
-                return self._format_ai_response(response.choices[0].message.content)
             else:
-                return "❌ AI analysis not available. Please check your API configuration."
         except Exception as e:
-            error_msg = f"AI Analysis Error: {str(e)}"
-            logger.error(error_msg)
-            return f"❌ {error_msg}\n\n💡 Try checking your API keys or internet connection."
-    def _prepare_data_summary(self, df: pd.DataFrame, insights: List[Dict]) -> str:
-        """Prepare concise data summary for AI analysis"""
-        summary = f"""Dataset: {df.shape[0]} rows × {df.shape[1]} columns
-Data Types: {dict(df.dtypes.value_counts())}
-Missing Data: {df.isnull().sum().sum()} cells
-Key Findings:"""
-        for insight in insights[-5:]:  # Last 5 insights
-            summary += f"\n• {insight['insight']}"
-        return summary
-    def _create_analysis_prompt(self, summary: str) -> str:
-        """Create optimized prompt for AI analysis"""
-        return f"""As a data scientist, provide a brief analysis focusing on:
-1. **Business Impact**: What do these findings mean?
-2. **Recommendations**: 2-3 actionable next steps
-3. **Risks**: Potential data quality concerns
-{summary}
-Keep response under 300 words and focus on actionable insights."""
-    def _format_ai_response(self, response: str) -> str:
-        """Format AI response for better readability"""
-        if not response:
-            return "No response received from AI model."
-        # Clean up response
-        formatted = response.strip()
-        # Add emoji headers if not present
-        if "Business Impact" in formatted and "🎯" not in formatted:
-            formatted = formatted.replace("Business Impact", "🎯 **Business Impact**")
-        if "Recommendations" in formatted and "💡" not in formatted:
-            formatted = formatted.replace("Recommendations", "💡 **Recommendations**")
-        if "Risks" in formatted and "⚠️" not in formatted:
-            formatted = formatted.replace("Risks", "⚠️ **Risks**")
-        return formatted
 class DataAnalysisWorkflow:
-    """Enhanced data analysis workflow with improved UX and error handling"""
     def __init__(self, df: pd.DataFrame):
         self.df = df
-        self.original_df = df.copy()  # Keep original for rollback
         self.stats = calculate_basic_stats(df)
         self.column_types = get_column_types(df)
         self.insights = []
-        self.page_size = 1000
-        self.cleaning_history = []
-        # Validate data on initialization
-        is_valid, validation_issues = validate_dataframe(df)
-        if not is_valid:
-            for issue in validation_issues:
-                self.add_insight(f"Data validation issue: {issue}", 0)
-    def add_insight(self, insight: str, stage: int, insight_type: str = "info"):
-        """Enhanced insight tracking with types"""
         self.insights.append({
             'stage': stage,
             'insight': insight,
-            'type': insight_type,
             'timestamp': pd.Timestamp.now()
         })
     def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
-        """Get paginated data with validation"""
-        try:
-            start_idx = page * self.page_size
-            end_idx = min(start_idx + self.page_size, len(self.df))
-            return self.df.iloc[start_idx:end_idx]
-        except Exception as e:
-            logger.error(f"Pagination error: {str(e)}")
-            return self.df.head(10)
     def stage_1_overview(self):
-        """Stage 1: Enhanced Data Overview with better UX"""
         st.subheader("📊 Data Overview")
-        # Help section
-        with st.expander("ℹ️ Help - Understanding Your Data", expanded=False):
-            st.markdown("""
-            **This stage provides:**
-            - Basic dataset statistics and structure
-            - Data quality assessment and scoring
-            - Memory usage analysis and optimization suggestions
-            - Column type classification and cardinality analysis
-            """)
-        # Data Quality Score with enhanced display
         quality_metrics = calculate_data_quality_score(self.df)
         col1, col2, col3, col4 = st.columns(4)
         with col1:
-            st.metric("Rows", f"{self.stats['shape'][0]:,}", help="Total number of records")
         with col2:
-            st.metric("Columns", f"{self.stats['shape'][1]:,}", help="Total number of features")
         with col3:
-            score_color = "normal" if quality_metrics['score'] >= 80 else "inverse"
-            st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100",
-                     help="Overall data quality assessment")
         with col4:
-            grade_emoji = {"A+": "🌟", "A": "✅", "B+": "👍", "B": "👌", "C+": "⚠️", "C": "⚠️", "D": "❌", "F": "💥"}
-            st.metric("Grade", f"{grade_emoji.get(quality_metrics['grade'], '❓')} {quality_metrics['grade']}")
-        # Quality Issues and Recommendations
         if quality_metrics['issues']:
-            st.error("🚨 **Data Quality Issues Found:**")
             for issue in quality_metrics['issues']:
                 st.write(f"• {issue}")
-        if quality_metrics.get('recommendations'):
-            st.info("💡 **Recommendations:**")
-            for rec in quality_metrics['recommendations']:
-                st.write(f"• {rec}")
-        # Memory Analysis with actionable insights
-        st.subheader("💾 Memory Analysis")
         memory_opt = calculate_memory_optimization(self.df)
-        col1, col2, col3 = st.columns(3)
         with col1:
             st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
         with col2:
             if memory_opt['potential_savings_mb'] > 0:
                 st.metric("Potential Savings",
                          f"{memory_opt['potential_savings_mb']:.1f} MB",
-                         f"-{memory_opt['potential_savings_pct']:.1f}%")
-        with col3:
-            efficiency = 100 - memory_opt['potential_savings_pct']
-            st.metric("Memory Efficiency", f"{efficiency:.1f}%")
-        if memory_opt['suggestions']:
-            with st.expander("🔧 View Optimization Suggestions", expanded=False):
-                st.dataframe(pd.DataFrame(memory_opt['suggestions']), use_container_width=True)
-                st.info("💡 Converting object columns to categories can significantly reduce memory usage for repeated values.")
-        # Enhanced Column Analysis
-        st.subheader("📋 Column Analysis")
         cardinality_df = calculate_column_cardinality(self.df)
-        if not cardinality_df.empty:
-            # Interactive filters
-            col1, col2 = st.columns(2)
-            with col1:
-                col_types = cardinality_df['Type'].unique()
-                selected_types = st.multiselect("Filter by Cardinality Type",
-                                              col_types,
-                                              default=col_types,
-                                              help="Filter columns by their cardinality classification")
-            with col2:
-                data_types = cardinality_df['Data Type'].unique()
-                selected_data_types = st.multiselect("Filter by Data Type",
-                                                   data_types,
-                                                   default=data_types,
-                                                   help="Filter columns by their pandas data type")
-            # Apply filters
-            filtered_df = cardinality_df[
-                (cardinality_df['Type'].isin(selected_types)) &
-                (cardinality_df['Data Type'].isin(selected_data_types))
-            ]
-            st.dataframe(filtered_df, use_container_width=True)
-            # Actionable insights
-            self._display_cardinality_insights(filtered_df)
-        # Data Types Visualization
-        if self.stats['dtypes']:
-            col1, col2 = st.columns(2)
-            with col1:
-                st.subheader("📊 Data Types Distribution")
-                fig = px.pie(values=list(self.stats['dtypes'].values()),
-                            names=list(self.stats['dtypes'].keys()),
-                            title="Data Types Distribution")
-                fig.update_traces(textposition='inside', textinfo='percent+label')
-                st.plotly_chart(fig, use_container_width=True)
-            with col2:
-                st.subheader("📈 Column Count by Type")
-                fig = px.bar(x=list(self.stats['dtypes'].keys()),
-                           y=list(self.stats['dtypes'].values()),
-                           title="Column Count by Data Type")
-                st.plotly_chart(fig, use_container_width=True)
-        # Enhanced Sample Data Display
-        self._display_sample_data()
-        # Missing Values Analysis
-        self._analyze_missing_values()
-        # Record insights
-        self._record_stage1_insights(quality_metrics, memory_opt, cardinality_df)
-    def _display_cardinality_insights(self, cardinality_df: pd.DataFrame):
-        """Display actionable insights from cardinality analysis"""
-        if cardinality_df.empty:
-            return
-        # Key findings
-        id_cols = cardinality_df[cardinality_df['Type'] == 'Unique Identifier']['Column'].tolist()
-        const_cols = cardinality_df[cardinality_df['Type'] == 'Constant']['Column'].tolist()
-        low_card_cols = cardinality_df[cardinality_df['Type'].str.contains('Low')]['Column'].tolist()
         if id_cols:
-            st.success(f"🔑 **Potential ID Columns:** {', '.join(id_cols[:3])}" +
-                      (f" (+{len(id_cols)-3} more)" if len(id_cols) > 3 else ""))
         if const_cols:
-            st.warning(f"⚠️ **Constant Columns (consider removing):** {', '.join(const_cols[:3])}" +
-                      (f" (+{len(const_cols)-3} more)" if len(const_cols) > 3 else ""))
-        if low_card_cols:
-            st.info(f"📊 **Good for Grouping/Filtering:** {', '.join(low_card_cols[:3])}" +
-                   (f" (+{len(low_card_cols)-3} more)" if len(low_card_cols) > 3 else ""))
-    def _display_sample_data(self):
-        """Enhanced sample data display with pagination"""
-        st.subheader("👀 Sample Data")
         total_pages = (len(self.df) - 1) // self.page_size + 1
-        col1, col2, col3 = st.columns([2, 1, 1])
-        with col1:
-            if total_pages > 1:
-                page = st.slider("Page", 0, total_pages - 1, 0,
-                               help=f"Navigate through {total_pages} pages of data")
-                sample_data = self.get_paginated_data(page)
-                start_row = page * self.page_size + 1
-                end_row = min((page + 1) * self.page_size, len(self.df))
-                st.caption(f"Showing rows {start_row:,} to {end_row:,} of {len(self.df):,}")
-            else:
-                sample_data = self.df.head(20)
-                page = 0
-        with col2:
-            show_dtypes = st.checkbox("Show Data Types", help="Display column data types")
-        with col3:
-            max_cols = st.number_input("Max Columns", min_value=5, max_value=50, value=10,
-                                     help="Limit displayed columns for better readability")
-        # Display data with optional type info
-        display_df = sample_data.iloc[:, :max_cols]
-        if show_dtypes:
-            # Create a summary row with data types
-            type_row = pd.DataFrame([display_df.dtypes.astype(str)],
-                                  index=['Data Type'])
-            type_row.columns = display_df.columns
-            st.dataframe(type_row, use_container_width=True)
-            st.dataframe(display_df, use_container_width=True)
-        else:
-            st.dataframe(display_df, use_container_width=True)
-    def _analyze_missing_values(self):
-        """Enhanced missing values analysis"""
         missing_df = calculate_missing_data(self.df)
         if not missing_df.empty:
-            st.subheader("🕳️ Missing Values Analysis")
-            # Summary metrics
-            total_missing = missing_df['Missing Count'].sum()
-            affected_cols = len(missing_df)
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("Total Missing", f"{total_missing:,}")
-            with col2:
-                st.metric("Affected Columns", affected_cols)
-            with col3:
-                worst_col_pct = missing_df.iloc[0]['Missing %'] if len(missing_df) > 0 else 0
-                st.metric("Worst Column", f"{worst_col_pct:.1f}%")
-            # Detailed table
             st.dataframe(missing_df, use_container_width=True)
-            # Visualization for top missing columns
-            if len(missing_df) > 1:
-                top_missing = missing_df.head(10)
-                fig = px.bar(top_missing, x='Column', y='Missing %',
-                           title="Missing Values by Column",
-                           color='Missing %',
-                           color_continuous_scale='Reds')
-                fig.update_layout(xaxis_tickangle=-45)
-                st.plotly_chart(fig, use_container_width=True)
-            # Actionable recommendations
-            high_missing = missing_df[missing_df['Missing %'] > 50]
-            if not high_missing.empty:
-                st.error(f"⚠️ **Critical:** {len(high_missing)} columns have >50% missing data")
-                st.write("Consider removing these columns or investigating data collection issues.")
         else:
-            st.success("✅ **Excellent!** No missing values found in the dataset")
-    def _record_stage1_insights(self, quality_metrics, memory_opt, cardinality_df):
-        """Record insights from stage 1 analysis"""
-        # Quality insights
-        if quality_metrics['score'] >= 90:
-            self.add_insight("Excellent data quality detected", 1, "success")
-        elif quality_metrics['score'] < 70:
-            self.add_insight(f"Data quality needs attention (Score: {quality_metrics['score']:.1f}/100)", 1, "warning")
-        # Memory insights
         if memory_opt['potential_savings_pct'] > 20:
-            self.add_insight(f"Significant memory optimization opportunity: {memory_opt['potential_savings_pct']:.1f}%", 1, "info")
-        # Structure insights
-        if not cardinality_df.empty:
-            id_cols = len(cardinality_df[cardinality_df['Type'] == 'Unique Identifier'])
-            const_cols = len(cardinality_df[cardinality_df['Type'] == 'Constant'])
-            if id_cols > 0:
-                self.add_insight(f"Found {id_cols} potential identifier column(s)", 1, "info")
-            if const_cols > 0:
-                self.add_insight(f"Found {const_cols} constant column(s) - consider removal", 1, "warning")
     def stage_2_exploration(self):
-        """Stage 2: Enhanced Exploratory Data Analysis"""
         st.subheader("🔍 Exploratory Data Analysis")
-        with st.expander("ℹ️ Help - Exploratory Analysis", expanded=False):
-            st.markdown("""
-            **This stage helps you:**
-            - Understand distributions of your variables
-            - Identify patterns and relationships
-            - Spot potential anomalies or interesting features
-            - Guide further analysis decisions
-            """)
         numeric_cols = self.column_types['numeric']
         categorical_cols = self.column_types['categorical']
-        if not numeric_cols and not categorical_cols:
-            st.warning("⚠️ No suitable columns found for analysis. Please check your data types.")
-            return
-        # Enhanced Numeric Analysis
         if numeric_cols:
-            self._analyze_numeric_variables(numeric_cols)
-        # Enhanced Categorical Analysis
-        if categorical_cols:
-            self._analyze_categorical_variables(categorical_cols)
-        # Relationship Analysis
-        self._analyze_relationships(numeric_cols, categorical_cols)
-    def _analyze_numeric_variables(self, numeric_cols: List[str]):
-        """Enhanced numeric variable analysis"""
-        st.subheader("🔢 Numeric Variables Analysis")
-        col1, col2 = st.columns([1, 1])
-        with col1:
-            selected_numeric = st.selectbox("Select numeric column:", numeric_cols,
-                                          help="Choose a numeric column to analyze its distribution")
-        with col2:
-            chart_type = st.selectbox("Chart type:", ["Histogram", "Box Plot", "Violin Plot", "Q-Q Plot"])
-        if selected_numeric:
-            # Statistics summary
-            stats_dict = calculate_numeric_stats(self.df, selected_numeric)
-            if stats_dict:
-                col1, col2, col3, col4 = st.columns(4)
-                with col1:
-                    st.metric("Mean", f"{stats_dict['mean']:.2f}")
-                with col2:
-                    st.metric("Median", f"{stats_dict['median']:.2f}")
-                with col3:
-                    st.metric("Std Dev", f"{stats_dict['std']:.2f}")
-                with col4:
-                    skew_interpretation = "Right-skewed" if stats_dict['skewness'] > 0.5 else "Left-skewed" if stats_dict['skewness'] < -0.5 else "Symmetric"
-                    st.metric("Skewness", f"{stats_dict['skewness']:.2f}", help=skew_interpretation)
-            # Enhanced visualizations
-            try:
-                col1, col2 = st.columns(2)
-                with col1:
-                    if chart_type == "Histogram":
-                        fig = px.histogram(self.df, x=selected_numeric,
-                                         title=f"Distribution of {selected_numeric}",
-                                         marginal="rug")
-                    elif chart_type == "Box Plot":
-                        fig = px.box(self.df, y=selected_numeric,
-                                   title=f"Box Plot of {selected_numeric}")
-                    elif chart_type == "Violin Plot":
-                        fig = px.violin(self.df, y=selected_numeric,
-                                      title=f"Violin Plot of {selected_numeric}")
-                    else:  # Q-Q Plot
-                        from scipy import stats
-                        qq_data = stats.probplot(self.df[selected_numeric].dropna(), dist="norm")
-                        fig = go.Figure()
-                        fig.add_scatter(x=qq_data[0][0], y=qq_data[0][1], mode='markers',
-                                      name='Data Points')
-                        fig.add_scatter(x=qq_data[0][0], y=qq_data[1][1] + qq_data[1][0] * qq_data[0][0],
-                                      mode='lines', name='Normal Distribution')
-                        fig.update_layout(title=f"Q-Q Plot of {selected_numeric}",
-                                        xaxis_title="Theoretical Quantiles",
-                                        yaxis_title="Sample Quantiles")
-                    st.plotly_chart(fig, use_container_width=True)
-                with col2:
-                    # Summary statistics table
-                    if stats_dict:
-                        summary_data = {
-                            'Statistic': ['Count', 'Mean', 'Median', 'Std Dev', 'Min', 'Max', 'Q25', 'Q75', 'Skewness', 'Kurtosis'],
-                            'Value': [
-                                len(self.df[selected_numeric].dropna()),
-                                f"{stats_dict['mean']:.3f}",
-                                f"{stats_dict['median']:.3f}",
-                                f"{stats_dict['std']:.3f}",
-                                f"{stats_dict['min']:.3f}",
-                                f"{stats_dict['max']:.3f}",
-                                f"{stats_dict['q25']:.3f}",
-                                f"{stats_dict['q75']:.3f}",
-                                f"{stats_dict['skewness']:.3f}",
-                                f"{stats_dict['kurtosis']:.3f}"
-                            ]
-                        }
-                        st.dataframe(pd.DataFrame(summary_data), use_container_width=True, hide_index=True)
-                # Distribution insights
-                if abs(stats_dict['skewness']) > 1:
-                    skew_type = "highly right-skewed" if stats_dict['skewness'] > 1 else "highly left-skewed"
-                    self.add_insight(f"{selected_numeric} is {skew_type} (skewness: {stats_dict['skewness']:.2f})", 2, "info")
-                if stats_dict['kurtosis'] > 3:
-                    self.add_insight(f"{selected_numeric} has heavy tails (kurtosis: {stats_dict['kurtosis']:.2f})", 2, "info")
-            except Exception as e:
-                st.error(f"Error creating visualization: {str(e)}")
-                logger.error(f"Visualization error for {selected_numeric}: {str(e)}")
-    def _analyze_categorical_variables(self, categorical_cols: List[str]):
-        """Enhanced categorical variable analysis"""
-        st.subheader("📝 Categorical Variables Analysis")
-        selected_categorical = st.selectbox("Select categorical column:", categorical_cols,
-                                          help="Choose a categorical column to analyze its distribution")
-        if selected_categorical:
-            try:
-                # Get value counts with error handling
-                value_counts = get_value_counts(self.df, selected_categorical, top_n=20)
-                if value_counts is not None and not value_counts.empty:
-                    total_categories = self.df[selected_categorical].nunique()
-                    # Summary metrics
-                    col1, col2, col3 = st.columns(3)
-                    with col1:
-                        st.metric("Total Categories", total_categories)
-                    with col2:
-                        top_category_pct = (value_counts.iloc[0] / len(self.df)) * 100
-                        st.metric("Top Category", f"{top_category_pct:.1f}%")
-                    with col3:
-                        entropy = -sum((value_counts / value_counts.sum()) * np.log2(value_counts / value_counts.sum() + 1e-10))
-                        st.metric("Diversity (Entropy)", f"{entropy:.2f}")
-                    # Visualization
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        fig = px.bar(x=value_counts.index, y=value_counts.values,
-                                   title=f"Top {min(20, len(value_counts))} Values in {selected_categorical}")
-                        fig.update_layout(xaxis_tickangle=-45)
-                        st.plotly_chart(fig, use_container_width=True)
-                    with col2:
-                        # Show data table
-                        display_data = pd.DataFrame({
-                            'Category': value_counts.index,
-                            'Count': value_counts.values,
-                            'Percentage': np.round((value_counts.values / len(self.df)) * 100, 2)
-                        })
-                        st.dataframe(display_data, use_container_width=True, hide_index=True)
-                    # Insights
-                    if total_categories > 100:
-                        self.add_insight(f"{selected_categorical} has very high cardinality ({total_categories} categories)", 2, "warning")
-                    elif top_category_pct > 90:
-                        self.add_insight(f"{selected_categorical} is highly imbalanced (top category: {top_category_pct:.1f}%)", 2, "warning")
-                else:
-                    st.warning(f"⚠️ Unable to analyze column '{selected_categorical}' - it may be empty or have issues")
-            except Exception as e:
-                st.error(f"Error analyzing categorical variable: {str(e)}")
-                logger.error(f"Categorical analysis error for {selected_categorical}: {str(e)}")
-    def _analyze_relationships(self, numeric_cols: List[str], categorical_cols: List[str]):
-        """Enhanced relationship analysis"""
-        if len(numeric_cols) >= 2:
-            st.subheader("🔗 Variable Relationships")
-            # Correlation matrix
-            corr_matrix = calculate_correlation_matrix(self.df)
-            if corr_matrix is not None and not corr_matrix.empty:
-                col1, col2 = st.columns(2)
-                with col1:
-                    fig = px.imshow(corr_matrix,
-                                  text_auto=True,
-                                  aspect="auto",
-                                  title="Correlation Matrix",
-                                  color_continuous_scale='RdBu')
                     st.plotly_chart(fig, use_container_width=True)
-                with col2:
-                    # Find strongest correlations
-                    corr_pairs = []
                     for i in range(len(corr_matrix.columns)):
                         for j in range(i+1, len(corr_matrix.columns)):
-                            col1_name = corr_matrix.columns[i]
-                            col2_name = corr_matrix.columns[j]
-                            corr_val = corr_matrix.iloc[i, j]
-                            if not np.isnan(corr_val):
-                                corr_pairs.append({
-                                    'Variable 1': col1_name,
-                                    'Variable 2': col2_name,
-                                    'Correlation': round(corr_val, 3),
-                                    'Strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate' if abs(corr_val) > 0.3 else 'Weak'
-                                })
-                    if corr_pairs:
-                        corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
-                        st.subheader("🎯 Strongest Correlations")
-                        st.dataframe(corr_df.head(10), use_container_width=True, hide_index=True)
-                        # Record strongest correlation insight
-                        strongest = corr_df.iloc[0]
-                        self.add_insight(f"Strongest correlation: {strongest['Variable 1']} ↔ {strongest['Variable 2']} ({strongest['Correlation']})", 2, "info")
     def stage_3_cleaning(self):
-        """Stage 3: Enhanced Data Quality Assessment and Cleaning"""
-        st.subheader("🧹 Data Quality & Cleaning")
-        with st.expander("ℹ️ Help - Data Cleaning", expanded=False):
-            st.markdown("""
-            **Available cleaning operations:**
-            - **Missing Values:** Fill with statistics, drop rows, or use custom values
-            - **Duplicates:** Remove identical rows
-            - **Outliers:** Remove or cap extreme values
-            - **Data Types:** Convert columns to appropriate types
-            """)
-        # Progress tracking
-        cleaning_progress = st.empty()
-        # Enhanced Missing Values Handling
-        self._handle_missing_values()
-        # Enhanced Duplicates Handling
-        self._handle_duplicates()
-        # Enhanced Mixed Types Handling
-        self._handle_mixed_types()
-        # Enhanced Outlier Detection
-        self._handle_outliers()
-        # Cleaning Summary
-        self._display_cleaning_summary()
-    def _handle_missing_values(self):
-        """Enhanced missing values handling with preview"""
-        missing_df = calculate_missing_data(self.df)
-        if not missing_df.empty:
-            st.subheader("🕳️ Missing Values Treatment")
-            # Select column and method
-            col1, col2, col3 = st.columns(3)
             with col1:
-                selected_col = st.selectbox("Column to clean:", missing_df['Column'].tolist())
             with col2:
-                col_dtype = str(self.df[selected_col].dtype)
-                if 'int' in col_dtype or 'float' in col_dtype:
-                    methods = ["Drop rows", "Mean", "Median", "Mode", "Custom value"]
-                else:
-                    methods = ["Drop rows", "Mode", "Custom value"]
-                fill_method = st.selectbox("Fill method:", methods)
-            with col3:
-                if fill_method == "Custom value":
-                    if 'int' in col_dtype or 'float' in col_dtype:
-                        custom_value = st.number_input("Custom value:", value=0.0)
-                    else:
-                        custom_value = st.text_input("Custom value:", value="Unknown")
-            # Preview impact
-            if selected_col:
-                missing_count = self.df[selected_col].isnull().sum()
-                total_count = len(self.df)
-                if fill_method == "Drop rows":
-                    remaining_rows = total_count - missing_count
-                    st.info(f"📊 **Preview:** Will remove {missing_count} rows, keeping {remaining_rows} rows")
-                else:
-                    st.info(f"📊 **Preview:** Will fill {missing_count} missing values")
-            # Apply cleaning
-            if st.button("✨ Apply Missing Value Treatment", type="primary"):
                 try:
-                    original_missing = self.df[selected_col].isnull().sum()
                     if fill_method == "Drop rows":
                         self.df = self.df.dropna(subset=[selected_col])
-                        operation = f"Dropped {original_missing} rows with missing values in {selected_col}"
                     else:
                         if fill_method == "Mean":
                             fill_value = self.df[selected_col].mean()
                         elif fill_method == "Median":
                             fill_value = self.df[selected_col].median()
                         elif fill_method == "Mode":
-                            mode_result = self.df[selected_col].mode()
-                            fill_value = mode_result.iloc[0] if not mode_result.empty else "Unknown"
-                        else:
-                            fill_value = custom_value
                         self.df[selected_col] = self.df[selected_col].fillna(fill_value)
-                        operation = f"Filled {original_missing} missing values in {selected_col} with {fill_method}"
-                    self.cleaning_history.append(operation)
-                    st.success(f"✅ {operation}")
-                    st.rerun()
                 except Exception as e:
-                    st.error(f"❌ Error applying treatment: {str(e)}")
-        else:
-            st.success("✅ No missing values found!")
-    def _handle_duplicates(self):
-        """Enhanced duplicate handling"""
         if self.stats['duplicates'] > 0:
-            st.subheader("👥 Duplicate Rows")
-            duplicate_pct = (self.stats['duplicates'] / len(self.df)) * 100
-            st.warning(f"⚠️ Found **{self.stats['duplicates']:,}** duplicate rows ({duplicate_pct:.1f}% of data)")
-            # Show sample duplicates
-            duplicates = self.df[self.df.duplicated(keep=False)].head(10)
-            if not duplicates.empty:
-                st.write("**Sample duplicate rows:**")
-                st.dataframe(duplicates, use_container_width=True)
-            if st.button("🗑️ Remove Duplicate Rows", type="primary"):
                 try:
-                    original_len = len(self.df)
-                    self.df = self.df.drop_duplicates()
-                    removed = original_len - len(self.df)
-                    operation = f"Removed {removed} duplicate rows"
-                    self.cleaning_history.append(operation)
-                    st.success(f"✅ {operation}")
-                    st.rerun()
                 except Exception as e:
-                    st.error(f"❌ Error removing duplicates: {str(e)}")
-        else:
-            st.success("✅ No duplicate rows found!")
-    def _handle_mixed_types(self):
-        """Enhanced mixed types handling"""
-        mixed_types = detect_mixed_types(self.df)
-        if mixed_types:
-            st.subheader("🔀 Mixed Data Types")
-            for issue in mixed_types:
-                col = issue['column']
-                problems = issue['problematic_values']
-                pct = issue['percentage']
-                st.warning(f"⚠️ **{col}:** {problems} values ({pct:.1f}%) cannot be converted to numeric")
-                # Show sample problematic values
-                if 'sample_issues' in issue:
-                    sample_issues = issue['sample_issues']
-                    st.write("**Sample problematic values:**")
-                    for value, count in list(sample_issues.items())[:5]:
-                        st.write(f"• '{value}' ({count} occurrences)")
-                col1, col2 = st.columns(2)
-                with col1:
-                    fix_method = st.selectbox(f"Fix method for {col}:",
-                                            ["Convert to numeric (coerce errors)", "Keep as text"],
-                                            key=f"fix_{col}")
-                with col2:
-                    if st.button(f"🔧 Fix {col}", key=f"apply_{col}"):
-                        try:
-                            if fix_method == "Convert to numeric (coerce errors)":
-                                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
-                                operation = f"Converted {col} to numeric (with coercion)"
-                            else:
-                                operation = f"Kept {col} as text type"
-                            self.cleaning_history.append(operation)
-                            st.success(f"✅ {operation}")
-                            st.rerun()
-                        except Exception as e:
-                            st.error(f"❌ Error fixing {col}: {str(e)}")
         else:
-            st.success("✅ No mixed data type issues found!")
-    def _handle_outliers(self):
-        """Enhanced outlier detection and handling"""
         numeric_cols = self.column_types['numeric']
-        if numeric_cols:
-            st.subheader("🎯 Outlier Detection")
-            col1, col2, col3 = st.columns(3)
             with col1:
-                selected_col = st.selectbox("Column for outlier detection:", numeric_cols)
             with col2:
-                detection_method = st.selectbox("Detection method:",
-                                              ["IQR (Interquartile Range)", "Z-Score", "Percentile"])
-            with col3:
-                if detection_method == "Z-Score":
-                    threshold = st.number_input("Z-Score threshold:", min_value=1.0, max_value=5.0, value=3.0)
-                elif detection_method == "Percentile":
-                    percentile = st.slider("Outlier percentile:", 0.1, 5.0, 1.0)
-            if selected_col:
-                try:
-                    method_map = {
-                        "IQR (Interquartile Range)": "iqr",
-                        "Z-Score": "zscore",
-                        "Percentile": "percentile"
-                    }
-                    outliers = calculate_outliers(self.df, selected_col, method_map[detection_method])
-                    if outliers is not None and not outliers.empty:
-                        outlier_count = len(outliers)
-                        outlier_pct = (outlier_count / len(self.df)) * 100
-                        st.warning(f"⚠️ Found **{outlier_count}** potential outliers ({outlier_pct:.1f}% of data)")
-                        # Show outlier statistics
-                        col1, col2 = st.columns(2)
-                        with col1:
-                            outlier_stats = outliers[selected_col].describe()
-                            st.write("**Outlier Statistics:**")
-                            st.dataframe(outlier_stats.to_frame().T, use_container_width=True)
-                        with col2:
-                            # Visualization of outliers
-                            fig = go.Figure()
-                            fig.add_trace(go.Scatter(
-                                x=self.df.index,
-                                y=self.df[selected_col],
-                                mode='markers',
-                                name='Normal Data',
-                                marker=dict(color='blue', opacity=0.6)
-                            ))
-                            fig.add_trace(go.Scatter(
-                                x=outliers.index,
-                                y=outliers[selected_col],
-                                mode='markers',
-                                name='Outliers',
-                                marker=dict(color='red', size=8)
-                            ))
-                            fig.update_layout(title=f"Outliers in {selected_col}")
-                            st.plotly_chart(fig, use_container_width=True)
-                        # Treatment options
-                        treatment_method = st.selectbox("Outlier treatment:",
-                                                      ["None", "Remove outliers", "Cap at bounds"])
-                        if treatment_method != "None":
-                            st.info(f"📊 **Preview:** This will affect {outlier_count} data points")
-                            if st.button("🔧 Apply Outlier Treatment", type="primary"):
-                                try:
-                                    if treatment_method == "Remove outliers":
-                                        self.df = self.df[~self.df.index.isin(outliers.index)]
-                                        operation = f"Removed {outlier_count} outliers from {selected_col}"
-                                    else:  # Cap at bounds
-                                        Q1 = self.df[selected_col].quantile(0.25)
-                                        Q3 = self.df[selected_col].quantile(0.75)
-                                        IQR = Q3 - Q1
-                                        lower_bound = Q1 - 1.5 * IQR
-                                        upper_bound = Q3 + 1.5 * IQR
-                                        self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
-                                        operation = f"Capped outliers in {selected_col} to bounds"
-                                    self.cleaning_history.append(operation)
-                                    st.success(f"✅ {operation}")
-                                    st.rerun()
-                                except Exception as e:
-                                    st.error(f"❌ Error treating outliers: {str(e)}")
-                    else:
-                        st.success(f"✅ No outliers detected in '{selected_col}' using {detection_method}")
-                except Exception as e:
-                    st.error(f"❌ Error detecting outliers: {str(e)}")
-    def _display_cleaning_summary(self):
-        """Display comprehensive cleaning summary"""
-        if self.cleaning_history:
-            st.subheader("📋 Cleaning Operations History")
-            for i, operation in enumerate(self.cleaning_history, 1):
-                st.write(f"**{i}.** {operation}")
-            # Show data changes
-            col1, col2 = st.columns(2)
-            with col1:
-                st.metric("Original Rows", f"{self.original_df.shape[0]:,}")
-                st.metric("Original Memory", f"{self.original_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
-            with col2:
-                st.metric("Current Rows", f"{self.df.shape[0]:,}",
-                         delta=f"{self.df.shape[0] - self.original_df.shape[0]:,}")
-                current_memory = self.df.memory_usage(deep=True).sum() / 1024**2
-                original_memory = self.original_df.memory_usage(deep=True).sum() / 1024**2
-                st.metric("Current Memory", f"{current_memory:.1f} MB",
-                         delta=f"{current_memory - original_memory:.1f} MB")
-            # Rollback option
-            if st.button("↩️ Reset to Original Data", help="Restore original dataset"):
-                self.df = self.original_df.copy()
-                self.cleaning_history = []
-                st.success("✅ Data reset to original state")
-                st.rerun()
-            self.add_insight(f"Applied {len(self.cleaning_history)} cleaning operations", 3, "info")
-        else:
-            st.info("ℹ️ No cleaning operations performed yet")
-    def stage_4_analysis(self):
-        """Stage 4: Enhanced Advanced Analysis"""
-        st.subheader("🔬 Advanced Analysis")
-        with st.expander("ℹ️ Help - Advanced Analysis", expanded=False):
-            st.markdown("""
-            **Advanced analysis includes:**
-            - **Relationships:** Correlation and scatter plot analysis
-            - **Group Analysis:** Compare metrics across categories
-            - **Distribution Analysis:** Statistical testing and comparisons
-            """)
-        numeric_cols = self.column_types['numeric']
-        categorical_cols = self.column_types['categorical']
-        # Enhanced Relationship Analysis
-        if len(numeric_cols) >= 2:
-            self._advanced_relationship_analysis(numeric_cols)
-        # Enhanced Group Analysis
         if categorical_cols and numeric_cols:
-            self._advanced_group_analysis(categorical_cols, numeric_cols)
-        # Statistical Testing
-        if len(numeric_cols) >= 2:
-            self._statistical_testing(numeric_cols, categorical_cols)
-    def _advanced_relationship_analysis(self, numeric_cols: List[str]):
-        """Enhanced relationship analysis with statistical insights"""
-        st.subheader("🔗 Variable Relationships")
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            x_var = st.selectbox("X Variable:", numeric_cols)
-        with col2:
-            y_var = st.selectbox("Y Variable:", [col for col in numeric_cols if col != x_var])
-        with col3:
-            color_var = st.selectbox("Color by (optional):",
-                                   ["None"] + self.column_types['categorical'][:10])
-        if x_var and y_var:
-            try:
-                # Sample for performance
-                sample_size = min(5000, len(self.df))
-                if len(self.df) > sample_size:
-                    sample_df = self.df.sample(n=sample_size, random_state=42)
-                    st.info(f"📊 Showing sample of {sample_size:,} points for performance")
-                else:
-                    sample_df = self.df
-                # Create scatter plot
-                if color_var != "None":
-                    fig = px.scatter(sample_df, x=x_var, y=y_var, color=color_var,
-                                   title=f"Relationship: {x_var} vs {y_var}",
-                                   trendline="ols")
-                else:
-                    fig = px.scatter(sample_df, x=x_var, y=y_var,
-                                   title=f"Relationship: {x_var} vs {y_var}",
-                                   trendline="ols")
-                st.plotly_chart(fig, use_container_width=True)
-                # Statistical analysis
-                correlation = self.df[x_var].corr(self.df[y_var])
-                col1, col2, col3 = st.columns(3)
-                with col1:
-                    st.metric("Correlation", f"{correlation:.3f}")
-                with col2:
-                    if abs(correlation) > 0.7:
-                        strength = "Strong"
-                    elif abs(correlation) > 0.3:
-                        strength = "Moderate"
-                    else:
-                        strength = "Weak"
-                    st.metric("Strength", strength)
-                with col3:
-                    direction = "Positive" if correlation > 0 else "Negative"
-                    st.metric("Direction", direction)
-                # Record insight
-                self.add_insight(f"{strength} {direction.lower()} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4, "info")
-            except Exception as e:
-                st.error(f"❌ Error in relationship analysis: {str(e)}")
-    def _advanced_group_analysis(self, categorical_cols: List[str], numeric_cols: List[str]):
-        """Enhanced group analysis with statistical comparisons"""
-        st.subheader("👥 Group Analysis")
-        col1, col2 = st.columns(2)
-        with col1:
-            group_var = st.selectbox("Group by:", categorical_cols)
-        with col2:
-            metric_var = st.selectbox("Analyze metric:", numeric_cols)
-        if group_var and metric_var:
-            try:
-                group_stats = calculate_group_stats(self.df, group_var, metric_var)
-                if group_stats is not None and not group_stats.empty:
-                    # Display statistics
-                    st.dataframe(group_stats, use_container_width=True)
-                    # Visualization
-                    unique_groups = self.df[group_var].nunique()
-                    if unique_groups <= 20:
-                        col1, col2 = st.columns(2)
-                        with col1:
-                            fig = px.box(self.df, x=group_var, y=metric_var,
-                                       title=f"{metric_var} by {group_var}")
-                            fig.update_layout(xaxis_tickangle=-45)
-                            st.plotly_chart(fig, use_container_width=True)
-                        with col2:
-                            # Mean comparison
-                            group_means = self.df.groupby(group_var)[metric_var].mean().sort_values(ascending=False)
-                            fig = px.bar(x=group_means.index, y=group_means.values,
-                                       title=f"Average {metric_var} by {group_var}")
-                            fig.update_layout(xaxis_tickangle=-45)
-                            st.plotly_chart(fig, use_container_width=True)
-                    else:
-                        st.info(f"ℹ️ Too many groups ({unique_groups}) for visualization. Showing statistics only.")
-                    # Find insights
-                    best_group = group_stats.loc[group_stats['mean'].idxmax(), group_var]
-                    best_value = group_stats['mean'].max()
-                    worst_group = group_stats.loc[group_stats['mean'].idxmin(), group_var]
-                    worst_value = group_stats['mean'].min()
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        st.success(f"🏆 **Highest {metric_var}:** {best_group} ({best_value:.2f})")
-                    with col2:
-                        st.info(f"📉 **Lowest {metric_var}:** {worst_group} ({worst_value:.2f})")
-                    self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4, "success")
-            except Exception as e:
-                st.error(f"❌ Error in group analysis: {str(e)}")
-    def _statistical_testing(self, numeric_cols: List[str], categorical_cols: List[str]):
-        """Enhanced statistical testing capabilities"""
-        if len(numeric_cols) >= 2:
-            st.subheader("📊 Statistical Testing")
-            test_type = st.selectbox("Select test type:",
-                                   ["Correlation Test", "Group Comparison"])
-            if test_type == "Correlation Test" and len(numeric_cols) >= 2:
-                col1, col2 = st.columns(2)
-                with col1:
-                    var1 = st.selectbox("Variable 1:", numeric_cols, key="corr_var1")
-                with col2:
-                    var2 = st.selectbox("Variable 2:",
-                                      [col for col in numeric_cols if col != var1],
-                                      key="corr_var2")
-                if st.button("🧪 Run Correlation Test"):
-                    try:
-                        from scipy.stats import pearsonr, spearmanr
-                        # Clean data for testing
-                        clean_data = self.df[[var1, var2]].dropna()
-                        if len(clean_data) < 10:
-                            st.warning("⚠️ Insufficient data for reliable correlation testing")
-                        else:
-                            # Pearson correlation
-                            pearson_corr, pearson_p = pearsonr(clean_data[var1], clean_data[var2])
-                            # Spearman correlation (rank-based)
-                            spearman_corr, spearman_p = spearmanr(clean_data[var1], clean_data[var2])
-                            col1, col2 = st.columns(2)
-                            with col1:
-                                st.subheader("Pearson Correlation")
-                                st.metric("Correlation", f"{pearson_corr:.3f}")
-                                st.metric("P-value", f"{pearson_p:.4f}")
-                                if pearson_p < 0.05:
-                                    st.success("✅ Statistically significant")
-                                else:
-                                    st.warning("⚠️ Not statistically significant")
-                            with col2:
-                                st.subheader("Spearman Correlation")
-                                st.metric("Correlation", f"{spearman_corr:.3f}")
-                                st.metric("P-value", f"{spearman_p:.4f}")
-                                if spearman_p < 0.05:
-                                    st.success("✅ Statistically significant")
-                                else:
-                                    st.warning("⚠️ Not statistically significant")
-                            # Interpretation
-                            if pearson_p < 0.05:
-                                self.add_insight(f"Significant correlation between {var1} and {var2} (p={pearson_p:.4f})", 4, "success")
-                    except Exception as e:
-                        st.error(f"❌ Error in correlation testing: {str(e)}")
     def stage_5_summary(self):
         """Stage 5: render the closing summary and the export section.

         Shows a help expander, four headline metrics, the insights grouped by
         stage, the list of cleaning operations (when any were logged) and the
         export options.
         """
         st.subheader("📈 Analysis Summary & Export")

         # Every help line carries a 12-space indent inside the markdown string,
         # and the string starts with a newline and ends with that indent.
         pad = " " * 12
         help_lines = (
             "**This final stage provides:**",
             "- Complete analysis summary with all insights",
             "- Multiple export formats for your results",
             "- Code generation for reproducible analysis",
             "- Data quality final report",
         )
         help_text = "\n" + "".join(f"{pad}{line}\n" for line in help_lines) + pad
         with st.expander("ℹ️ Help - Summary & Export", expanded=False):
             st.markdown(help_text)

         # Key metrics dashboard
         total_col, success_col, warning_col, quality_col = st.columns(4)
         with total_col:
             st.metric("📊 Total Insights", len(self.insights))
         with success_col:
             success_insights = sum(1 for entry in self.insights if entry.get('type') == 'success')
             st.metric("✅ Positive Findings", success_insights)
         with warning_col:
             warning_insights = sum(1 for entry in self.insights if entry.get('type') == 'warning')
             st.metric("⚠️ Issues Found", warning_insights)
         with quality_col:
             is_clean = self.stats['missing_values'] == 0 and self.stats['duplicates'] == 0
             if is_clean:
                 final_quality = "High"
             else:
                 final_quality = "Medium"
             st.metric("🎯 Final Quality", final_quality)

         # Categorized insights summary
         self._display_categorized_insights()

         # Data transformation summary
         if self.cleaning_history:
             st.subheader("🔄 Data Transformations Applied")
             for step, description in enumerate(self.cleaning_history, 1):
                 st.write(f"**{step}.** {description}")
             st.info(f"✨ Dataset transformed from {self.original_df.shape} to {self.df.shape}")

         # Export options
         self._display_export_options()
-    def _display_categorized_insights(self):
-        """Display insights organized by category and stage"""
-        st.subheader("💡 Key Insights by Stage")
-        stage_names = {
-            0: "🔍 Validation",
-            1: "📊 Overview",
-            2: "🔍 Exploration",
-            3: "🧹 Cleaning",
-            4: "🔬 Analysis"
-        }
-        for stage in range(5):
-            stage_insights = [i for i in self.insights if i['stage'] == stage]
-            if stage_insights:
-                st.write(f"**{stage_names.get(stage, f'Stage {stage}')}**")
-                for insight in stage_insights:
-                    icon = {"success": "✅", "warning": "⚠️", "error": "❌"}.get(insight.get('type'), "ℹ️")
-                    st.write(f"  {icon} {insight['insight']}")
-    def _display_export_options(self):
-        """Enhanced export options with previews"""
-        st.subheader("📤 Export Results")
-        export_type = st.selectbox("Choose export type:",
-                                 ["Analysis Report", "Cleaned Dataset", "Python Code", "Summary Dashboard"])
-        try:
-            if export_type == "Analysis Report":
-                format_choice = st.selectbox("Report format:", ["Markdown", "HTML", "Text"])
-                col1, col2 = st.columns([3, 1])
-                with col1:
-                    if format_choice == "Markdown":
-                        report = self.generate_markdown_report()
-                        st.code(report[:500] + "..." if len(report) > 500 else report, language="markdown")
-                with col2:
-                    st.download_button(
-                        label=f"📄 Download {format_choice} Report",
-                        data=report if format_choice == "Markdown" else self.generate_text_report(),
-                        file_name=f"analysis_report.{format_choice.lower()}",
-                        mime="text/markdown" if format_choice == "Markdown" else "text/plain"
-                    )
-            elif export_type == "Cleaned Dataset":
-                format_choice = st.selectbox("Data format:", ["CSV", "Excel", "Parquet"])
-                col1, col2 = st.columns([3, 1])
-                with col1:
-                    st.write("**Data Preview:**")
-                    st.dataframe(self.df.head(), use_container_width=True)
-                    st.write(f"**Final Shape:** {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns")
-                with col2:
-                    if st.button(f"📊 Export as {format_choice}"):
-                        try:
-                            if format_choice == "CSV":
-                                csv = self.df.to_csv(index=False)
-                                st.download_button("💾 Download CSV", csv, "cleaned_data.csv", "text/csv")
-                            elif format_choice == "Excel":
-                                buffer = BytesIO()
-                                with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
-                                    self.df.to_excel(writer, sheet_name='Cleaned_Data', index=False)
-                                    # Add summary sheet
-                                    summary_df = pd.DataFrame({
-                                        'Metric': ['Original Rows', 'Final Rows', 'Columns', 'Cleaning Operations'],
-                                        'Value': [self.original_df.shape[0], self.df.shape[0],
-                                                self.df.shape[1], len(self.cleaning_history)]
-                                    })
-                                    summary_df.to_excel(writer, sheet_name='Summary', index=False)
-                                st.download_button("💾 Download Excel", buffer.getvalue(),
-                                                 "cleaned_data.xlsx",
-                                                 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
-                            elif format_choice == "Parquet":
-                                buffer = BytesIO()
-                                self.df.to_parquet(buffer, index=False)
-                                st.download_button("💾 Download Parquet", buffer.getvalue(),
-                                                 "cleaned_data.parquet", "application/octet-stream")
-                        except Exception as e:
-                            st.error(f"❌ Export error: {str(e)}")
-            elif export_type == "Python Code":
-                code = self.generate_enhanced_python_code()
-                st.code(code, language="python")
-                st.download_button("💾 Download Python Script", code,
-                                 "analysis_script.py", "text/plain")
-        except Exception as e:
-            st.error(f"❌ Export error: {str(e)}")
     def generate_markdown_report(self) -> str:
         """Build the analysis report as a Markdown string.

         Returns:
             The report text: an executive summary, an overview table, the
             dtype counts from ``self.stats``, the insights grouped by stage
             (0-4) and, when any were logged, the cleaning operations.
         """
         stats = self.stats
         n_rows, n_cols = stats['shape'][0], stats['shape'][1]
         grade = calculate_data_quality_score(self.df)['grade']
         generated_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

         # Fragments are collected in order and joined once at the end.
         parts = [
             "# 📊 Data Analysis Report\n",
             "## Executive Summary\n",
             f"- **Dataset Size:** {n_rows:,} rows × {n_cols:,} columns\n",
             f"- **Data Quality:** {grade} grade\n",
             f"- **Memory Usage:** {stats['memory_usage']:.1f} MB\n",
             f"- **Analysis Completed:** {generated_at}\n",
             "## 📈 Data Overview\n",
             "| Metric | Value |\n",
             "|--------|-------|\n",
             f"| Total Records | {n_rows:,} |\n",
             f"| Total Features | {n_cols:,} |\n",
             f"| Missing Values | {stats['missing_values']:,} |\n",
             f"| Duplicate Rows | {stats['duplicates']:,} |\n",
             "## 📊 Data Types\n",
         ]

         parts.extend(
             f"- **{dtype}:** {count} columns\n"
             for dtype, count in stats['dtypes'].items()
         )

         parts.append("\n## 💡 Key Insights\n")

         # Group insights by stage; the mapping's insertion order is 0..4.
         stage_names = {0: "Validation", 1: "Overview", 2: "Exploration", 3: "Cleaning", 4: "Analysis"}
         type_icons = {"success": "✅", "warning": "⚠️", "error": "❌"}
         for stage, heading in stage_names.items():
             stage_insights = [i for i in self.insights if i['stage'] == stage]
             if not stage_insights:
                 continue
             parts.append(f"\n### {heading}\n")
             for insight in stage_insights:
                 icon = type_icons.get(insight.get('type'), "ℹ️")
                 parts.append(f"- {icon} {insight['insight']}\n")

         if self.cleaning_history:
             parts.append("\n## 🔄 Data Transformations\n")
             parts.extend(
                 f"{step}. {operation}\n"
                 for step, operation in enumerate(self.cleaning_history, 1)
             )

         parts.append("\n---\n*Report generated by Data Analysis Platform*")
         return "".join(parts)
-    def generate_enhanced_python_code(self) -> str:
-        """Generate comprehensive Python code for reproducible analysis"""
-        code = f'''"""
-Data Analysis Script
-Generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
-Original Dataset: {self.original_df.shape[0]:,} rows × {self.original_df.shape[1]:,} columns
-Final Dataset: {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns
-"""
-import pandas as pd
 import numpy as np
 import plotly.express as px
-import plotly.graph_objects as go
-from scipy import stats
-import warnings
-warnings.filterwarnings('ignore')
-# Load data
-def load_and_prepare_data(file_path: str) -> pd.DataFrame:
-    """Load and prepare data with error handling"""
-    try:
-        if file_path.endswith('.csv'):
-            df = pd.read_csv(file_path)
-        elif file_path.endswith(('.xlsx', '.xls')):
-            df = pd.read_excel(file_path)
-        else:
-            raise ValueError("Unsupported file format")
-        print(f"Loaded data: {{df.shape[0]:,}} rows × {{df.shape[1]:,}} columns")
-        return df
-    except Exception as e:
-        print(f"Error loading data: {{e}}")
-        return None
-# Data quality assessment
-def assess_data_quality(df: pd.DataFrame) -> dict:
-    """Calculate comprehensive data quality metrics"""
-    total_cells = len(df) * len(df.columns)
-    missing_count = df.isnull().sum().sum()
-    duplicate_count = df.duplicated().sum()
-    return {{
-        'total_rows': len(df),
-        'total_columns': len(df.columns),
-        'missing_percentage': (missing_count / total_cells) * 100,
-        'duplicate_percentage': (duplicate_count / len(df)) * 100,
-        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2
-    }}
-# Main analysis
-if __name__ == "__main__":
-    # Load your data
-    df = load_and_prepare_data('your_data_file.csv')  # Update with your file path
-    if df is not None:
-        # Data quality assessment
-        quality = assess_data_quality(df)
-        print("\\n=== DATA QUALITY REPORT ===")
-        print(f"Rows: {{quality['total_rows']:,}}")
-        print(f"Columns: {{quality['total_columns']:,}}")
-        print(f"Missing Data: {{quality['missing_percentage']:.2f}}%")
-        print(f"Duplicates: {{quality['duplicate_percentage']:.2f}}%")
-        print(f"Memory Usage: {{quality['memory_usage_mb']:.1f}} MB")
-'''
-        # Add cleaning operations if any
-        if self.cleaning_history:
-            code += "\n        # Applied cleaning operations:\n"
             for operation in self.cleaning_history:
-                if "missing" in operation.lower():
-                    code += "        # df = df.fillna(method='your_chosen_method')\n"
                 elif "duplicate" in operation.lower():
-                    code += "        df = df.drop_duplicates()\n"
                 elif "outlier" in operation.lower():
-                    code += """        # Remove outliers using IQR method
-        def remove_outliers(df, column):
-            Q1 = df[column].quantile(0.25)
-            Q3 = df[column].quantile(0.75)
-            IQR = Q3 - Q1
-            return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
-        # df = remove_outliers(df, 'your_column')
-"""
-        # Add analysis code
-        code += f"""
-        # Basic statistics
-        print("\\n=== BASIC STATISTICS ===")
-        print(df.describe())
-        # Correlation analysis (if numeric columns exist)
-        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-        if len(numeric_cols) > 1:
-            print("\\n=== CORRELATION MATRIX ===")
-            corr_matrix = df[numeric_cols].corr()
-            print(corr_matrix)
-            # Visualize correlation matrix
-            fig = px.imshow(corr_matrix, title='Correlation Matrix')
-            fig.show()
-        # Missing values visualization
-        missing = df.isnull().sum()
-        if missing.sum() > 0:
-            missing = missing[missing > 0]
-            fig = px.bar(x=missing.index, y=missing.values,
-                        title='Missing Values by Column')
-            fig.show()
-        # Final data quality report
-        final_quality = assess_data_quality(df)
-        print("\\n=== FINAL QUALITY REPORT ===")
-        for key, value in final_quality.items():
-            print(f"{{key}}: {{value}}")
 """
-        return code
-    def generate_text_report(self) -> str:
-        """Generate enhanced text analysis report"""
-        report = f"""DATA ANALYSIS REPORT
-{'='*50}
-EXECUTIVE SUMMARY
-Dataset: {self.stats['shape'][0]:,} rows × {self.stats['shape'][1]:,} columns
-Quality Grade: {calculate_data_quality_score(self.df)['grade']}
-Memory Usage: {self.stats['memory_usage']:.1f} MB
-Analysis Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
-DATA OVERVIEW
-- Total Records: {self.stats['shape'][0]:,}
-- Total Features: {self.stats['shape'][1]:,}
-- Missing Values: {self.stats['missing_values']:,}
-- Duplicate Rows: {self.stats['duplicates']:,}
-DATA TYPES DISTRIBUTION
 """
-        for dtype, count in self.stats['dtypes'].items():
-            report += f"- {dtype}: {count} columns\n"
-        report += "\nKEY INSIGHTS\n" + "="*20 + "\n"
-        # Organize insights by stage
-        stage_names = {0: "VALIDATION", 1: "OVERVIEW", 2: "EXPLORATION", 3: "CLEANING", 4: "ANALYSIS"}
-        for stage in range(5):
-            stage_insights = [i for i in self.insights if i['stage'] == stage]
-            if stage_insights:
-                report += f"\n{stage_names.get(stage, f'STAGE {stage}')}:\n"
-                for i, insight in enumerate(stage_insights, 1):
-                    report += f"  {i}. {insight['insight']}\n"
-        if self.cleaning_history:
-            report += f"\nDATA TRANSFORMATIONS\n{'='*20}\n"
-            for i, operation in enumerate(self.cleaning_history, 1):
-                report += f"{i}. {operation}\n"
-        report += f"\n{'='*50}\nReport generated by Data Analysis Platform\n"
-        return report

 import plotly.graph_objects as go
 from typing import Dict, List, Any, Optional
 import os
 from dotenv import load_dotenv
 from data_handler import *
 from io import BytesIO
 # Load environment variables
 load_dotenv()
+# Optional AI Integration
 try:
     import openai
     OPENAI_AVAILABLE = True
 except ImportError:
     OPENAI_AVAILABLE = False
 try:
     import google.generativeai as genai
     GEMINI_AVAILABLE = True
 except ImportError:
     GEMINI_AVAILABLE = False
 class AIAssistant:
+    """AI-powered analysis assistant"""
     def __init__(self):
         """Read the API keys from the environment and prepare Gemini if usable."""
         environment = os.environ
         self.openai_key = environment.get('OPENAI_API_KEY')
         self.gemini_key = environment.get('GOOGLE_API_KEY')
         # Without a key (or without the package) no Gemini model is created,
         # and ``gemini_model`` stays undefined on the instance.
         if not (self.gemini_key and GEMINI_AVAILABLE):
             return
         genai.configure(api_key=self.gemini_key)
         self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
     def get_available_models(self) -> List[str]:
         """Get list of available AI models"""
         return models
     def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
+        """Get AI analysis of insights"""
+        # Prepare data summary
+        summary = f"""
+        Dataset Summary:
+        - Shape: {df.shape}
+        - Columns: {list(df.columns)}
+        - Data types: {df.dtypes.value_counts().to_dict()}
+        Key Insights Found:
+        """
+        for insight in insights:
+            summary += f"\n- {insight['insight']}"
+        prompt = f"""
+        As a senior data scientist, analyze this dataset and provide:
+        1. Business implications of the findings
+        2. Potential opportunities or risks
+        3. Recommendations for decision-making
+        4. Suggestions for further analysis
+        {summary}
+        Provide actionable insights in a professional format.
+        """
         try:
             if model == "Google Gemini" and hasattr(self, 'gemini_model'):
                 response = self.gemini_model.generate_content(prompt)
+                return response.text
+            elif model == "OpenAI GPT" and self.openai_key:
                 client = openai.OpenAI(api_key=self.openai_key)
                 response = client.chat.completions.create(
                     model="gpt-3.5-turbo",
+                    messages=[{"role": "user", "content": prompt}]
                 )
+                return response.choices[0].message.content
             else:
+                return "AI analysis not available. Please configure API keys."
         except Exception as e:
+            return f"AI Analysis Error: {str(e)}"
 class DataAnalysisWorkflow:
+    """Optimized data analysis workflow with caching and pagination"""
     def __init__(self, df: pd.DataFrame):
         """Bind the dataset and precompute the metadata used by the stages."""
         # Plain settings first; they need no computation.
         self.page_size = 1000  # For pagination
         self.insights = []
         # Dataset plus the summaries derived from it by the data_handler helpers.
         self.df = df
         self.stats = calculate_basic_stats(df)
         self.column_types = get_column_types(df)
+    def add_insight(self, insight: str, stage: int):
+        """Add insight to analysis report"""
         self.insights.append({
             'stage': stage,
             'insight': insight,
             'timestamp': pd.Timestamp.now()
         })
     def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
+        """Get paginated data for display"""
+        start_idx = page * self.page_size
+        end_idx = start_idx + self.page_size
+        return self.df.iloc[start_idx:end_idx]
     def stage_1_overview(self):
         """Stage 1: render the dataset overview and record stage-1 insights.

         Renders, in order: headline metrics, memory analysis, column
         cardinality, dtype distribution, a (paginated) data sample and the
         missing-value table.
         """
         st.subheader("📊 Data Overview")

         # Headline figures and quality score
         quality_metrics = calculate_data_quality_score(self.df)
         rows_slot, cols_slot, score_slot, grade_slot = st.columns(4)
         with rows_slot:
             st.metric("Rows", f"{self.stats['shape'][0]:,}")
         with cols_slot:
             st.metric("Columns", f"{self.stats['shape'][1]:,}")
         with score_slot:
             st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100")
         with grade_slot:
             st.metric("Grade", quality_metrics['grade'])

         found_issues = quality_metrics['issues']
         if found_issues:
             st.warning("Quality Issues Found:")
             for problem in found_issues:
                 st.write(f"• {problem}")

         # Memory footprint and possible savings
         st.subheader("Memory Analysis")
         memory_opt = calculate_memory_optimization(self.df)
         current_slot, savings_slot = st.columns(2)
         with current_slot:
             st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
         with savings_slot:
             savings_mb = memory_opt['potential_savings_mb']
             if savings_mb > 0:
                 st.metric("Potential Savings",
                           f"{savings_mb:.1f} MB",
                           f"{memory_opt['potential_savings_pct']:.1f}%")
                 show_details = st.button("Show Optimization Details")
                 if show_details:
                     st.dataframe(pd.DataFrame(memory_opt['suggestions']))

         # Cardinality table, filterable by the helper's 'Type' label
         st.subheader("Column Cardinality Analysis")
         cardinality_df = calculate_column_cardinality(self.df)
         type_options = cardinality_df['Type'].unique()
         chosen_types = st.multiselect("Filter by Column Type",
                                       type_options,
                                       default=type_options)
         visible = cardinality_df[cardinality_df['Type'].isin(chosen_types)]
         st.dataframe(visible, use_container_width=True)

         # Call out identifier-like and constant columns among the visible rows
         id_cols = visible.loc[visible['Type'] == 'Unique Identifier', 'Column'].tolist()
         if id_cols:
             st.info(f"📌 Potential ID columns found: {', '.join(id_cols)}")
         const_cols = visible.loc[visible['Type'] == 'Constant', 'Column'].tolist()
         if const_cols:
             st.warning(f"⚠️ Constant columns found: {', '.join(const_cols)}")

         # Dtype distribution pie chart
         dtype_counts = self.stats['dtypes']
         if dtype_counts:
             st.subheader("Data Types Distribution")
             fig = px.pie(values=list(dtype_counts.values()),
                          names=list(dtype_counts),
                          title="Data Types")
             st.plotly_chart(fig, use_container_width=True)

         # Data sample; paginated only when there is more than one page
         st.subheader("Sample Data")
         row_count = len(self.df)
         total_pages = (row_count - 1) // self.page_size + 1
         if total_pages > 1:
             page = st.slider("Page", 0, total_pages - 1, 0)
             sample_data = self.get_paginated_data(page)
             first_shown = page * self.page_size + 1
             last_shown = min((page + 1) * self.page_size, row_count)
             st.write(f"Showing rows {first_shown} to {last_shown}")
         else:
             sample_data = self.df.head(10)
         st.dataframe(sample_data, use_container_width=True)

         # Missing values: the first row of the helper's table is reported
         missing_df = calculate_missing_data(self.df)
         if missing_df.empty:
             st.success("✅ No missing values found!")
             self.add_insight("Dataset has no missing values - excellent data quality", 1)
         else:
             st.subheader("Missing Values Analysis")
             st.dataframe(missing_df, use_container_width=True)
             top_row = missing_df.iloc[0]
             worst_column = top_row['Column']
             worst_percentage = top_row['Missing %']
             self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)

         # Stage-1 insights on quality, memory and identifier columns
         score = quality_metrics['score']
         if score < 80:
             self.add_insight(f"Data quality needs improvement (Score: {score:.1f}/100)", 1)
         savings_pct = memory_opt['potential_savings_pct']
         if savings_pct > 20:
             self.add_insight(f"Potential memory optimization of {savings_pct:.1f}% identified", 1)
         if id_cols:
             self.add_insight(f"Found {len(id_cols)} potential ID columns", 1)
     def stage_2_exploration(self):
         """Stage 2: exploratory analysis of numeric and categorical columns.

         For numeric columns it draws a histogram and box plot of the selected
         column, shows ``describe()`` for all numeric columns and, when there
         are at least two, a correlation heatmap. For categorical columns it
         draws the value counts of the selected column. Findings are recorded
         as stage-2 insights.
         """
         st.subheader("🔍 Exploratory Data Analysis")
         numeric_cols = self.column_types['numeric']
         categorical_cols = self.column_types['categorical']

         # Numeric analysis
         if numeric_cols:
             st.subheader("Numeric Variables")
             selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
             col1, col2 = st.columns(2)
             with col1:
                 fig = px.histogram(self.df, x=selected_numeric,
                                    title=f"Distribution of {selected_numeric}")
                 st.plotly_chart(fig, use_container_width=True)
             with col2:
                 fig = px.box(self.df, y=selected_numeric,
                              title=f"Box Plot of {selected_numeric}")
                 st.plotly_chart(fig, use_container_width=True)

             # Statistical summary
             st.subheader("Statistical Summary")
             summary_stats = self.df[numeric_cols].describe()
             st.dataframe(summary_stats, use_container_width=True)

             # Correlation analysis
             if len(numeric_cols) > 1:
                 st.subheader("Correlation Analysis")
                 corr_matrix = calculate_correlation_matrix(self.df)
                 if not corr_matrix.empty:
                     fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                                     title="Correlation Matrix")
                     st.plotly_chart(fig, use_container_width=True)

                     # Strongest pairwise correlation over the upper triangle.
                     # NaN entries are skipped: comparisons with NaN are always
                     # False, so max() over a list containing NaN depends on
                     # element order and can report "nan".
                     size = len(corr_matrix.columns)
                     pair_strengths = [
                         abs(corr_matrix.iloc[i, j])
                         for i in range(size)
                         for j in range(i + 1, size)
                     ]
                     pair_strengths = [value for value in pair_strengths if pd.notna(value)]
                     if pair_strengths:
                         max_corr = max(pair_strengths)
                         self.add_insight(f"Maximum correlation coefficient: {max_corr:.3f}", 2)

         # Categorical analysis
         if categorical_cols:
             st.subheader("Categorical Variables")
             selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
             value_counts = get_value_counts(self.df, selected_categorical)
             fig = px.bar(x=value_counts.index, y=value_counts.values,
                          title=f"Top 10 {selected_categorical} Values")
             st.plotly_chart(fig, use_container_width=True)
             total_categories = self.df[selected_categorical].nunique()
             self.add_insight(f"Column '{selected_categorical}' has {total_categories} unique categories", 2)
     def _refresh_metadata(self):
         """Recompute the cached stats and column types after ``self.df`` changes."""
         self.stats = calculate_basic_stats(self.df)
         self.column_types = get_column_types(self.df)

     def stage_3_cleaning(self):
         """Stage 3: interactive data cleaning and quality assessment.

         Offers treatments for missing values, duplicate rows, mixed-type
         columns and outliers. Each applied operation is appended to
         ``self.cleaning_history`` (created on first use, so that
         ``generate_python_code`` can read it) and the cached stats and column
         types are refreshed so later sections see the cleaned data. Issues
         still present at the end are listed as remaining action items.
         """
         st.subheader("🧹 Data Quality Assessment")
         cleaning_actions = []
         applied_now = []  # operations performed during this run only
         if not hasattr(self, 'cleaning_history'):
             self.cleaning_history = []

         def record(message):
             # Log the operation and bring the cached metadata up to date.
             applied_now.append(message)
             self.cleaning_history.append(message)
             self._refresh_metadata()

         # Missing values handling
         if self.stats['missing_values'] > 0:
             st.subheader("Missing Values Treatment")
             missing_df = calculate_missing_data(self.df)
             st.dataframe(missing_df, use_container_width=True)
             col1, col2 = st.columns(2)
             with col1:
                 selected_col = st.selectbox("Select column to handle missing values:",
                                             missing_df['Column'].tolist())
             with col2:
                 fill_method = st.selectbox("Choose fill method:",
                                            ["Drop rows", "Mean", "Median", "Mode", "Custom value"])
             # The input must exist before the button is pressed. A widget
             # created inside the button branch only ever yields its default,
             # so the custom value would always be 0.0.
             custom_value = None
             if fill_method == "Custom value":
                 custom_value = st.number_input("Enter custom value:", value=0.0)
             if st.button("Apply Missing Value Treatment"):
                 try:
                     if fill_method == "Drop rows":
                         self.df = self.df.dropna(subset=[selected_col])
                         record(f"Dropped rows with missing values in {selected_col}")
                     else:
                         column = self.df[selected_col]
                         if fill_method == "Mean":
                             fill_value = column.mean()
                         elif fill_method == "Median":
                             fill_value = column.median()
                         elif fill_method == "Mode":
                             fill_value = column.mode()[0]
                         else:  # Custom value
                             fill_value = custom_value
                         self.df[selected_col] = column.fillna(fill_value)
                         record(f"Filled missing values in {selected_col} with {fill_method}")
                     st.success("✅ Missing values handled successfully!")
                 except Exception as e:
                     # UI boundary: report the failure instead of crashing the page.
                     st.error(f"Error handling missing values: {str(e)}")

         # Duplicates handling
         if self.stats['duplicates'] > 0:
             st.subheader("Duplicate Rows")
             st.warning(f"Found {self.stats['duplicates']} duplicate rows")
             if st.button("Remove Duplicate Rows"):
                 original_len = len(self.df)
                 self.df = self.df.drop_duplicates()
                 removed = original_len - len(self.df)
                 record(f"Removed {removed} duplicate rows")
                 st.success(f"✅ Removed {removed} duplicate rows")
         else:
             st.success("✅ No duplicate rows found")

         # Mixed type detection and handling
         mixed_types = detect_mixed_types(self.df)
         unresolved_mixed = [item['column'] for item in mixed_types] if mixed_types else []
         if mixed_types:
             st.subheader("Mixed Data Types")
             mixed_df = pd.DataFrame(mixed_types)
             st.dataframe(mixed_df, use_container_width=True)
             selected_col = st.selectbox("Select column to fix data type:",
                                         unresolved_mixed)
             fix_method = st.selectbox("Choose fix method:",
                                       ["Convert to numeric", "Convert to string"])
             if st.button("Fix Data Type"):
                 try:
                     if fix_method == "Convert to numeric":
                         self.df[selected_col] = pd.to_numeric(self.df[selected_col], errors='coerce')
                     else:
                         self.df[selected_col] = self.df[selected_col].astype(str)
                     record(f"Fixed data type for {selected_col} to {fix_method}")
                     unresolved_mixed = [name for name in unresolved_mixed if name != selected_col]
                     st.success("✅ Data type fixed successfully!")
                 except Exception as e:
                     st.error(f"Error fixing data type: {str(e)}")

         # Outlier detection and handling
         pending_outliers = None  # (column, count) when outliers are left untreated
         numeric_cols = self.column_types['numeric']
         if numeric_cols:
             st.subheader("Outlier Detection")
             selected_col = st.selectbox("Select column for outlier detection:", numeric_cols)
             outliers = calculate_outliers(self.df, selected_col)
             outlier_count = len(outliers)
             if outlier_count > 0:
                 pending_outliers = (selected_col, outlier_count)
                 st.warning(f"Found {outlier_count} potential outliers in '{selected_col}'")
                 st.dataframe(outliers[[selected_col]].head(100), use_container_width=True)
                 treatment_method = st.selectbox("Choose outlier treatment method:",
                                                 ["None", "Remove", "Cap at percentiles"])
                 if treatment_method != "None" and st.button("Apply Outlier Treatment"):
                     try:
                         if treatment_method == "Remove":
                             self.df = self.df[~self.df.index.isin(outliers.index)]
                             record(f"Removed {outlier_count} outliers from {selected_col}")
                         else:  # Cap at the 1.5 * IQR fences
                             Q1 = self.df[selected_col].quantile(0.25)
                             Q3 = self.df[selected_col].quantile(0.75)
                             IQR = Q3 - Q1
                             lower_bound = Q1 - 1.5 * IQR
                             upper_bound = Q3 + 1.5 * IQR
                             self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
                             record(f"Capped outliers in {selected_col} at percentiles")
                         pending_outliers = None
                         st.success("✅ Outliers handled successfully!")
                     except Exception as e:
                         st.error(f"Error handling outliers: {str(e)}")
             else:
                 st.success(f"✅ No outliers detected in '{selected_col}'")

         # Cleaning History
         if self.cleaning_history:
             st.subheader("Cleaning Operations History")
             for i, operation in enumerate(self.cleaning_history, 1):
                 st.write(f"{i}. {operation}")
         if applied_now:
             self.add_insight(f"Performed {len(applied_now)} data cleaning operations", 3)

         # Collect what is still open so the summary reflects the data's state.
         if self.stats['missing_values'] > 0:
             cleaning_actions.append(f"Handle {self.stats['missing_values']:,} missing values")
         if self.stats['duplicates'] > 0:
             cleaning_actions.append(f"Remove {self.stats['duplicates']:,} duplicate rows")
         if unresolved_mixed:
             cleaning_actions.append(f"Fix mixed data types in: {', '.join(map(str, unresolved_mixed))}")
         if pending_outliers is not None:
             cleaning_actions.append(
                 f"Review {pending_outliers[1]} potential outliers in '{pending_outliers[0]}'")

         # Summary
         if cleaning_actions:
             st.subheader("Remaining Action Items")
             for i, action in enumerate(cleaning_actions, 1):
                 st.write(f"{i}. {action}")
             self.add_insight(f"Identified {len(cleaning_actions)} data quality issues", 3)
         else:
             st.success("✅ Data quality is excellent!")
             self.add_insight("No major data quality issues found", 3)
+    def stage_4_analysis(self):
+        """Stage 4: Advanced Analysis"""
+        st.subheader("🔬 Advanced Analysis")
         numeric_cols = self.column_types['numeric']
+        categorical_cols = self.column_types['categorical']
+        # Relationship analysis
+        if len(numeric_cols) >= 2:
+            st.subheader("Variable Relationships")
+            col1, col2 = st.columns(2)
             with col1:
+                x_var = st.selectbox("X Variable:", numeric_cols)
             with col2:
+                y_var = st.selectbox("Y Variable:",
+                                   [col for col in numeric_cols if col != x_var])
+            # Sample data for performance if dataset is large
+            sample_size = min(5000, len(self.df))
+            sample_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
+            fig = px.scatter(sample_df, x=x_var, y=y_var,
+                           title=f"Relationship: {x_var} vs {y_var}")
+            st.plotly_chart(fig, use_container_width=True)
+            correlation = self.df[x_var].corr(self.df[y_var])
+            st.metric("Correlation", f"{correlation:.3f}")
+            if abs(correlation) > 0.7:
+                strength = "Strong"
+            elif abs(correlation) > 0.3:
+                strength = "Moderate"
+            else:
+                strength = "Weak"
+            direction = "positive" if correlation > 0 else "negative"
+            st.write(f"**Result:** {strength} {direction} correlation")
+            self.add_insight(f"{strength} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)
+        # Group analysis
         if categorical_cols and numeric_cols:
+            st.subheader("Group Analysis")
+            col1, col2 = st.columns(2)
+            with col1:
+                group_var = st.selectbox("Group by:", categorical_cols)
+            with col2:
+                metric_var = st.selectbox("Analyze:", numeric_cols)
+            group_stats = calculate_group_stats(self.df, group_var, metric_var)
+            st.dataframe(group_stats, use_container_width=True)
+            # Sample for visualization if too many groups
+            unique_groups = self.df[group_var].nunique()
+            if unique_groups <= 20:
+                fig = px.box(self.df, x=group_var, y=metric_var,
+                           title=f"{metric_var} by {group_var}")
+                st.plotly_chart(fig, use_container_width=True)
+            else:
+                st.info(f"Too many groups ({unique_groups}) for visualization. Showing statistics only.")
+            best_group = group_stats['mean'].idxmax()
+            best_value = group_stats.loc[best_group, 'mean']
+            self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4)
     def stage_5_summary(self):
         """Stage 5: show the analysis summary and offer report/data exports."""
         st.subheader("📈 Analysis Summary")

         # Key metrics
         insights_slot, quality_slot, done_slot = st.columns(3)
         with insights_slot:
             st.metric("Total Insights", len(self.insights))
         with quality_slot:
             if self.stats['missing_values'] == 0:
                 quality = "High"
             else:
                 quality = "Medium"
             st.metric("Data Quality", quality)
         with done_slot:
             st.metric("Analysis Complete", "✅")

         # Insights summary
         st.subheader("Key Insights")
         for number, entry in enumerate(self.insights, 1):
             st.write(f"{number}. **Stage {entry['stage']}:** {entry['insight']}")

         # Export options
         st.subheader("Export Results")
         export_format = st.selectbox("Choose export format:",
                                      ["Text Report", "Markdown Report", "Python Code", "Cleaned Data"])

         # Text-based exports: (builder, button label, file name, MIME type)
         text_exports = {
             "Text Report": (self.generate_text_report, "Download Text Report",
                             "analysis_report.txt", "text/plain"),
             "Markdown Report": (self.generate_markdown_report, "Download Markdown Report",
                                 "analysis_report.md", "text/markdown"),
             "Python Code": (self.generate_python_code, "Download Python Script",
                             "analysis_script.py", "text/plain"),
         }
         if export_format in text_exports:
             build, button_label, target_name, mime_type = text_exports[export_format]
             payload = build()
             if export_format == "Python Code":
                 # The script is also previewed on the page.
                 st.code(payload, language="python")
             st.download_button(
                 label=button_label,
                 data=payload,
                 file_name=target_name,
                 mime=mime_type
             )
             return

         # Cleaned Data: choose a serialisation format, then export on demand
         data_format = st.selectbox("Choose data format:",
                                    ["CSV", "Excel", "Parquet"])
         if not st.button("Export Data"):
             return
         try:
             if data_format == "CSV":
                 payload = self.df.to_csv(index=False)
                 button_label = "Download CSV"
                 target_name = "cleaned_data.csv"
                 mime_type = "text/csv"
             elif data_format == "Excel":
                 buffer = BytesIO()
                 self.df.to_excel(buffer, index=False)
                 payload = buffer.getvalue()
                 button_label = "Download Excel"
                 target_name = "cleaned_data.xlsx"
                 mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             else:  # Parquet
                 buffer = BytesIO()
                 self.df.to_parquet(buffer, index=False)
                 payload = buffer.getvalue()
                 button_label = "Download Parquet"
                 target_name = "cleaned_data.parquet"
                 mime_type = "application/octet-stream"
             st.download_button(
                 label=button_label,
                 data=payload,
                 file_name=target_name,
                 mime=mime_type
             )
         except Exception as e:
             st.error(f"Error exporting data: {str(e)}")
+    def generate_text_report(self) -> str:
+        """Generate text analysis report"""
+        report = f"""DATA ANALYSIS REPORT
+==================
+Dataset Overview:
+- Rows: {self.stats['shape'][0]:,}
+- Columns: {self.stats['shape'][1]:,}
+- Missing Values: {self.stats['missing_values']:,}
+- Memory Usage: {self.stats['memory_usage']:.1f} MB
+Key Insights:
+"""
+        for insight in self.insights:
+            report += f"\n- Stage {insight['stage']}: {insight['insight']}"
+        report += f"\n\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}"
+        return report
     def generate_markdown_report(self) -> str:
+        """Generate markdown analysis report"""
+        report = f"""# Data Analysis Report
+## Dataset Overview
+* **Rows:** {self.stats['shape'][0]:,}
+* **Columns:** {self.stats['shape'][1]:,}
+* **Missing Values:** {self.stats['missing_values']:,}
+* **Memory Usage:** {self.stats['memory_usage']:.1f} MB
+## Data Types
+```
+{pd.DataFrame(self.stats['dtypes'].items(), columns=['Type', 'Count']).to_markdown()}
+```
+## Key Insights
 """
         # Group insights by stage
+        for stage in range(1, 6):
             stage_insights = [i for i in self.insights if i['stage'] == stage]
             if stage_insights:
+                report += f"\n### Stage {stage}\n"
                 for insight in stage_insights:
+                    report += f"* {insight['insight']}\n"
+        report += f"\n\n*Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*"
         return report
+    def generate_python_code(self) -> str:
+        """Generate reproducible Python code"""
+        code = """import pandas as pd
 import numpy as np
 import plotly.express as px
+from typing import Dict, List, Any
+# Load and prepare data
+df = pd.read_csv('your_data.csv')  # Update with your data source
+# Basic statistics
+def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
+    return {
+        'shape': df.shape,
+        'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
+        'missing_values': int(df.isnull().sum().sum()),
+        'dtypes': df.dtypes.value_counts().to_dict(),
+        'duplicates': int(df.duplicated().sum())
+    }
+stats = calculate_basic_stats(df)
+print("\\nBasic Statistics:")
+print(f"- Shape: {stats['shape']}")
+print(f"- Memory Usage: {stats['memory_usage']:.1f} MB")
+print(f"- Missing Values: {stats['missing_values']}")
+print(f"- Duplicates: {stats['duplicates']}")
+"""
+        # Add data cleaning operations if any were performed
+        if hasattr(self, 'cleaning_history'):
+            code += "\n# Data Cleaning\n"
             for operation in self.cleaning_history:
+                if "missing values" in operation.lower():
+                    code += "# Handle missing values\n"
+                    code += "df = df.fillna(method='ffill')  # Update with your chosen method\n"
                 elif "duplicate" in operation.lower():
+                    code += "# Remove duplicates\n"
+                    code += "df = df.drop_duplicates()\n"
                 elif "outlier" in operation.lower():
+                    code += """# Handle outliers
+def remove_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
+    Q1 = df[column].quantile(0.25)
+    Q3 = df[column].quantile(0.75)
+    IQR = Q3 - Q1
+    return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
+# Apply to numeric columns as needed
+numeric_cols = df.select_dtypes(include=[np.number]).columns
+for col in numeric_cols:
+    df = remove_outliers(df, col)
 """
+        # Add visualization code
+        code += """
+# Visualizations
+def plot_missing_values(df: pd.DataFrame):
+    missing = df.isnull().sum()
+    if missing.sum() > 0:
+        missing = missing[missing > 0]
+        fig = px.bar(x=missing.index, y=missing.values,
+                    title='Missing Values by Column')
+        fig.show()
+def plot_correlations(df: pd.DataFrame):
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    if len(numeric_cols) > 1:
+        corr = df[numeric_cols].corr()
+        fig = px.imshow(corr, title='Correlation Matrix')
+        fig.show()
+# Generate plots
+plot_missing_values(df)
+plot_correlations(df)
 """
+        return code