entropy25 commited on
Commit
86805f4
·
verified ·
1 Parent(s): 1cc5290

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +1261 -483
analyzer.py CHANGED
@@ -5,6 +5,7 @@ import plotly.express as px
5
  import plotly.graph_objects as go
6
  from typing import Dict, List, Any, Optional
7
  import os
 
8
  from dotenv import load_dotenv
9
  from data_handler import *
10
  from io import BytesIO
@@ -12,29 +13,42 @@ from io import BytesIO
12
  # Load environment variables
13
  load_dotenv()
14
 
15
- # Optional AI Integration
 
 
 
16
  try:
17
  import openai
18
  OPENAI_AVAILABLE = True
19
  except ImportError:
20
  OPENAI_AVAILABLE = False
 
21
 
22
  try:
23
  import google.generativeai as genai
24
  GEMINI_AVAILABLE = True
25
  except ImportError:
26
  GEMINI_AVAILABLE = False
 
27
 
28
  class AIAssistant:
29
- """AI-powered analysis assistant"""
30
 
31
  def __init__(self):
32
  self.openai_key = os.getenv('OPENAI_API_KEY')
33
  self.gemini_key = os.getenv('GOOGLE_API_KEY')
34
-
35
- if self.gemini_key and GEMINI_AVAILABLE:
36
- genai.configure(api_key=self.gemini_key)
37
- self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
 
 
 
 
 
 
 
 
38
 
39
  def get_available_models(self) -> List[str]:
40
  """Get list of available AI models"""
@@ -46,648 +60,1412 @@ class AIAssistant:
46
  return models
47
 
48
  def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
49
- """Get AI analysis of insights"""
50
-
51
- # Prepare data summary
52
- summary = f"""
53
- Dataset Summary:
54
- - Shape: {df.shape}
55
- - Columns: {list(df.columns)}
56
- - Data types: {df.dtypes.value_counts().to_dict()}
57
-
58
- Key Insights Found:
59
- """
60
-
61
- for insight in insights:
62
- summary += f"\n- {insight['insight']}"
63
-
64
- prompt = f"""
65
- As a senior data scientist, analyze this dataset and provide:
66
 
67
- 1. Business implications of the findings
68
- 2. Potential opportunities or risks
69
- 3. Recommendations for decision-making
70
- 4. Suggestions for further analysis
71
-
72
- {summary}
73
-
74
- Provide actionable insights in a professional format.
75
- """
76
 
77
  try:
 
 
 
 
78
  if model == "Google Gemini" and hasattr(self, 'gemini_model'):
79
  response = self.gemini_model.generate_content(prompt)
80
- return response.text
81
- elif model == "OpenAI GPT" and self.openai_key:
 
82
  client = openai.OpenAI(api_key=self.openai_key)
83
  response = client.chat.completions.create(
84
  model="gpt-3.5-turbo",
85
- messages=[{"role": "user", "content": prompt}]
 
 
86
  )
87
- return response.choices[0].message.content
 
88
  else:
89
- return "AI analysis not available. Please configure API keys."
 
90
  except Exception as e:
91
- return f"AI Analysis Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  class DataAnalysisWorkflow:
94
- """Optimized data analysis workflow with caching and pagination"""
95
 
96
  def __init__(self, df: pd.DataFrame):
97
  self.df = df
 
98
  self.stats = calculate_basic_stats(df)
99
  self.column_types = get_column_types(df)
100
  self.insights = []
101
- self.page_size = 1000 # For pagination
 
102
 
103
- def add_insight(self, insight: str, stage: int):
104
- """Add insight to analysis report"""
 
 
 
 
 
 
105
  self.insights.append({
106
  'stage': stage,
107
  'insight': insight,
 
108
  'timestamp': pd.Timestamp.now()
109
  })
110
 
111
  def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
112
- """Get paginated data for display"""
113
- start_idx = page * self.page_size
114
- end_idx = start_idx + self.page_size
115
- return self.df.iloc[start_idx:end_idx]
 
 
 
 
116
 
117
  def stage_1_overview(self):
118
- """Stage 1: Data Overview with caching"""
119
  st.subheader("📊 Data Overview")
120
 
121
- # Data Quality Score
 
 
 
 
 
 
 
 
 
 
122
  quality_metrics = calculate_data_quality_score(self.df)
 
123
  col1, col2, col3, col4 = st.columns(4)
124
  with col1:
125
- st.metric("Rows", f"{self.stats['shape'][0]:,}")
126
  with col2:
127
- st.metric("Columns", f"{self.stats['shape'][1]:,}")
128
  with col3:
129
- st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100")
 
 
130
  with col4:
131
- st.metric("Grade", quality_metrics['grade'])
 
132
 
 
133
  if quality_metrics['issues']:
134
- st.warning("Quality Issues Found:")
135
  for issue in quality_metrics['issues']:
136
  st.write(f"• {issue}")
137
 
138
- # Memory Usage and Optimization
139
- st.subheader("Memory Analysis")
 
 
 
 
 
140
  memory_opt = calculate_memory_optimization(self.df)
141
- col1, col2 = st.columns(2)
 
142
  with col1:
143
  st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
144
  with col2:
145
  if memory_opt['potential_savings_mb'] > 0:
146
  st.metric("Potential Savings",
147
  f"{memory_opt['potential_savings_mb']:.1f} MB",
148
- f"{memory_opt['potential_savings_pct']:.1f}%")
149
-
150
- if st.button("Show Optimization Details"):
151
- st.dataframe(pd.DataFrame(memory_opt['suggestions']))
 
 
 
 
 
152
 
153
- # Column Cardinality Analysis
154
- st.subheader("Column Cardinality Analysis")
155
  cardinality_df = calculate_column_cardinality(self.df)
156
 
157
- # Filter options
158
- col_types = cardinality_df['Type'].unique()
159
- selected_types = st.multiselect("Filter by Column Type",
160
- col_types,
161
- default=col_types)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
- filtered_df = cardinality_df[cardinality_df['Type'].isin(selected_types)]
164
- st.dataframe(filtered_df, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
- # Highlight important findings
167
- id_cols = filtered_df[filtered_df['Type'] == 'Unique Identifier']['Column'].tolist()
168
  if id_cols:
169
- st.info(f"📌 Potential ID columns found: {', '.join(id_cols)}")
170
-
171
- const_cols = filtered_df[filtered_df['Type'] == 'Constant']['Column'].tolist()
172
  if const_cols:
173
- st.warning(f"⚠️ Constant columns found: {', '.join(const_cols)}")
 
 
 
 
 
 
 
 
 
174
 
175
- # Data types visualization
176
- if self.stats['dtypes']:
177
- st.subheader("Data Types Distribution")
178
- fig = px.pie(values=list(self.stats['dtypes'].values()),
179
- names=list(self.stats['dtypes'].keys()),
180
- title="Data Types")
181
- st.plotly_chart(fig, use_container_width=True)
182
-
183
- # Sample data with pagination
184
- st.subheader("Sample Data")
185
  total_pages = (len(self.df) - 1) // self.page_size + 1
186
 
187
- if total_pages > 1:
188
- page = st.slider("Page", 0, total_pages - 1, 0)
189
- sample_data = self.get_paginated_data(page)
190
- st.write(f"Showing rows {page * self.page_size + 1} to {min((page + 1) * self.page_size, len(self.df))}")
191
- else:
192
- sample_data = self.df.head(10)
 
 
 
 
 
 
193
 
194
- st.dataframe(sample_data, use_container_width=True)
 
 
 
 
195
 
196
- # Missing values analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  missing_df = calculate_missing_data(self.df)
 
198
  if not missing_df.empty:
199
- st.subheader("Missing Values Analysis")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  st.dataframe(missing_df, use_container_width=True)
201
 
202
- worst_column = missing_df.iloc[0]['Column']
203
- worst_percentage = missing_df.iloc[0]['Missing %']
204
- self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
 
 
 
 
 
 
 
 
 
 
 
 
205
  else:
206
- st.success("✅ No missing values found!")
207
- self.add_insight("Dataset has no missing values - excellent data quality", 1)
208
-
209
- # Add insights about data quality and cardinality
210
- if quality_metrics['score'] < 80:
211
- self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)
 
 
 
212
 
 
213
  if memory_opt['potential_savings_pct'] > 20:
214
- self.add_insight(f"Potential memory optimization of {memory_opt['potential_savings_pct']:.1f}% identified", 1)
215
 
216
- if id_cols:
217
- self.add_insight(f"Found {len(id_cols)} potential ID columns", 1)
 
 
 
 
 
 
 
218
 
219
  def stage_2_exploration(self):
220
- """Stage 2: Exploratory Data Analysis with caching"""
221
  st.subheader("🔍 Exploratory Data Analysis")
222
 
 
 
 
 
 
 
 
 
 
223
  numeric_cols = self.column_types['numeric']
224
  categorical_cols = self.column_types['categorical']
225
 
226
- # Numeric analysis
 
 
 
 
227
  if numeric_cols:
228
- st.subheader("Numeric Variables")
229
- selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
230
-
231
- col1, col2 = st.columns(2)
232
- with col1:
233
- fig = px.histogram(self.df, x=selected_numeric,
234
- title=f"Distribution of {selected_numeric}")
235
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- with col2:
238
- fig = px.box(self.df, y=selected_numeric,
239
- title=f"Box Plot of {selected_numeric}")
240
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
241
 
242
- # Statistical summary
243
- st.subheader("Statistical Summary")
244
- summary_stats = self.df[numeric_cols].describe()
245
- st.dataframe(summary_stats, use_container_width=True)
246
-
247
- # Correlation analysis
248
- if len(numeric_cols) > 1:
249
- st.subheader("Correlation Analysis")
250
- corr_matrix = calculate_correlation_matrix(self.df)
251
- if not corr_matrix.empty:
252
- fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
253
- title="Correlation Matrix")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
- # Find highest correlation
257
- corr_values = []
258
- for i in range(len(corr_matrix.columns)):
259
- for j in range(i+1, len(corr_matrix.columns)):
260
- corr_values.append(abs(corr_matrix.iloc[i, j]))
 
 
 
 
 
261
 
262
- if corr_values:
263
- max_corr = max(corr_values)
264
- self.add_insight(f"Maximum correlation coefficient: {max_corr:.3f}", 2)
265
-
266
- # Categorical analysis
267
- if categorical_cols:
268
- st.subheader("Categorical Variables")
269
- selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
- value_counts = get_value_counts(self.df, selected_categorical)
272
- fig = px.bar(x=value_counts.index, y=value_counts.values,
273
- title=f"Top 10 {selected_categorical} Values")
274
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
275
 
276
- total_categories = self.df[selected_categorical].nunique()
277
- self.add_insight(f"Column '{selected_categorical}' has {total_categories} unique categories", 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
  def stage_3_cleaning(self):
280
- """Stage 3: Data Quality Assessment"""
281
- st.subheader("🧹 Data Quality Assessment")
282
 
283
- cleaning_actions = []
284
- cleaning_history = []
 
 
 
 
 
 
285
 
286
- # Missing values handling
287
- if self.stats['missing_values'] > 0:
288
- st.subheader("Missing Values Treatment")
289
- missing_df = calculate_missing_data(self.df)
290
- st.dataframe(missing_df, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
 
292
- col1, col2 = st.columns(2)
 
293
  with col1:
294
- selected_col = st.selectbox("Select column to handle missing values:",
295
- missing_df['Column'].tolist())
296
  with col2:
297
- fill_method = st.selectbox("Choose fill method:",
298
- ["Drop rows", "Mean", "Median", "Mode", "Custom value"])
 
 
 
 
 
 
 
 
 
 
299
 
300
- if st.button("Apply Missing Value Treatment"):
 
 
 
 
 
 
 
 
 
 
 
 
301
  try:
 
 
302
  if fill_method == "Drop rows":
303
  self.df = self.df.dropna(subset=[selected_col])
304
- cleaning_history.append(f"Dropped rows with missing values in {selected_col}")
305
  else:
306
  if fill_method == "Mean":
307
  fill_value = self.df[selected_col].mean()
308
  elif fill_method == "Median":
309
  fill_value = self.df[selected_col].median()
310
  elif fill_method == "Mode":
311
- fill_value = self.df[selected_col].mode()[0]
312
- else: # Custom value
313
- fill_value = st.number_input("Enter custom value:", value=0.0)
 
314
 
315
  self.df[selected_col] = self.df[selected_col].fillna(fill_value)
316
- cleaning_history.append(f"Filled missing values in {selected_col} with {fill_method}")
317
 
318
- st.success("✅ Missing values handled successfully!")
 
 
 
319
  except Exception as e:
320
- st.error(f"Error handling missing values: {str(e)}")
321
-
322
- # Duplicates handling
323
- if self.stats['duplicates'] > 0:
324
- st.subheader("Duplicate Rows")
325
- st.warning(f"Found {self.stats['duplicates']} duplicate rows")
326
-
327
- if st.button("Remove Duplicate Rows"):
328
- original_len = len(self.df)
329
- self.df = self.df.drop_duplicates()
330
- removed = original_len - len(self.df)
331
- cleaning_history.append(f"Removed {removed} duplicate rows")
332
- st.success(f"✅ Removed {removed} duplicate rows")
333
  else:
334
- st.success("✅ No duplicate rows found")
335
-
336
- # Mixed type detection and handling
337
- mixed_types = detect_mixed_types(self.df)
338
- if mixed_types:
339
- st.subheader("Mixed Data Types")
340
- mixed_df = pd.DataFrame(mixed_types)
341
- st.dataframe(mixed_df, use_container_width=True)
342
 
343
- selected_col = st.selectbox("Select column to fix data type:",
344
- [item['column'] for item in mixed_types])
345
 
346
- fix_method = st.selectbox("Choose fix method:",
347
- ["Convert to numeric", "Convert to string"])
 
 
 
348
 
349
- if st.button("Fix Data Type"):
350
  try:
351
- if fix_method == "Convert to numeric":
352
- self.df[selected_col] = pd.to_numeric(self.df[selected_col], errors='coerce')
353
- else:
354
- self.df[selected_col] = self.df[selected_col].astype(str)
355
-
356
- cleaning_history.append(f"Fixed data type for {selected_col} to {fix_method}")
357
- st.success("✅ Data type fixed successfully!")
358
  except Exception as e:
359
- st.error(f"Error fixing data type: {str(e)}")
 
 
 
 
 
 
360
 
361
- # Outlier detection and handling
362
- numeric_cols = self.column_types['numeric']
363
- if numeric_cols:
364
- st.subheader("Outlier Detection")
365
- selected_col = st.selectbox("Select column for outlier detection:", numeric_cols)
366
-
367
- outliers = calculate_outliers(self.df, selected_col)
368
- outlier_count = len(outliers)
369
 
370
- if outlier_count > 0:
371
- st.warning(f"Found {outlier_count} potential outliers in '{selected_col}'")
372
- st.dataframe(outliers[[selected_col]].head(100), use_container_width=True)
 
373
 
374
- treatment_method = st.selectbox("Choose outlier treatment method:",
375
- ["None", "Remove", "Cap at percentiles"])
376
 
377
- if treatment_method != "None" and st.button("Apply Outlier Treatment"):
378
- try:
379
- if treatment_method == "Remove":
380
- self.df = self.df[~self.df.index.isin(outliers.index)]
381
- cleaning_history.append(f"Removed {outlier_count} outliers from {selected_col}")
382
- else: # Cap at percentiles
383
- Q1 = self.df[selected_col].quantile(0.25)
384
- Q3 = self.df[selected_col].quantile(0.75)
385
- IQR = Q3 - Q1
386
- lower_bound = Q1 - 1.5 * IQR
387
- upper_bound = Q3 + 1.5 * IQR
 
 
 
 
 
 
 
 
 
388
 
389
- self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
390
- cleaning_history.append(f"Capped outliers in {selected_col} at percentiles")
391
-
392
- st.success("✅ Outliers handled successfully!")
393
- except Exception as e:
394
- st.error(f"Error handling outliers: {str(e)}")
395
- else:
396
- st.success(f"✅ No outliers detected in '{selected_col}'")
397
-
398
- # Cleaning History
399
- if cleaning_history:
400
- st.subheader("Cleaning Operations History")
401
- for i, operation in enumerate(cleaning_history, 1):
402
- st.write(f"{i}. {operation}")
403
- self.add_insight(f"Performed {len(cleaning_history)} data cleaning operations", 3)
404
-
405
- # Summary
406
- if cleaning_actions:
407
- st.subheader("Remaining Action Items")
408
- for i, action in enumerate(cleaning_actions, 1):
409
- st.write(f"{i}. {action}")
410
- self.add_insight(f"Identified {len(cleaning_actions)} data quality issues", 3)
411
  else:
412
- st.success("✅ Data quality is excellent!")
413
- self.add_insight("No major data quality issues found", 3)
414
 
415
- def stage_4_analysis(self):
416
- """Stage 4: Advanced Analysis"""
417
- st.subheader("🔬 Advanced Analysis")
418
-
419
  numeric_cols = self.column_types['numeric']
420
- categorical_cols = self.column_types['categorical']
421
 
422
- # Relationship analysis
423
- if len(numeric_cols) >= 2:
424
- st.subheader("Variable Relationships")
425
 
426
- col1, col2 = st.columns(2)
427
  with col1:
428
- x_var = st.selectbox("X Variable:", numeric_cols)
429
  with col2:
430
- y_var = st.selectbox("Y Variable:",
431
- [col for col in numeric_cols if col != x_var])
432
-
433
- # Sample data for performance if dataset is large
434
- sample_size = min(5000, len(self.df))
435
- sample_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
 
436
 
437
- fig = px.scatter(sample_df, x=x_var, y=y_var,
438
- title=f"Relationship: {x_var} vs {y_var}")
439
- st.plotly_chart(fig, use_container_width=True)
440
-
441
- correlation = self.df[x_var].corr(self.df[y_var])
442
- st.metric("Correlation", f"{correlation:.3f}")
443
-
444
- if abs(correlation) > 0.7:
445
- strength = "Strong"
446
- elif abs(correlation) > 0.3:
447
- strength = "Moderate"
448
- else:
449
- strength = "Weak"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
451
- direction = "positive" if correlation > 0 else "negative"
452
- st.write(f"**Result:** {strength} {direction} correlation")
453
- self.add_insight(f"{strength} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)
454
-
455
- # Group analysis
456
- if categorical_cols and numeric_cols:
457
- st.subheader("Group Analysis")
458
 
 
459
  col1, col2 = st.columns(2)
460
  with col1:
461
- group_var = st.selectbox("Group by:", categorical_cols)
 
462
  with col2:
463
- metric_var = st.selectbox("Analyze:", numeric_cols)
 
 
 
 
 
464
 
465
- group_stats = calculate_group_stats(self.df, group_var, metric_var)
466
- st.dataframe(group_stats, use_container_width=True)
 
 
 
 
467
 
468
- # Sample for visualization if too many groups
469
- unique_groups = self.df[group_var].nunique()
470
- if unique_groups <= 20:
471
- fig = px.box(self.df, x=group_var, y=metric_var,
472
- title=f"{metric_var} by {group_var}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  st.plotly_chart(fig, use_container_width=True)
474
- else:
475
- st.info(f"Too many groups ({unique_groups}) for visualization. Showing statistics only.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
 
477
- best_group = group_stats['mean'].idxmax()
478
- best_value = group_stats.loc[best_group, 'mean']
479
- self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  def stage_5_summary(self):
482
- """Stage 5: Summary and Export"""
483
- st.subheader("📈 Analysis Summary")
484
 
485
- # Key metrics
486
- col1, col2, col3 = st.columns(3)
 
 
 
 
 
 
 
 
 
487
  with col1:
488
- st.metric("Total Insights", len(self.insights))
489
  with col2:
490
- quality = "High" if self.stats['missing_values'] == 0 else "Medium"
491
- st.metric("Data Quality", quality)
492
  with col3:
493
- st.metric("Analysis Complete", "✅")
494
-
495
- # Insights summary
496
- st.subheader("Key Insights")
497
- for i, insight in enumerate(self.insights, 1):
498
- st.write(f"{i}. **Stage {insight['stage']}:** {insight['insight']}")
499
-
500
- # Export options
501
- st.subheader("Export Results")
502
- export_format = st.selectbox("Choose export format:",
503
- ["Text Report", "Markdown Report", "Python Code", "Cleaned Data"])
504
-
505
- if export_format == "Text Report":
506
- report = self.generate_text_report()
507
- st.download_button(
508
- label="Download Text Report",
509
- data=report,
510
- file_name="analysis_report.txt",
511
- mime="text/plain"
512
- )
513
-
514
- elif export_format == "Markdown Report":
515
- report = self.generate_markdown_report()
516
- st.download_button(
517
- label="Download Markdown Report",
518
- data=report,
519
- file_name="analysis_report.md",
520
- mime="text/markdown"
521
- )
522
-
523
- elif export_format == "Python Code":
524
- code = self.generate_python_code()
525
- st.code(code, language="python")
526
- st.download_button(
527
- label="Download Python Script",
528
- data=code,
529
- file_name="analysis_script.py",
530
- mime="text/plain"
531
- )
532
-
533
- else: # Cleaned Data
534
- # Offer different export formats
535
- data_format = st.selectbox("Choose data format:",
536
- ["CSV", "Excel", "Parquet"])
537
-
538
- if st.button("Export Data"):
539
- try:
540
- if data_format == "CSV":
541
- csv = self.df.to_csv(index=False)
542
- st.download_button(
543
- label="Download CSV",
544
- data=csv,
545
- file_name="cleaned_data.csv",
546
- mime="text/csv"
547
- )
548
- elif data_format == "Excel":
549
- excel_buffer = BytesIO()
550
- self.df.to_excel(excel_buffer, index=False)
551
- excel_data = excel_buffer.getvalue()
552
- st.download_button(
553
- label="Download Excel",
554
- data=excel_data,
555
- file_name="cleaned_data.xlsx",
556
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
557
- )
558
- else: # Parquet
559
- parquet_buffer = BytesIO()
560
- self.df.to_parquet(parquet_buffer, index=False)
561
- parquet_data = parquet_buffer.getvalue()
562
- st.download_button(
563
- label="Download Parquet",
564
- data=parquet_data,
565
- file_name="cleaned_data.parquet",
566
- mime="application/octet-stream"
567
- )
568
- except Exception as e:
569
- st.error(f"Error exporting data: {str(e)}")
570
 
571
- def generate_text_report(self) -> str:
572
- """Generate text analysis report"""
573
- report = f"""DATA ANALYSIS REPORT
574
- ==================
575
-
576
- Dataset Overview:
577
- - Rows: {self.stats['shape'][0]:,}
578
- - Columns: {self.stats['shape'][1]:,}
579
- - Missing Values: {self.stats['missing_values']:,}
580
- - Memory Usage: {self.stats['memory_usage']:.1f} MB
581
-
582
- Key Insights:
583
- """
584
- for insight in self.insights:
585
- report += f"\n- Stage {insight['stage']}: {insight['insight']}"
586
 
587
- report += f"\n\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}"
588
- return report
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
  def generate_markdown_report(self) -> str:
591
- """Generate markdown analysis report"""
592
- report = f"""# Data Analysis Report
593
 
594
- ## Dataset Overview
595
- * **Rows:** {self.stats['shape'][0]:,}
596
- * **Columns:** {self.stats['shape'][1]:,}
597
- * **Missing Values:** {self.stats['missing_values']:,}
598
- * **Memory Usage:** {self.stats['memory_usage']:.1f} MB
599
 
600
- ## Data Types
601
- ```
602
- {pd.DataFrame(self.stats['dtypes'].items(), columns=['Type', 'Count']).to_markdown()}
603
- ```
 
 
 
604
 
605
- ## Key Insights
606
  """
 
 
 
 
 
607
  # Group insights by stage
608
- for stage in range(1, 6):
 
 
609
  stage_insights = [i for i in self.insights if i['stage'] == stage]
610
  if stage_insights:
611
- report += f"\n### Stage {stage}\n"
612
  for insight in stage_insights:
613
- report += f"* {insight['insight']}\n"
 
614
 
615
- report += f"\n\n*Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*"
 
 
 
 
 
616
  return report
617
 
618
- def generate_python_code(self) -> str:
619
- """Generate reproducible Python code"""
620
- code = """import pandas as pd
 
 
 
 
 
 
 
621
  import numpy as np
622
  import plotly.express as px
623
- from typing import Dict, List, Any
624
-
625
- # Load and prepare data
626
- df = pd.read_csv('your_data.csv') # Update with your data source
627
 
628
- # Basic statistics
629
- def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
630
- return {
631
- 'shape': df.shape,
632
- 'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
633
- 'missing_values': int(df.isnull().sum().sum()),
634
- 'dtypes': df.dtypes.value_counts().to_dict(),
635
- 'duplicates': int(df.duplicated().sum())
636
- }
 
 
 
 
 
 
 
637
 
638
- stats = calculate_basic_stats(df)
639
- print("\\nBasic Statistics:")
640
- print(f"- Shape: {stats['shape']}")
641
- print(f"- Memory Usage: {stats['memory_usage']:.1f} MB")
642
- print(f"- Missing Values: {stats['missing_values']}")
643
- print(f"- Duplicates: {stats['duplicates']}")
 
 
 
 
 
 
 
 
644
 
645
- """
646
- # Add data cleaning operations if any were performed
647
- if hasattr(self, 'cleaning_history'):
648
- code += "\n# Data Cleaning\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
649
  for operation in self.cleaning_history:
650
- if "missing values" in operation.lower():
651
- code += "# Handle missing values\n"
652
- code += "df = df.fillna(method='ffill') # Update with your chosen method\n"
653
  elif "duplicate" in operation.lower():
654
- code += "# Remove duplicates\n"
655
- code += "df = df.drop_duplicates()\n"
656
  elif "outlier" in operation.lower():
657
- code += """# Handle outliers
658
- def remove_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
659
- Q1 = df[column].quantile(0.25)
660
- Q3 = df[column].quantile(0.75)
661
- IQR = Q3 - Q1
662
- return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
663
-
664
- # Apply to numeric columns as needed
665
- numeric_cols = df.select_dtypes(include=[np.number]).columns
666
- for col in numeric_cols:
667
- df = remove_outliers(df, col)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
668
  """
669
 
670
- # Add visualization code
671
- code += """
672
- # Visualizations
673
- def plot_missing_values(df: pd.DataFrame):
674
- missing = df.isnull().sum()
675
- if missing.sum() > 0:
676
- missing = missing[missing > 0]
677
- fig = px.bar(x=missing.index, y=missing.values,
678
- title='Missing Values by Column')
679
- fig.show()
 
 
680
 
681
- def plot_correlations(df: pd.DataFrame):
682
- numeric_cols = df.select_dtypes(include=[np.number]).columns
683
- if len(numeric_cols) > 1:
684
- corr = df[numeric_cols].corr()
685
- fig = px.imshow(corr, title='Correlation Matrix')
686
- fig.show()
687
 
688
- # Generate plots
689
- plot_missing_values(df)
690
- plot_correlations(df)
691
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
692
 
693
- return code
 
 
5
  import plotly.graph_objects as go
6
  from typing import Dict, List, Any, Optional
7
  import os
8
+ import logging
9
  from dotenv import load_dotenv
10
  from data_handler import *
11
  from io import BytesIO
 
13
  # Load environment variables
14
  load_dotenv()
15
 
16
+ # Configure logging
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Optional AI Integration with enhanced error handling
20
  try:
21
  import openai
22
  OPENAI_AVAILABLE = True
23
  except ImportError:
24
  OPENAI_AVAILABLE = False
25
+ logger.info("OpenAI not available - install openai package for AI features")
26
 
27
  try:
28
  import google.generativeai as genai
29
  GEMINI_AVAILABLE = True
30
  except ImportError:
31
  GEMINI_AVAILABLE = False
32
+ logger.info("Gemini not available - install google-generativeai package for AI features")
33
 
34
  class AIAssistant:
35
+ """Enhanced AI-powered analysis assistant with better error handling"""
36
 
37
  def __init__(self):
38
  self.openai_key = os.getenv('OPENAI_API_KEY')
39
  self.gemini_key = os.getenv('GOOGLE_API_KEY')
40
+ self.setup_models()
41
+
42
+ def setup_models(self):
43
+ """Initialize AI models with error handling"""
44
+ try:
45
+ if self.gemini_key and GEMINI_AVAILABLE:
46
+ genai.configure(api_key=self.gemini_key)
47
+ self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
48
+ logger.info("Gemini model initialized successfully")
49
+ except Exception as e:
50
+ logger.error(f"Failed to initialize Gemini: {str(e)}")
51
+ self.gemini_key = None
52
 
53
  def get_available_models(self) -> List[str]:
54
  """Get list of available AI models"""
 
60
  return models
61
 
62
  def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
63
+ """Get AI analysis with enhanced error handling and rate limiting"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
+ if not insights:
66
+ return "No insights available for analysis. Please complete the data analysis stages first."
 
 
 
 
 
 
 
67
 
68
  try:
69
+ # Prepare concise data summary
70
+ summary = self._prepare_data_summary(df, insights)
71
+ prompt = self._create_analysis_prompt(summary)
72
+
73
  if model == "Google Gemini" and hasattr(self, 'gemini_model'):
74
  response = self.gemini_model.generate_content(prompt)
75
+ return self._format_ai_response(response.text)
76
+
77
+ elif model == "OpenAI GPT" and self.openai_key and OPENAI_AVAILABLE:
78
  client = openai.OpenAI(api_key=self.openai_key)
79
  response = client.chat.completions.create(
80
  model="gpt-3.5-turbo",
81
+ messages=[{"role": "user", "content": prompt}],
82
+ max_tokens=800,
83
+ temperature=0.7
84
  )
85
+ return self._format_ai_response(response.choices[0].message.content)
86
+
87
  else:
88
+ return "AI analysis not available. Please check your API configuration."
89
+
90
  except Exception as e:
91
+ error_msg = f"AI Analysis Error: {str(e)}"
92
+ logger.error(error_msg)
93
+ return f"❌ {error_msg}\n\n💡 Try checking your API keys or internet connection."
94
+
95
+ def _prepare_data_summary(self, df: pd.DataFrame, insights: List[Dict]) -> str:
96
+ """Prepare concise data summary for AI analysis"""
97
+ summary = f"""Dataset: {df.shape[0]} rows × {df.shape[1]} columns
98
+ Data Types: {dict(df.dtypes.value_counts())}
99
+ Missing Data: {df.isnull().sum().sum()} cells
100
+
101
+ Key Findings:"""
102
+
103
+ for insight in insights[-5:]: # Last 5 insights
104
+ summary += f"\n• {insight['insight']}"
105
+
106
+ return summary
107
+
108
+ def _create_analysis_prompt(self, summary: str) -> str:
109
+ """Create optimized prompt for AI analysis"""
110
+ return f"""As a data scientist, provide a brief analysis focusing on:
111
+
112
+ 1. **Business Impact**: What do these findings mean?
113
+ 2. **Recommendations**: 2-3 actionable next steps
114
+ 3. **Risks**: Potential data quality concerns
115
+
116
+ {summary}
117
+
118
+ Keep response under 300 words and focus on actionable insights."""
119
+
120
+ def _format_ai_response(self, response: str) -> str:
121
+ """Format AI response for better readability"""
122
+ if not response:
123
+ return "No response received from AI model."
124
+
125
+ # Clean up response
126
+ formatted = response.strip()
127
+
128
+ # Add emoji headers if not present
129
+ if "Business Impact" in formatted and "🎯" not in formatted:
130
+ formatted = formatted.replace("Business Impact", "🎯 **Business Impact**")
131
+ if "Recommendations" in formatted and "💡" not in formatted:
132
+ formatted = formatted.replace("Recommendations", "💡 **Recommendations**")
133
+ if "Risks" in formatted and "⚠️" not in formatted:
134
+ formatted = formatted.replace("Risks", "⚠️ **Risks**")
135
+
136
+ return formatted
137
 
138
  class DataAnalysisWorkflow:
139
+ """Enhanced data analysis workflow with improved UX and error handling"""
140
 
141
  def __init__(self, df: pd.DataFrame):
142
  self.df = df
143
+ self.original_df = df.copy() # Keep original for rollback
144
  self.stats = calculate_basic_stats(df)
145
  self.column_types = get_column_types(df)
146
  self.insights = []
147
+ self.page_size = 1000
148
+ self.cleaning_history = []
149
 
150
+ # Validate data on initialization
151
+ is_valid, validation_issues = validate_dataframe(df)
152
+ if not is_valid:
153
+ for issue in validation_issues:
154
+ self.add_insight(f"Data validation issue: {issue}", 0)
155
+
156
+ def add_insight(self, insight: str, stage: int, insight_type: str = "info"):
157
+ """Enhanced insight tracking with types"""
158
  self.insights.append({
159
  'stage': stage,
160
  'insight': insight,
161
+ 'type': insight_type,
162
  'timestamp': pd.Timestamp.now()
163
  })
164
 
165
  def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
166
+ """Get paginated data with validation"""
167
+ try:
168
+ start_idx = page * self.page_size
169
+ end_idx = min(start_idx + self.page_size, len(self.df))
170
+ return self.df.iloc[start_idx:end_idx]
171
+ except Exception as e:
172
+ logger.error(f"Pagination error: {str(e)}")
173
+ return self.df.head(10)
174
 
175
  def stage_1_overview(self):
176
+ """Stage 1: Enhanced Data Overview with better UX"""
177
  st.subheader("📊 Data Overview")
178
 
179
+ # Help section
180
+ with st.expander("ℹ️ Help - Understanding Your Data", expanded=False):
181
+ st.markdown("""
182
+ **This stage provides:**
183
+ - Basic dataset statistics and structure
184
+ - Data quality assessment and scoring
185
+ - Memory usage analysis and optimization suggestions
186
+ - Column type classification and cardinality analysis
187
+ """)
188
+
189
+ # Data Quality Score with enhanced display
190
  quality_metrics = calculate_data_quality_score(self.df)
191
+
192
  col1, col2, col3, col4 = st.columns(4)
193
  with col1:
194
+ st.metric("Rows", f"{self.stats['shape'][0]:,}", help="Total number of records")
195
  with col2:
196
+ st.metric("Columns", f"{self.stats['shape'][1]:,}", help="Total number of features")
197
  with col3:
198
+ score_color = "normal" if quality_metrics['score'] >= 80 else "inverse"
199
+ st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100",
200
+ help="Overall data quality assessment")
201
  with col4:
202
+ grade_emoji = {"A+": "🌟", "A": "✅", "B+": "👍", "B": "👌", "C+": "⚠️", "C": "⚠️", "D": "❌", "F": "💥"}
203
+ st.metric("Grade", f"{grade_emoji.get(quality_metrics['grade'], '❓')} {quality_metrics['grade']}")
204
 
205
+ # Quality Issues and Recommendations
206
  if quality_metrics['issues']:
207
+ st.error("🚨 **Data Quality Issues Found:**")
208
  for issue in quality_metrics['issues']:
209
  st.write(f"• {issue}")
210
 
211
+ if quality_metrics.get('recommendations'):
212
+ st.info("💡 **Recommendations:**")
213
+ for rec in quality_metrics['recommendations']:
214
+ st.write(f"• {rec}")
215
+
216
+ # Memory Analysis with actionable insights
217
+ st.subheader("💾 Memory Analysis")
218
  memory_opt = calculate_memory_optimization(self.df)
219
+
220
+ col1, col2, col3 = st.columns(3)
221
  with col1:
222
  st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
223
  with col2:
224
  if memory_opt['potential_savings_mb'] > 0:
225
  st.metric("Potential Savings",
226
  f"{memory_opt['potential_savings_mb']:.1f} MB",
227
+ f"-{memory_opt['potential_savings_pct']:.1f}%")
228
+ with col3:
229
+ efficiency = 100 - memory_opt['potential_savings_pct']
230
+ st.metric("Memory Efficiency", f"{efficiency:.1f}%")
231
+
232
+ if memory_opt['suggestions']:
233
+ with st.expander("🔧 View Optimization Suggestions", expanded=False):
234
+ st.dataframe(pd.DataFrame(memory_opt['suggestions']), use_container_width=True)
235
+ st.info("💡 Converting object columns to categories can significantly reduce memory usage for repeated values.")
236
 
237
+ # Enhanced Column Analysis
238
+ st.subheader("📋 Column Analysis")
239
  cardinality_df = calculate_column_cardinality(self.df)
240
 
241
+ if not cardinality_df.empty:
242
+ # Interactive filters
243
+ col1, col2 = st.columns(2)
244
+ with col1:
245
+ col_types = cardinality_df['Type'].unique()
246
+ selected_types = st.multiselect("Filter by Cardinality Type",
247
+ col_types,
248
+ default=col_types,
249
+ help="Filter columns by their cardinality classification")
250
+ with col2:
251
+ data_types = cardinality_df['Data Type'].unique()
252
+ selected_data_types = st.multiselect("Filter by Data Type",
253
+ data_types,
254
+ default=data_types,
255
+ help="Filter columns by their pandas data type")
256
+
257
+ # Apply filters
258
+ filtered_df = cardinality_df[
259
+ (cardinality_df['Type'].isin(selected_types)) &
260
+ (cardinality_df['Data Type'].isin(selected_data_types))
261
+ ]
262
+
263
+ st.dataframe(filtered_df, use_container_width=True)
264
+
265
+ # Actionable insights
266
+ self._display_cardinality_insights(filtered_df)
267
+
268
+ # Data Types Visualization
269
+ if self.stats['dtypes']:
270
+ col1, col2 = st.columns(2)
271
+ with col1:
272
+ st.subheader("📊 Data Types Distribution")
273
+ fig = px.pie(values=list(self.stats['dtypes'].values()),
274
+ names=list(self.stats['dtypes'].keys()),
275
+ title="Data Types Distribution")
276
+ fig.update_traces(textposition='inside', textinfo='percent+label')
277
+ st.plotly_chart(fig, use_container_width=True)
278
+
279
+ with col2:
280
+ st.subheader("📈 Column Count by Type")
281
+ fig = px.bar(x=list(self.stats['dtypes'].keys()),
282
+ y=list(self.stats['dtypes'].values()),
283
+ title="Column Count by Data Type")
284
+ st.plotly_chart(fig, use_container_width=True)
285
+
286
+ # Enhanced Sample Data Display
287
+ self._display_sample_data()
288
 
289
+ # Missing Values Analysis
290
+ self._analyze_missing_values()
291
+
292
+ # Record insights
293
+ self._record_stage1_insights(quality_metrics, memory_opt, cardinality_df)
294
+
295
+ def _display_cardinality_insights(self, cardinality_df: pd.DataFrame):
296
+ """Display actionable insights from cardinality analysis"""
297
+ if cardinality_df.empty:
298
+ return
299
+
300
+ # Key findings
301
+ id_cols = cardinality_df[cardinality_df['Type'] == 'Unique Identifier']['Column'].tolist()
302
+ const_cols = cardinality_df[cardinality_df['Type'] == 'Constant']['Column'].tolist()
303
+ low_card_cols = cardinality_df[cardinality_df['Type'].str.contains('Low')]['Column'].tolist()
304
 
 
 
305
  if id_cols:
306
+ st.success(f"🔑 **Potential ID Columns:** {', '.join(id_cols[:3])}" +
307
+ (f" (+{len(id_cols)-3} more)" if len(id_cols) > 3 else ""))
308
+
309
  if const_cols:
310
+ st.warning(f"⚠️ **Constant Columns (consider removing):** {', '.join(const_cols[:3])}" +
311
+ (f" (+{len(const_cols)-3} more)" if len(const_cols) > 3 else ""))
312
+
313
+ if low_card_cols:
314
+ st.info(f"📊 **Good for Grouping/Filtering:** {', '.join(low_card_cols[:3])}" +
315
+ (f" (+{len(low_card_cols)-3} more)" if len(low_card_cols) > 3 else ""))
316
+
317
+ def _display_sample_data(self):
318
+ """Enhanced sample data display with pagination"""
319
+ st.subheader("👀 Sample Data")
320
 
 
 
 
 
 
 
 
 
 
 
321
  total_pages = (len(self.df) - 1) // self.page_size + 1
322
 
323
+ col1, col2, col3 = st.columns([2, 1, 1])
324
+ with col1:
325
+ if total_pages > 1:
326
+ page = st.slider("Page", 0, total_pages - 1, 0,
327
+ help=f"Navigate through {total_pages} pages of data")
328
+ sample_data = self.get_paginated_data(page)
329
+ start_row = page * self.page_size + 1
330
+ end_row = min((page + 1) * self.page_size, len(self.df))
331
+ st.caption(f"Showing rows {start_row:,} to {end_row:,} of {len(self.df):,}")
332
+ else:
333
+ sample_data = self.df.head(20)
334
+ page = 0
335
 
336
+ with col2:
337
+ show_dtypes = st.checkbox("Show Data Types", help="Display column data types")
338
+ with col3:
339
+ max_cols = st.number_input("Max Columns", min_value=5, max_value=50, value=10,
340
+ help="Limit displayed columns for better readability")
341
 
342
+ # Display data with optional type info
343
+ display_df = sample_data.iloc[:, :max_cols]
344
+
345
+ if show_dtypes:
346
+ # Create a summary row with data types
347
+ type_row = pd.DataFrame([display_df.dtypes.astype(str)],
348
+ index=['Data Type'])
349
+ type_row.columns = display_df.columns
350
+
351
+ st.dataframe(type_row, use_container_width=True)
352
+ st.dataframe(display_df, use_container_width=True)
353
+ else:
354
+ st.dataframe(display_df, use_container_width=True)
355
+
356
+ def _analyze_missing_values(self):
357
+ """Enhanced missing values analysis"""
358
  missing_df = calculate_missing_data(self.df)
359
+
360
  if not missing_df.empty:
361
+ st.subheader("🕳️ Missing Values Analysis")
362
+
363
+ # Summary metrics
364
+ total_missing = missing_df['Missing Count'].sum()
365
+ affected_cols = len(missing_df)
366
+
367
+ col1, col2, col3 = st.columns(3)
368
+ with col1:
369
+ st.metric("Total Missing", f"{total_missing:,}")
370
+ with col2:
371
+ st.metric("Affected Columns", affected_cols)
372
+ with col3:
373
+ worst_col_pct = missing_df.iloc[0]['Missing %'] if len(missing_df) > 0 else 0
374
+ st.metric("Worst Column", f"{worst_col_pct:.1f}%")
375
+
376
+ # Detailed table
377
  st.dataframe(missing_df, use_container_width=True)
378
 
379
+ # Visualization for top missing columns
380
+ if len(missing_df) > 1:
381
+ top_missing = missing_df.head(10)
382
+ fig = px.bar(top_missing, x='Column', y='Missing %',
383
+ title="Missing Values by Column",
384
+ color='Missing %',
385
+ color_continuous_scale='Reds')
386
+ fig.update_layout(xaxis_tickangle=-45)
387
+ st.plotly_chart(fig, use_container_width=True)
388
+
389
+ # Actionable recommendations
390
+ high_missing = missing_df[missing_df['Missing %'] > 50]
391
+ if not high_missing.empty:
392
+ st.error(f"⚠️ **Critical:** {len(high_missing)} columns have >50% missing data")
393
+ st.write("Consider removing these columns or investigating data collection issues.")
394
  else:
395
+ st.success("✅ **Excellent!** No missing values found in the dataset")
396
+
397
+ def _record_stage1_insights(self, quality_metrics, memory_opt, cardinality_df):
398
+ """Record insights from stage 1 analysis"""
399
+ # Quality insights
400
+ if quality_metrics['score'] >= 90:
401
+ self.add_insight("Excellent data quality detected", 1, "success")
402
+ elif quality_metrics['score'] < 70:
403
+ self.add_insight(f"Data quality needs attention (Score: {quality_metrics['score']:.1f}/100)", 1, "warning")
404
 
405
+ # Memory insights
406
  if memory_opt['potential_savings_pct'] > 20:
407
+ self.add_insight(f"Significant memory optimization opportunity: {memory_opt['potential_savings_pct']:.1f}%", 1, "info")
408
 
409
+ # Structure insights
410
+ if not cardinality_df.empty:
411
+ id_cols = len(cardinality_df[cardinality_df['Type'] == 'Unique Identifier'])
412
+ const_cols = len(cardinality_df[cardinality_df['Type'] == 'Constant'])
413
+
414
+ if id_cols > 0:
415
+ self.add_insight(f"Found {id_cols} potential identifier column(s)", 1, "info")
416
+ if const_cols > 0:
417
+ self.add_insight(f"Found {const_cols} constant column(s) - consider removal", 1, "warning")
418
 
419
  def stage_2_exploration(self):
420
+ """Stage 2: Enhanced Exploratory Data Analysis"""
421
  st.subheader("🔍 Exploratory Data Analysis")
422
 
423
+ with st.expander("ℹ️ Help - Exploratory Analysis", expanded=False):
424
+ st.markdown("""
425
+ **This stage helps you:**
426
+ - Understand distributions of your variables
427
+ - Identify patterns and relationships
428
+ - Spot potential anomalies or interesting features
429
+ - Guide further analysis decisions
430
+ """)
431
+
432
  numeric_cols = self.column_types['numeric']
433
  categorical_cols = self.column_types['categorical']
434
 
435
+ if not numeric_cols and not categorical_cols:
436
+ st.warning("⚠️ No suitable columns found for analysis. Please check your data types.")
437
+ return
438
+
439
+ # Enhanced Numeric Analysis
440
  if numeric_cols:
441
+ self._analyze_numeric_variables(numeric_cols)
442
+
443
+ # Enhanced Categorical Analysis
444
+ if categorical_cols:
445
+ self._analyze_categorical_variables(categorical_cols)
446
+
447
+ # Relationship Analysis
448
+ self._analyze_relationships(numeric_cols, categorical_cols)
449
+
450
+ def _analyze_numeric_variables(self, numeric_cols: List[str]):
451
+ """Enhanced numeric variable analysis"""
452
+ st.subheader("🔢 Numeric Variables Analysis")
453
+
454
+ col1, col2 = st.columns([1, 1])
455
+ with col1:
456
+ selected_numeric = st.selectbox("Select numeric column:", numeric_cols,
457
+ help="Choose a numeric column to analyze its distribution")
458
+ with col2:
459
+ chart_type = st.selectbox("Chart type:", ["Histogram", "Box Plot", "Violin Plot", "Q-Q Plot"])
460
+
461
+ if selected_numeric:
462
+ # Statistics summary
463
+ stats_dict = calculate_numeric_stats(self.df, selected_numeric)
464
 
465
+ if stats_dict:
466
+ col1, col2, col3, col4 = st.columns(4)
467
+ with col1:
468
+ st.metric("Mean", f"{stats_dict['mean']:.2f}")
469
+ with col2:
470
+ st.metric("Median", f"{stats_dict['median']:.2f}")
471
+ with col3:
472
+ st.metric("Std Dev", f"{stats_dict['std']:.2f}")
473
+ with col4:
474
+ skew_interpretation = "Right-skewed" if stats_dict['skewness'] > 0.5 else "Left-skewed" if stats_dict['skewness'] < -0.5 else "Symmetric"
475
+ st.metric("Skewness", f"{stats_dict['skewness']:.2f}", help=skew_interpretation)
476
 
477
+ # Enhanced visualizations
478
+ try:
479
+ col1, col2 = st.columns(2)
480
+
481
+ with col1:
482
+ if chart_type == "Histogram":
483
+ fig = px.histogram(self.df, x=selected_numeric,
484
+ title=f"Distribution of {selected_numeric}",
485
+ marginal="rug")
486
+ elif chart_type == "Box Plot":
487
+ fig = px.box(self.df, y=selected_numeric,
488
+ title=f"Box Plot of {selected_numeric}")
489
+ elif chart_type == "Violin Plot":
490
+ fig = px.violin(self.df, y=selected_numeric,
491
+ title=f"Violin Plot of {selected_numeric}")
492
+ else: # Q-Q Plot
493
+ from scipy import stats
494
+ qq_data = stats.probplot(self.df[selected_numeric].dropna(), dist="norm")
495
+ fig = go.Figure()
496
+ fig.add_scatter(x=qq_data[0][0], y=qq_data[0][1], mode='markers',
497
+ name='Data Points')
498
+ fig.add_scatter(x=qq_data[0][0], y=qq_data[1][1] + qq_data[1][0] * qq_data[0][0],
499
+ mode='lines', name='Normal Distribution')
500
+ fig.update_layout(title=f"Q-Q Plot of {selected_numeric}",
501
+ xaxis_title="Theoretical Quantiles",
502
+ yaxis_title="Sample Quantiles")
503
+
504
  st.plotly_chart(fig, use_container_width=True)
505
+
506
+ with col2:
507
+ # Summary statistics table
508
+ if stats_dict:
509
+ summary_data = {
510
+ 'Statistic': ['Count', 'Mean', 'Median', 'Std Dev', 'Min', 'Max', 'Q25', 'Q75', 'Skewness', 'Kurtosis'],
511
+ 'Value': [
512
+ len(self.df[selected_numeric].dropna()),
513
+ f"{stats_dict['mean']:.3f}",
514
+ f"{stats_dict['median']:.3f}",
515
+ f"{stats_dict['std']:.3f}",
516
+ f"{stats_dict['min']:.3f}",
517
+ f"{stats_dict['max']:.3f}",
518
+ f"{stats_dict['q25']:.3f}",
519
+ f"{stats_dict['q75']:.3f}",
520
+ f"{stats_dict['skewness']:.3f}",
521
+ f"{stats_dict['kurtosis']:.3f}"
522
+ ]
523
+ }
524
+ st.dataframe(pd.DataFrame(summary_data), use_container_width=True, hide_index=True)
525
+
526
+ # Distribution insights
527
+ if abs(stats_dict['skewness']) > 1:
528
+ skew_type = "highly right-skewed" if stats_dict['skewness'] > 1 else "highly left-skewed"
529
+ self.add_insight(f"{selected_numeric} is {skew_type} (skewness: {stats_dict['skewness']:.2f})", 2, "info")
530
+
531
+ if stats_dict['kurtosis'] > 3:
532
+ self.add_insight(f"{selected_numeric} has heavy tails (kurtosis: {stats_dict['kurtosis']:.2f})", 2, "info")
533
+
534
+ except Exception as e:
535
+ st.error(f"Error creating visualization: {str(e)}")
536
+ logger.error(f"Visualization error for {selected_numeric}: {str(e)}")
537
+
538
+ def _analyze_categorical_variables(self, categorical_cols: List[str]):
539
+ """Enhanced categorical variable analysis"""
540
+ st.subheader("📝 Categorical Variables Analysis")
541
+
542
+ selected_categorical = st.selectbox("Select categorical column:", categorical_cols,
543
+ help="Choose a categorical column to analyze its distribution")
544
+
545
+ if selected_categorical:
546
+ try:
547
+ # Get value counts with error handling
548
+ value_counts = get_value_counts(self.df, selected_categorical, top_n=20)
549
+
550
+ if value_counts is not None and not value_counts.empty:
551
+ total_categories = self.df[selected_categorical].nunique()
552
 
553
+ # Summary metrics
554
+ col1, col2, col3 = st.columns(3)
555
+ with col1:
556
+ st.metric("Total Categories", total_categories)
557
+ with col2:
558
+ top_category_pct = (value_counts.iloc[0] / len(self.df)) * 100
559
+ st.metric("Top Category", f"{top_category_pct:.1f}%")
560
+ with col3:
561
+ entropy = -sum((value_counts / value_counts.sum()) * np.log2(value_counts / value_counts.sum() + 1e-10))
562
+ st.metric("Diversity (Entropy)", f"{entropy:.2f}")
563
 
564
+ # Visualization
565
+ col1, col2 = st.columns(2)
566
+ with col1:
567
+ fig = px.bar(x=value_counts.index, y=value_counts.values,
568
+ title=f"Top {min(20, len(value_counts))} Values in {selected_categorical}")
569
+ fig.update_layout(xaxis_tickangle=-45)
570
+ st.plotly_chart(fig, use_container_width=True)
571
+
572
+ with col2:
573
+ # Show data table
574
+ display_data = pd.DataFrame({
575
+ 'Category': value_counts.index,
576
+ 'Count': value_counts.values,
577
+ 'Percentage': np.round((value_counts.values / len(self.df)) * 100, 2)
578
+ })
579
+ st.dataframe(display_data, use_container_width=True, hide_index=True)
580
+
581
+ # Insights
582
+ if total_categories > 100:
583
+ self.add_insight(f"{selected_categorical} has very high cardinality ({total_categories} categories)", 2, "warning")
584
+ elif top_category_pct > 90:
585
+ self.add_insight(f"{selected_categorical} is highly imbalanced (top category: {top_category_pct:.1f}%)", 2, "warning")
586
+
587
+ else:
588
+ st.warning(f"⚠️ Unable to analyze column '{selected_categorical}' - it may be empty or have issues")
589
 
590
+ except Exception as e:
591
+ st.error(f"Error analyzing categorical variable: {str(e)}")
592
+ logger.error(f"Categorical analysis error for {selected_categorical}: {str(e)}")
593
+
594
+ def _analyze_relationships(self, numeric_cols: List[str], categorical_cols: List[str]):
595
+ """Enhanced relationship analysis"""
596
+ if len(numeric_cols) >= 2:
597
+ st.subheader("🔗 Variable Relationships")
598
 
599
+ # Correlation matrix
600
+ corr_matrix = calculate_correlation_matrix(self.df)
601
+ if corr_matrix is not None and not corr_matrix.empty:
602
+ col1, col2 = st.columns(2)
603
+
604
+ with col1:
605
+ fig = px.imshow(corr_matrix,
606
+ text_auto=True,
607
+ aspect="auto",
608
+ title="Correlation Matrix",
609
+ color_continuous_scale='RdBu')
610
+ st.plotly_chart(fig, use_container_width=True)
611
+
612
+ with col2:
613
+ # Find strongest correlations
614
+ corr_pairs = []
615
+ for i in range(len(corr_matrix.columns)):
616
+ for j in range(i+1, len(corr_matrix.columns)):
617
+ col1_name = corr_matrix.columns[i]
618
+ col2_name = corr_matrix.columns[j]
619
+ corr_val = corr_matrix.iloc[i, j]
620
+ if not np.isnan(corr_val):
621
+ corr_pairs.append({
622
+ 'Variable 1': col1_name,
623
+ 'Variable 2': col2_name,
624
+ 'Correlation': round(corr_val, 3),
625
+ 'Strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate' if abs(corr_val) > 0.3 else 'Weak'
626
+ })
627
+
628
+ if corr_pairs:
629
+ corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
630
+ st.subheader("🎯 Strongest Correlations")
631
+ st.dataframe(corr_df.head(10), use_container_width=True, hide_index=True)
632
+
633
+ # Record strongest correlation insight
634
+ strongest = corr_df.iloc[0]
635
+ self.add_insight(f"Strongest correlation: {strongest['Variable 1']} ↔ {strongest['Variable 2']} ({strongest['Correlation']})", 2, "info")
636
 
637
  def stage_3_cleaning(self):
638
+ """Stage 3: Enhanced Data Quality Assessment and Cleaning"""
639
+ st.subheader("🧹 Data Quality & Cleaning")
640
 
641
+ with st.expander("ℹ️ Help - Data Cleaning", expanded=False):
642
+ st.markdown("""
643
+ **Available cleaning operations:**
644
+ - **Missing Values:** Fill with statistics, drop rows, or use custom values
645
+ - **Duplicates:** Remove identical rows
646
+ - **Outliers:** Remove or cap extreme values
647
+ - **Data Types:** Convert columns to appropriate types
648
+ """)
649
 
650
+ # Progress tracking
651
+ cleaning_progress = st.empty()
652
+
653
+ # Enhanced Missing Values Handling
654
+ self._handle_missing_values()
655
+
656
+ # Enhanced Duplicates Handling
657
+ self._handle_duplicates()
658
+
659
+ # Enhanced Mixed Types Handling
660
+ self._handle_mixed_types()
661
+
662
+ # Enhanced Outlier Detection
663
+ self._handle_outliers()
664
+
665
+ # Cleaning Summary
666
+ self._display_cleaning_summary()
667
+
668
+ def _handle_missing_values(self):
669
+ """Enhanced missing values handling with preview"""
670
+ missing_df = calculate_missing_data(self.df)
671
+
672
+ if not missing_df.empty:
673
+ st.subheader("🕳️ Missing Values Treatment")
674
 
675
+ # Select column and method
676
+ col1, col2, col3 = st.columns(3)
677
  with col1:
678
+ selected_col = st.selectbox("Column to clean:", missing_df['Column'].tolist())
 
679
  with col2:
680
+ col_dtype = str(self.df[selected_col].dtype)
681
+ if 'int' in col_dtype or 'float' in col_dtype:
682
+ methods = ["Drop rows", "Mean", "Median", "Mode", "Custom value"]
683
+ else:
684
+ methods = ["Drop rows", "Mode", "Custom value"]
685
+ fill_method = st.selectbox("Fill method:", methods)
686
+ with col3:
687
+ if fill_method == "Custom value":
688
+ if 'int' in col_dtype or 'float' in col_dtype:
689
+ custom_value = st.number_input("Custom value:", value=0.0)
690
+ else:
691
+ custom_value = st.text_input("Custom value:", value="Unknown")
692
 
693
+ # Preview impact
694
+ if selected_col:
695
+ missing_count = self.df[selected_col].isnull().sum()
696
+ total_count = len(self.df)
697
+
698
+ if fill_method == "Drop rows":
699
+ remaining_rows = total_count - missing_count
700
+ st.info(f"📊 **Preview:** Will remove {missing_count} rows, keeping {remaining_rows} rows")
701
+ else:
702
+ st.info(f"📊 **Preview:** Will fill {missing_count} missing values")
703
+
704
+ # Apply cleaning
705
+ if st.button("✨ Apply Missing Value Treatment", type="primary"):
706
  try:
707
+ original_missing = self.df[selected_col].isnull().sum()
708
+
709
  if fill_method == "Drop rows":
710
  self.df = self.df.dropna(subset=[selected_col])
711
+ operation = f"Dropped {original_missing} rows with missing values in {selected_col}"
712
  else:
713
  if fill_method == "Mean":
714
  fill_value = self.df[selected_col].mean()
715
  elif fill_method == "Median":
716
  fill_value = self.df[selected_col].median()
717
  elif fill_method == "Mode":
718
+ mode_result = self.df[selected_col].mode()
719
+ fill_value = mode_result.iloc[0] if not mode_result.empty else "Unknown"
720
+ else:
721
+ fill_value = custom_value
722
 
723
  self.df[selected_col] = self.df[selected_col].fillna(fill_value)
724
+ operation = f"Filled {original_missing} missing values in {selected_col} with {fill_method}"
725
 
726
+ self.cleaning_history.append(operation)
727
+ st.success(f"✅ {operation}")
728
+ st.rerun()
729
+
730
  except Exception as e:
731
+ st.error(f"Error applying treatment: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
732
  else:
733
+ st.success("✅ No missing values found!")
734
+
735
+ def _handle_duplicates(self):
736
+ """Enhanced duplicate handling"""
737
+ if self.stats['duplicates'] > 0:
738
+ st.subheader("👥 Duplicate Rows")
 
 
739
 
740
+ duplicate_pct = (self.stats['duplicates'] / len(self.df)) * 100
741
+ st.warning(f"⚠️ Found **{self.stats['duplicates']:,}** duplicate rows ({duplicate_pct:.1f}% of data)")
742
 
743
+ # Show sample duplicates
744
+ duplicates = self.df[self.df.duplicated(keep=False)].head(10)
745
+ if not duplicates.empty:
746
+ st.write("**Sample duplicate rows:**")
747
+ st.dataframe(duplicates, use_container_width=True)
748
 
749
+ if st.button("🗑️ Remove Duplicate Rows", type="primary"):
750
  try:
751
+ original_len = len(self.df)
752
+ self.df = self.df.drop_duplicates()
753
+ removed = original_len - len(self.df)
754
+ operation = f"Removed {removed} duplicate rows"
755
+ self.cleaning_history.append(operation)
756
+ st.success(f" {operation}")
757
+ st.rerun()
758
  except Exception as e:
759
+ st.error(f"Error removing duplicates: {str(e)}")
760
+ else:
761
+ st.success("✅ No duplicate rows found!")
762
+
763
+ def _handle_mixed_types(self):
764
+ """Enhanced mixed types handling"""
765
+ mixed_types = detect_mixed_types(self.df)
766
 
767
+ if mixed_types:
768
+ st.subheader("🔀 Mixed Data Types")
 
 
 
 
 
 
769
 
770
+ for issue in mixed_types:
771
+ col = issue['column']
772
+ problems = issue['problematic_values']
773
+ pct = issue['percentage']
774
 
775
+ st.warning(f"⚠️ **{col}:** {problems} values ({pct:.1f}%) cannot be converted to numeric")
 
776
 
777
+ # Show sample problematic values
778
+ if 'sample_issues' in issue:
779
+ sample_issues = issue['sample_issues']
780
+ st.write("**Sample problematic values:**")
781
+ for value, count in list(sample_issues.items())[:5]:
782
+ st.write(f"• '{value}' ({count} occurrences)")
783
+
784
+ col1, col2 = st.columns(2)
785
+ with col1:
786
+ fix_method = st.selectbox(f"Fix method for {col}:",
787
+ ["Convert to numeric (coerce errors)", "Keep as text"],
788
+ key=f"fix_{col}")
789
+ with col2:
790
+ if st.button(f"🔧 Fix {col}", key=f"apply_{col}"):
791
+ try:
792
+ if fix_method == "Convert to numeric (coerce errors)":
793
+ self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
794
+ operation = f"Converted {col} to numeric (with coercion)"
795
+ else:
796
+ operation = f"Kept {col} as text type"
797
 
798
+ self.cleaning_history.append(operation)
799
+ st.success(f" {operation}")
800
+ st.rerun()
801
+ except Exception as e:
802
+ st.error(f"❌ Error fixing {col}: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  else:
804
+ st.success("✅ No mixed data type issues found!")
 
805
 
806
+ def _handle_outliers(self):
807
+ """Enhanced outlier detection and handling"""
 
 
808
  numeric_cols = self.column_types['numeric']
 
809
 
810
+ if numeric_cols:
811
+ st.subheader("🎯 Outlier Detection")
 
812
 
813
+ col1, col2, col3 = st.columns(3)
814
  with col1:
815
+ selected_col = st.selectbox("Column for outlier detection:", numeric_cols)
816
  with col2:
817
+ detection_method = st.selectbox("Detection method:",
818
+ ["IQR (Interquartile Range)", "Z-Score", "Percentile"])
819
+ with col3:
820
+ if detection_method == "Z-Score":
821
+ threshold = st.number_input("Z-Score threshold:", min_value=1.0, max_value=5.0, value=3.0)
822
+ elif detection_method == "Percentile":
823
+ percentile = st.slider("Outlier percentile:", 0.1, 5.0, 1.0)
824
 
825
+ if selected_col:
826
+ try:
827
+ method_map = {
828
+ "IQR (Interquartile Range)": "iqr",
829
+ "Z-Score": "zscore",
830
+ "Percentile": "percentile"
831
+ }
832
+ outliers = calculate_outliers(self.df, selected_col, method_map[detection_method])
833
+
834
+ if outliers is not None and not outliers.empty:
835
+ outlier_count = len(outliers)
836
+ outlier_pct = (outlier_count / len(self.df)) * 100
837
+
838
+ st.warning(f"⚠️ Found **{outlier_count}** potential outliers ({outlier_pct:.1f}% of data)")
839
+
840
+ # Show outlier statistics
841
+ col1, col2 = st.columns(2)
842
+ with col1:
843
+ outlier_stats = outliers[selected_col].describe()
844
+ st.write("**Outlier Statistics:**")
845
+ st.dataframe(outlier_stats.to_frame().T, use_container_width=True)
846
+
847
+ with col2:
848
+ # Visualization of outliers
849
+ fig = go.Figure()
850
+ fig.add_trace(go.Scatter(
851
+ x=self.df.index,
852
+ y=self.df[selected_col],
853
+ mode='markers',
854
+ name='Normal Data',
855
+ marker=dict(color='blue', opacity=0.6)
856
+ ))
857
+ fig.add_trace(go.Scatter(
858
+ x=outliers.index,
859
+ y=outliers[selected_col],
860
+ mode='markers',
861
+ name='Outliers',
862
+ marker=dict(color='red', size=8)
863
+ ))
864
+ fig.update_layout(title=f"Outliers in {selected_col}")
865
+ st.plotly_chart(fig, use_container_width=True)
866
+
867
+ # Treatment options
868
+ treatment_method = st.selectbox("Outlier treatment:",
869
+ ["None", "Remove outliers", "Cap at bounds"])
870
+
871
+ if treatment_method != "None":
872
+ st.info(f"📊 **Preview:** This will affect {outlier_count} data points")
873
+
874
+ if st.button("🔧 Apply Outlier Treatment", type="primary"):
875
+ try:
876
+ if treatment_method == "Remove outliers":
877
+ self.df = self.df[~self.df.index.isin(outliers.index)]
878
+ operation = f"Removed {outlier_count} outliers from {selected_col}"
879
+ else: # Cap at bounds
880
+ Q1 = self.df[selected_col].quantile(0.25)
881
+ Q3 = self.df[selected_col].quantile(0.75)
882
+ IQR = Q3 - Q1
883
+ lower_bound = Q1 - 1.5 * IQR
884
+ upper_bound = Q3 + 1.5 * IQR
885
+
886
+ self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
887
+ operation = f"Capped outliers in {selected_col} to bounds"
888
+
889
+ self.cleaning_history.append(operation)
890
+ st.success(f"✅ {operation}")
891
+ st.rerun()
892
+
893
+ except Exception as e:
894
+ st.error(f"❌ Error treating outliers: {str(e)}")
895
+ else:
896
+ st.success(f"✅ No outliers detected in '{selected_col}' using {detection_method}")
897
+
898
+ except Exception as e:
899
+ st.error(f"❌ Error detecting outliers: {str(e)}")
900
+
901
+ def _display_cleaning_summary(self):
902
+ """Display comprehensive cleaning summary"""
903
+ if self.cleaning_history:
904
+ st.subheader("📋 Cleaning Operations History")
905
 
906
+ for i, operation in enumerate(self.cleaning_history, 1):
907
+ st.write(f"**{i}.** {operation}")
 
 
 
 
 
908
 
909
+ # Show data changes
910
  col1, col2 = st.columns(2)
911
  with col1:
912
+ st.metric("Original Rows", f"{self.original_df.shape[0]:,}")
913
+ st.metric("Original Memory", f"{self.original_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
914
  with col2:
915
+ st.metric("Current Rows", f"{self.df.shape[0]:,}",
916
+ delta=f"{self.df.shape[0] - self.original_df.shape[0]:,}")
917
+ current_memory = self.df.memory_usage(deep=True).sum() / 1024**2
918
+ original_memory = self.original_df.memory_usage(deep=True).sum() / 1024**2
919
+ st.metric("Current Memory", f"{current_memory:.1f} MB",
920
+ delta=f"{current_memory - original_memory:.1f} MB")
921
 
922
+ # Rollback option
923
+ if st.button("↩️ Reset to Original Data", help="Restore original dataset"):
924
+ self.df = self.original_df.copy()
925
+ self.cleaning_history = []
926
+ st.success("✅ Data reset to original state")
927
+ st.rerun()
928
 
929
+ self.add_insight(f"Applied {len(self.cleaning_history)} cleaning operations", 3, "info")
930
+ else:
931
+ st.info("ℹ️ No cleaning operations performed yet")
932
+
933
+ def stage_4_analysis(self):
934
+ """Stage 4: Enhanced Advanced Analysis"""
935
+ st.subheader("🔬 Advanced Analysis")
936
+
937
+ with st.expander("ℹ️ Help - Advanced Analysis", expanded=False):
938
+ st.markdown("""
939
+ **Advanced analysis includes:**
940
+ - **Relationships:** Correlation and scatter plot analysis
941
+ - **Group Analysis:** Compare metrics across categories
942
+ - **Distribution Analysis:** Statistical testing and comparisons
943
+ """)
944
+
945
+ numeric_cols = self.column_types['numeric']
946
+ categorical_cols = self.column_types['categorical']
947
+
948
+ # Enhanced Relationship Analysis
949
+ if len(numeric_cols) >= 2:
950
+ self._advanced_relationship_analysis(numeric_cols)
951
+
952
+ # Enhanced Group Analysis
953
+ if categorical_cols and numeric_cols:
954
+ self._advanced_group_analysis(categorical_cols, numeric_cols)
955
+
956
+ # Statistical Testing
957
+ if len(numeric_cols) >= 2:
958
+ self._statistical_testing(numeric_cols, categorical_cols)
959
+
960
def _advanced_relationship_analysis(self, numeric_cols: List[str]):
    """Enhanced relationship analysis with statistical insights.

    Lets the user pick an X/Y pair (optionally colored by a categorical
    column), draws an OLS-trend scatter on a sample, and reports the
    Pearson correlation with a strength/direction classification.
    """
    st.subheader("🔗 Variable Relationships")

    col1, col2, col3 = st.columns(3)
    with col1:
        x_var = st.selectbox("X Variable:", numeric_cols)
    with col2:
        y_var = st.selectbox("Y Variable:", [col for col in numeric_cols if col != x_var])
    with col3:
        color_var = st.selectbox("Color by (optional):",
                                 ["None"] + self.column_types['categorical'][:10])

    if not (x_var and y_var):
        return

    try:
        # Downsample large frames so the scatter stays responsive.
        sample_size = min(5000, len(self.df))
        if len(self.df) > sample_size:
            sample_df = self.df.sample(n=sample_size, random_state=42)
            st.info(f"📊 Showing sample of {sample_size:,} points for performance")
        else:
            sample_df = self.df

        # Build the scatter; the color argument is only added when chosen.
        scatter_kwargs = dict(x=x_var, y=y_var,
                              title=f"Relationship: {x_var} vs {y_var}",
                              trendline="ols")
        if color_var != "None":
            scatter_kwargs['color'] = color_var
        fig = px.scatter(sample_df, **scatter_kwargs)
        st.plotly_chart(fig, use_container_width=True)

        # Correlation is computed on the full (non-sampled) data.
        correlation = self.df[x_var].corr(self.df[y_var])

        abs_corr = abs(correlation)
        if abs_corr > 0.7:
            strength = "Strong"
        elif abs_corr > 0.3:
            strength = "Moderate"
        else:
            strength = "Weak"
        direction = "Positive" if correlation > 0 else "Negative"

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Correlation", f"{correlation:.3f}")
        with col2:
            st.metric("Strength", strength)
        with col3:
            st.metric("Direction", direction)

        self.add_insight(f"{strength} {direction.lower()} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4, "info")

    except Exception as e:
        st.error(f"❌ Error in relationship analysis: {str(e)}")
1019
def _advanced_group_analysis(self, categorical_cols: List[str], numeric_cols: List[str]):
    """Enhanced group analysis with statistical comparisons.

    Shows per-group statistics for a chosen metric, box/bar charts when
    the number of groups is small enough, and highlights the best and
    worst performing groups.
    """
    st.subheader("👥 Group Analysis")

    col1, col2 = st.columns(2)
    with col1:
        group_var = st.selectbox("Group by:", categorical_cols)
    with col2:
        metric_var = st.selectbox("Analyze metric:", numeric_cols)

    if not (group_var and metric_var):
        return

    try:
        group_stats = calculate_group_stats(self.df, group_var, metric_var)
        if group_stats is None or group_stats.empty:
            return

        st.dataframe(group_stats, use_container_width=True)

        # Charts only make sense for a manageable number of groups.
        unique_groups = self.df[group_var].nunique()
        if unique_groups <= 20:
            col1, col2 = st.columns(2)
            with col1:
                fig = px.box(self.df, x=group_var, y=metric_var,
                             title=f"{metric_var} by {group_var}")
                fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(fig, use_container_width=True)
            with col2:
                group_means = self.df.groupby(group_var)[metric_var].mean().sort_values(ascending=False)
                fig = px.bar(x=group_means.index, y=group_means.values,
                             title=f"Average {metric_var} by {group_var}")
                fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(fig, use_container_width=True)
        else:
            st.info(f"ℹ️ Too many groups ({unique_groups}) for visualization. Showing statistics only.")

        # Surface the extremes of the per-group means.
        best_idx = group_stats['mean'].idxmax()
        worst_idx = group_stats['mean'].idxmin()
        best_group = group_stats.loc[best_idx, group_var]
        best_value = group_stats['mean'].max()
        worst_group = group_stats.loc[worst_idx, group_var]
        worst_value = group_stats['mean'].min()

        col1, col2 = st.columns(2)
        with col1:
            st.success(f"🏆 **Highest {metric_var}:** {best_group} ({best_value:.2f})")
        with col2:
            st.info(f"📉 **Lowest {metric_var}:** {worst_group} ({worst_value:.2f})")

        self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4, "success")

    except Exception as e:
        st.error(f"❌ Error in group analysis: {str(e)}")
1074
def _statistical_testing(self, numeric_cols: List[str], categorical_cols: List[str]):
    """Enhanced statistical testing capabilities.

    Two tests are offered:
    - Correlation Test: Pearson + Spearman correlation between two
      numeric columns, with significance reported at p < 0.05.
    - Group Comparison: Welch's t-test (2 groups) or one-way ANOVA
      (3+ groups) of a numeric metric across a categorical column.
      This option was previously listed in the selectbox but had no
      implementation, so selecting it silently did nothing.
    """
    if len(numeric_cols) < 2:
        return

    st.subheader("📊 Statistical Testing")

    test_type = st.selectbox("Select test type:",
                             ["Correlation Test", "Group Comparison"])

    if test_type == "Correlation Test":
        col1, col2 = st.columns(2)
        with col1:
            var1 = st.selectbox("Variable 1:", numeric_cols, key="corr_var1")
        with col2:
            var2 = st.selectbox("Variable 2:",
                                [col for col in numeric_cols if col != var1],
                                key="corr_var2")

        if st.button("🧪 Run Correlation Test"):
            try:
                from scipy.stats import pearsonr, spearmanr

                # Drop rows with NaN in either column before testing.
                clean_data = self.df[[var1, var2]].dropna()

                if len(clean_data) < 10:
                    st.warning("⚠️ Insufficient data for reliable correlation testing")
                else:
                    # Pearson (linear) and Spearman (rank-based) correlations.
                    pearson_corr, pearson_p = pearsonr(clean_data[var1], clean_data[var2])
                    spearman_corr, spearman_p = spearmanr(clean_data[var1], clean_data[var2])

                    col1, col2 = st.columns(2)
                    with col1:
                        st.subheader("Pearson Correlation")
                        st.metric("Correlation", f"{pearson_corr:.3f}")
                        st.metric("P-value", f"{pearson_p:.4f}")
                        if pearson_p < 0.05:
                            st.success("✅ Statistically significant")
                        else:
                            st.warning("⚠️ Not statistically significant")

                    with col2:
                        st.subheader("Spearman Correlation")
                        st.metric("Correlation", f"{spearman_corr:.3f}")
                        st.metric("P-value", f"{spearman_p:.4f}")
                        if spearman_p < 0.05:
                            st.success("✅ Statistically significant")
                        else:
                            st.warning("⚠️ Not statistically significant")

                    if pearson_p < 0.05:
                        self.add_insight(f"Significant correlation between {var1} and {var2} (p={pearson_p:.4f})", 4, "success")

            except Exception as e:
                st.error(f"❌ Error in correlation testing: {str(e)}")

    elif test_type == "Group Comparison" and categorical_cols:
        col1, col2 = st.columns(2)
        with col1:
            group_col = st.selectbox("Group column:", categorical_cols, key="grpcmp_group")
        with col2:
            value_col = st.selectbox("Metric column:", numeric_cols, key="grpcmp_value")

        if st.button("🧪 Run Group Comparison"):
            try:
                from scipy.stats import f_oneway, ttest_ind

                clean_data = self.df[[group_col, value_col]].dropna()
                # One sample per group; require 2+ observations each so the
                # tests have a variance estimate to work with.
                samples = [grp[value_col].values
                           for _, grp in clean_data.groupby(group_col)
                           if len(grp) >= 2]

                if len(samples) < 2:
                    st.warning("⚠️ Need at least two groups with 2+ observations each")
                else:
                    if len(samples) == 2:
                        # Welch's t-test: does not assume equal variances.
                        stat, p_value = ttest_ind(samples[0], samples[1], equal_var=False)
                        test_name = "Welch's t-test"
                    else:
                        stat, p_value = f_oneway(*samples)
                        test_name = "One-way ANOVA"

                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric(f"{test_name} statistic", f"{stat:.3f}")
                    with col2:
                        st.metric("P-value", f"{p_value:.4f}")

                    if p_value < 0.05:
                        st.success("✅ Statistically significant group differences")
                        self.add_insight(f"{value_col} differs significantly across {group_col} (p={p_value:.4f})", 4, "success")
                    else:
                        st.warning("⚠️ No statistically significant group differences")

            except Exception as e:
                st.error(f"❌ Error in group comparison: {str(e)}")
1133
def stage_5_summary(self):
    """Stage 5: Enhanced Summary and Export.

    Shows a key-metrics dashboard, the categorized insight list, the
    cleaning history, and the export UI.
    """
    st.subheader("📈 Analysis Summary & Export")

    with st.expander("ℹ️ Help - Summary & Export", expanded=False):
        st.markdown("""
        **This final stage provides:**
        - Complete analysis summary with all insights
        - Multiple export formats for your results
        - Code generation for reproducible analysis
        - Data quality final report
        """)

    # Key-metrics dashboard.
    insight_types = [i.get('type') for i in self.insights]
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("📊 Total Insights", len(self.insights))
    with col2:
        st.metric(" Positive Findings", insight_types.count('success'))
    with col3:
        st.metric("⚠️ Issues Found", insight_types.count('warning'))
    with col4:
        is_clean = self.stats['missing_values'] == 0 and self.stats['duplicates'] == 0
        st.metric("🎯 Final Quality", "High" if is_clean else "Medium")

    # Insights grouped by pipeline stage.
    self._display_categorized_insights()

    # Cleaning history, if any operations were applied.
    if self.cleaning_history:
        st.subheader("🔄 Data Transformations Applied")
        for i, operation in enumerate(self.cleaning_history, 1):
            st.write(f"**{i}.** {operation}")
        st.info(f" Dataset transformed from {self.original_df.shape} to {self.df.shape}")

    # Export UI (reports, cleaned data, code).
    self._display_export_options()
1174
def _display_categorized_insights(self):
    """Display insights organized by category and stage."""
    st.subheader("💡 Key Insights by Stage")

    stage_names = {
        0: "🔍 Validation",
        1: "📊 Overview",
        2: "🔍 Exploration",
        3: "🧹 Cleaning",
        4: "🔬 Analysis"
    }
    # Icon per insight type; unknown types fall back to the info icon.
    type_icons = {"success": "✅", "warning": "⚠️", "error": "❌"}

    for stage in range(5):
        stage_insights = [i for i in self.insights if i['stage'] == stage]
        if not stage_insights:
            continue
        st.write(f"**{stage_names.get(stage, f'Stage {stage}')}**")
        for insight in stage_insights:
            icon = type_icons.get(insight.get('type'), "ℹ️")
            st.write(f" {icon} {insight['insight']}")
1194
def _display_export_options(self):
    """Enhanced export options with previews.

    Renders a download UI for the analysis report (Markdown/HTML/Text),
    the cleaned dataset (CSV/Excel/Parquet), a reproducible Python
    script, or an inline summary dashboard.

    Fixes over the previous version:
    - "Summary Dashboard" was listed in the selectbox but never handled.
    - The HTML report choice downloaded plain text with a ``.html`` name;
      it is now wrapped in a minimal HTML document with the right mime.
    - Dataset downloads nested ``st.download_button`` inside ``st.button``,
      which makes the download control vanish on the next rerun; payloads
      are now prepared up front and offered directly.
    """
    st.subheader("📤 Export Results")

    export_type = st.selectbox("Choose export type:",
                               ["Analysis Report", "Cleaned Dataset", "Python Code", "Summary Dashboard"])

    try:
        if export_type == "Analysis Report":
            format_choice = st.selectbox("Report format:", ["Markdown", "HTML", "Text"])

            # Build the report once, in the requested format.
            if format_choice == "Markdown":
                report = self.generate_markdown_report()
                mime = "text/markdown"
            elif format_choice == "HTML":
                # Minimal HTML wrapper so the downloaded file actually
                # renders as HTML in a browser.
                text_report = self.generate_text_report()
                report = f"<html><body><pre>{text_report}</pre></body></html>"
                mime = "text/html"
            else:
                report = self.generate_text_report()
                mime = "text/plain"

            col1, col2 = st.columns([3, 1])
            with col1:
                preview = report[:500] + "..." if len(report) > 500 else report
                st.code(preview, language="markdown" if format_choice == "Markdown" else None)
            with col2:
                st.download_button(
                    label=f"📄 Download {format_choice} Report",
                    data=report,
                    file_name=f"analysis_report.{format_choice.lower()}",
                    mime=mime
                )

        elif export_type == "Cleaned Dataset":
            format_choice = st.selectbox("Data format:", ["CSV", "Excel", "Parquet"])

            col1, col2 = st.columns([3, 1])
            with col1:
                st.write("**Data Preview:**")
                st.dataframe(self.df.head(), use_container_width=True)
                st.write(f"**Final Shape:** {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns")

            with col2:
                # Payloads are prepared eagerly so the download works on
                # the first click.
                if format_choice == "CSV":
                    csv = self.df.to_csv(index=False)
                    st.download_button("💾 Download CSV", csv, "cleaned_data.csv", "text/csv")

                elif format_choice == "Excel":
                    buffer = BytesIO()
                    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                        self.df.to_excel(writer, sheet_name='Cleaned_Data', index=False)
                        # Companion summary sheet describing the cleaning run.
                        summary_df = pd.DataFrame({
                            'Metric': ['Original Rows', 'Final Rows', 'Columns', 'Cleaning Operations'],
                            'Value': [self.original_df.shape[0], self.df.shape[0],
                                      self.df.shape[1], len(self.cleaning_history)]
                        })
                        summary_df.to_excel(writer, sheet_name='Summary', index=False)
                    st.download_button("💾 Download Excel", buffer.getvalue(),
                                       "cleaned_data.xlsx",
                                       "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")

                elif format_choice == "Parquet":
                    buffer = BytesIO()
                    self.df.to_parquet(buffer, index=False)
                    st.download_button("💾 Download Parquet", buffer.getvalue(),
                                       "cleaned_data.parquet", "application/octet-stream")

        elif export_type == "Python Code":
            code = self.generate_enhanced_python_code()
            st.code(code, language="python")
            st.download_button("💾 Download Python Script", code,
                               "analysis_script.py", "text/plain")

        elif export_type == "Summary Dashboard":
            # Previously listed but unimplemented: show the key numbers inline.
            st.write(f"**Dataset:** {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns")
            st.write(f"**Insights recorded:** {len(self.insights)}")
            st.write(f"**Cleaning operations applied:** {len(self.cleaning_history)}")

    except Exception as e:
        st.error(f"❌ Export error: {str(e)}")
1269
def generate_markdown_report(self) -> str:
    """Generate comprehensive markdown report."""
    quality_grade = calculate_data_quality_score(self.df)['grade']
    timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    report = f"""# 📊 Data Analysis Report

## Executive Summary
- **Dataset Size:** {self.stats['shape'][0]:,} rows × {self.stats['shape'][1]:,} columns
- **Data Quality:** {quality_grade} grade
- **Memory Usage:** {self.stats['memory_usage']:.1f} MB
- **Analysis Completed:** {timestamp}

## 📈 Data Overview
| Metric | Value |
|--------|-------|
| Total Records | {self.stats['shape'][0]:,} |
| Total Features | {self.stats['shape'][1]:,} |
| Missing Values | {self.stats['missing_values']:,} |
| Duplicate Rows | {self.stats['duplicates']:,} |

## 📊 Data Types
"""
    report += "".join(f"- **{dtype}:** {count} columns\n"
                      for dtype, count in self.stats['dtypes'].items())

    report += "\n## 💡 Key Insights\n"

    # Group insights by pipeline stage.
    stage_names = {0: "Validation", 1: "Overview", 2: "Exploration", 3: "Cleaning", 4: "Analysis"}
    type_icons = {"success": "✅", "warning": "⚠️", "error": "❌"}

    for stage in range(5):
        stage_insights = [i for i in self.insights if i['stage'] == stage]
        if not stage_insights:
            continue
        report += f"\n### {stage_names.get(stage, f'Stage {stage}')}\n"
        for insight in stage_insights:
            icon = type_icons.get(insight.get('type'), "ℹ️")
            report += f"- {icon} {insight['insight']}\n"

    if self.cleaning_history:
        report += "\n## 🔄 Data Transformations\n"
        for i, operation in enumerate(self.cleaning_history, 1):
            report += f"{i}. {operation}\n"

    report += "\n---\n*Report generated by Data Analysis Platform*"
    return report
1313
def generate_enhanced_python_code(self) -> str:
    """Generate comprehensive Python code for reproducible analysis.

    Emits a standalone script that reloads the raw data, reports data
    quality, replays the cleaning operations recorded in
    ``self.cleaning_history``, and runs basic correlation and
    missing-value analysis.

    Fixes over the previous version:
    - The generated ``assess_data_quality`` divided by zero on an empty
      dataframe; percentages are now guarded.
    - The generated analysis tail was emitted at module level, so it ran
      (and crashed) even when loading failed; it is now indented inside
      the ``if df is not None:`` guard, as are the replayed cleaning ops.
    - The constant tail is a plain string instead of an f-string full of
      escaped braces.

    Returns:
        str: Complete Python source for the generated script.
    """
    code = f'''"""
Data Analysis Script
Generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
Original Dataset: {self.original_df.shape[0]:,} rows × {self.original_df.shape[1]:,} columns
Final Dataset: {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns
"""

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Load data
def load_and_prepare_data(file_path: str) -> pd.DataFrame:
    """Load and prepare data with error handling"""
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format")

        print(f"Loaded data: {{df.shape[0]:,}} rows × {{df.shape[1]:,}} columns")
        return df
    except Exception as e:
        print(f"Error loading data: {{e}}")
        return None

# Data quality assessment
def assess_data_quality(df: pd.DataFrame) -> dict:
    """Calculate comprehensive data quality metrics"""
    total_cells = len(df) * len(df.columns)
    missing_count = df.isnull().sum().sum()
    duplicate_count = df.duplicated().sum()

    return {{
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'missing_percentage': (missing_count / total_cells) * 100 if total_cells else 0.0,
        'duplicate_percentage': (duplicate_count / len(df)) * 100 if len(df) else 0.0,
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2
    }}

# Main analysis
if __name__ == "__main__":
    # Load your data
    df = load_and_prepare_data('your_data_file.csv')  # Update with your file path

    if df is not None:
        # Data quality assessment
        quality = assess_data_quality(df)
        print("\\n=== DATA QUALITY REPORT ===")
        print(f"Rows: {{quality['total_rows']:,}}")
        print(f"Columns: {{quality['total_columns']:,}}")
        print(f"Missing Data: {{quality['missing_percentage']:.2f}}%")
        print(f"Duplicates: {{quality['duplicate_percentage']:.2f}}%")
        print(f"Memory Usage: {{quality['memory_usage_mb']:.1f}} MB")
'''

    # Replay recorded cleaning operations as script lines inside the guard.
    if self.cleaning_history:
        code += "\n        # Applied cleaning operations:\n"
        for operation in self.cleaning_history:
            if "missing" in operation.lower():
                code += "        # df = df.fillna(method='your_chosen_method')\n"
            elif "duplicate" in operation.lower():
                code += "        df = df.drop_duplicates()\n"
            elif "outlier" in operation.lower():
                code += (
                    "        # Remove outliers using IQR method\n"
                    "        def remove_outliers(df, column):\n"
                    "            Q1 = df[column].quantile(0.25)\n"
                    "            Q3 = df[column].quantile(0.75)\n"
                    "            IQR = Q3 - Q1\n"
                    "            return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]\n"
                    "\n"
                    "        # df = remove_outliers(df, 'your_column')\n"
                )

    # Constant analysis tail: no interpolation needed, so a plain string
    # avoids the brace escaping an f-string would require.
    code += """
        # Basic statistics
        print("\\n=== BASIC STATISTICS ===")
        print(df.describe())

        # Correlation analysis (if numeric columns exist)
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if len(numeric_cols) > 1:
            print("\\n=== CORRELATION MATRIX ===")
            corr_matrix = df[numeric_cols].corr()
            print(corr_matrix)

            # Visualize correlation matrix
            fig = px.imshow(corr_matrix, title='Correlation Matrix')
            fig.show()

        # Missing values visualization
        missing = df.isnull().sum()
        if missing.sum() > 0:
            missing = missing[missing > 0]
            fig = px.bar(x=missing.index, y=missing.values,
                         title='Missing Values by Column')
            fig.show()

        # Final data quality report
        final_quality = assess_data_quality(df)
        print("\\n=== FINAL QUALITY REPORT ===")
        for key, value in final_quality.items():
            print(f"{key}: {value}")
"""

    return code
1431
def generate_text_report(self) -> str:
    """Generate enhanced text analysis report."""
    quality_grade = calculate_data_quality_score(self.df)['grade']
    timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    report = f"""DATA ANALYSIS REPORT
{'='*50}

EXECUTIVE SUMMARY
Dataset: {self.stats['shape'][0]:,} rows × {self.stats['shape'][1]:,} columns
Quality Grade: {quality_grade}
Memory Usage: {self.stats['memory_usage']:.1f} MB
Analysis Date: {timestamp}

DATA OVERVIEW
- Total Records: {self.stats['shape'][0]:,}
- Total Features: {self.stats['shape'][1]:,}
- Missing Values: {self.stats['missing_values']:,}
- Duplicate Rows: {self.stats['duplicates']:,}


DATA TYPES DISTRIBUTION
"""
    for dtype, count in self.stats['dtypes'].items():
        report += f"- {dtype}: {count} columns\n"

    report += "\nKEY INSIGHTS\n" + "=" * 20 + "\n"

    # Insights grouped by pipeline stage, numbered within each stage.
    stage_names = {0: "VALIDATION", 1: "OVERVIEW", 2: "EXPLORATION", 3: "CLEANING", 4: "ANALYSIS"}

    for stage in range(5):
        stage_insights = [i for i in self.insights if i['stage'] == stage]
        if not stage_insights:
            continue
        report += f"\n{stage_names.get(stage, f'STAGE {stage}')}:\n"
        for i, insight in enumerate(stage_insights, 1):
            report += f" {i}. {insight['insight']}\n"

    if self.cleaning_history:
        report += f"\nDATA TRANSFORMATIONS\n{'='*20}\n"
        for i, operation in enumerate(self.cleaning_history, 1):
            report += f"{i}. {operation}\n"

    report += f"\n{'='*50}\nReport generated by Data Analysis Platform\n"
    return report