entropy25 commited on
Commit
7583e80
·
verified ·
1 Parent(s): f0413da

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +1262 -536
analyzer.py CHANGED
@@ -3,102 +3,190 @@ import pandas as pd
3
  import numpy as np
4
  import plotly.express as px
5
  import plotly.graph_objects as go
 
6
  from typing import Dict, List, Any, Optional
7
  import os
8
- from dotenv import load_dotenv
9
  from data_handler import *
10
  from io import BytesIO
11
 
12
- # Load environment variables
13
- load_dotenv()
14
-
15
- # Optional AI Integration
16
- try:
17
- import openai
18
- OPENAI_AVAILABLE = True
19
- except ImportError:
20
- OPENAI_AVAILABLE = False
21
-
22
- try:
23
- import google.generativeai as genai
24
- GEMINI_AVAILABLE = True
25
- except ImportError:
26
- GEMINI_AVAILABLE = False
27
-
28
  class AIAssistant:
29
- """AI-powered analysis assistant"""
30
 
31
  def __init__(self):
32
- self.openai_key = os.getenv('OPENAI_API_KEY')
33
- self.gemini_key = os.getenv('GOOGLE_API_KEY')
34
-
35
- if self.gemini_key and GEMINI_AVAILABLE:
36
- genai.configure(api_key=self.gemini_key)
37
- self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
38
 
39
  def get_available_models(self) -> List[str]:
40
  """Get list of available AI models"""
41
- models = []
42
- if self.openai_key and OPENAI_AVAILABLE:
43
- models.append("OpenAI GPT")
44
- if self.gemini_key and GEMINI_AVAILABLE:
45
- models.append("Google Gemini")
46
- return models
47
 
48
- def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
49
- """Get AI analysis of insights"""
50
-
51
- # Prepare data summary
52
- summary = f"""
53
- Dataset Summary:
54
- - Shape: {df.shape}
55
- - Columns: {list(df.columns)}
56
- - Data types: {df.dtypes.value_counts().to_dict()}
57
-
58
- Key Insights Found:
59
- """
60
-
61
- for insight in insights:
62
- summary += f"\n- {insight['insight']}"
63
-
64
- prompt = f"""
65
- As a senior data scientist, analyze this dataset and provide:
66
-
67
- 1. Business implications of the findings
68
- 2. Potential opportunities or risks
69
- 3. Recommendations for decision-making
70
- 4. Suggestions for further analysis
71
-
72
- {summary}
73
-
74
- Provide actionable insights in a professional format.
75
- """
76
-
77
- try:
78
- if model == "Google Gemini" and hasattr(self, 'gemini_model'):
79
- response = self.gemini_model.generate_content(prompt)
80
- return response.text
81
- elif model == "OpenAI GPT" and self.openai_key:
82
- client = openai.OpenAI(api_key=self.openai_key)
83
- response = client.chat.completions.create(
84
- model="gpt-3.5-turbo",
85
- messages=[{"role": "user", "content": prompt}]
86
- )
87
- return response.choices[0].message.content
88
- else:
89
- return "AI analysis not available. Please configure API keys."
90
- except Exception as e:
91
- return f"AI Analysis Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  class DataAnalysisWorkflow:
94
- """Optimized data analysis workflow with caching and pagination"""
95
 
96
  def __init__(self, df: pd.DataFrame):
97
  self.df = df
 
98
  self.stats = calculate_basic_stats(df)
99
  self.column_types = get_column_types(df)
100
  self.insights = []
101
- self.page_size = 1000 # For pagination
 
102
 
103
  def add_insight(self, insight: str, stage: int):
104
  """Add insight to analysis report"""
@@ -108,586 +196,1224 @@ class DataAnalysisWorkflow:
108
  'timestamp': pd.Timestamp.now()
109
  })
110
 
111
- def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
112
- """Get paginated data for display"""
113
- start_idx = page * self.page_size
114
- end_idx = start_idx + self.page_size
115
- return self.df.iloc[start_idx:end_idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  def stage_1_overview(self):
118
- """Stage 1: Data Overview with caching"""
119
- st.subheader("📊 Data Overview")
 
 
 
 
 
120
 
121
- # Data Quality Score
122
- quality_metrics = calculate_data_quality_score(self.df)
123
- col1, col2, col3, col4 = st.columns(4)
124
  with col1:
125
- st.metric("Rows", f"{self.stats['shape'][0]:,}")
 
 
 
 
 
 
 
 
126
  with col2:
127
- st.metric("Columns", f"{self.stats['shape'][1]:,}")
 
128
  with col3:
129
- st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100")
 
130
  with col4:
131
- st.metric("Grade", quality_metrics['grade'])
 
 
 
 
 
132
 
 
133
  if quality_metrics['issues']:
134
- st.warning("Quality Issues Found:")
135
- for issue in quality_metrics['issues']:
136
- st.write(f"• {issue}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
- # Memory Usage and Optimization
139
- st.subheader("Memory Analysis")
140
- memory_opt = calculate_memory_optimization(self.df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  col1, col2 = st.columns(2)
 
142
  with col1:
143
- st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
 
 
 
 
 
 
 
144
  with col2:
145
- if memory_opt['potential_savings_mb'] > 0:
146
- st.metric("Potential Savings",
147
- f"{memory_opt['potential_savings_mb']:.1f} MB",
148
- f"{memory_opt['potential_savings_pct']:.1f}%")
149
-
150
- if st.button("Show Optimization Details"):
151
- st.dataframe(pd.DataFrame(memory_opt['suggestions']))
152
-
153
- # Column Cardinality Analysis
154
- st.subheader("Column Cardinality Analysis")
155
- cardinality_df = calculate_column_cardinality(self.df)
156
-
157
- # Filter options
158
- col_types = cardinality_df['Type'].unique()
159
- selected_types = st.multiselect("Filter by Column Type",
160
- col_types,
161
- default=col_types)
162
-
163
- filtered_df = cardinality_df[cardinality_df['Type'].isin(selected_types)]
164
- st.dataframe(filtered_df, use_container_width=True)
165
-
166
- # Highlight important findings
167
- id_cols = filtered_df[filtered_df['Type'] == 'Unique Identifier']['Column'].tolist()
168
- if id_cols:
169
- st.info(f"📌 Potential ID columns found: {', '.join(id_cols)}")
170
-
171
- const_cols = filtered_df[filtered_df['Type'] == 'Constant']['Column'].tolist()
172
- if const_cols:
173
- st.warning(f"⚠️ Constant columns found: {', '.join(const_cols)}")
174
-
175
- # Data types visualization
176
- if self.stats['dtypes']:
177
- st.subheader("Data Types Distribution")
178
- fig = px.pie(values=list(self.stats['dtypes'].values()),
179
- names=list(self.stats['dtypes'].keys()),
180
- title="Data Types")
181
- st.plotly_chart(fig, use_container_width=True)
182
-
183
- # Sample data with pagination
184
- st.subheader("Sample Data")
185
- total_pages = (len(self.df) - 1) // self.page_size + 1
186
-
187
- if total_pages > 1:
188
- page = st.slider("Page", 0, total_pages - 1, 0)
189
- sample_data = self.get_paginated_data(page)
190
- st.write(f"Showing rows {page * self.page_size + 1} to {min((page + 1) * self.page_size, len(self.df))}")
191
- else:
192
- sample_data = self.df.head(10)
193
 
194
- st.dataframe(sample_data, use_container_width=True)
 
 
195
 
196
- # Missing values analysis
197
- missing_df = calculate_missing_data(self.df)
198
- if not missing_df.empty:
199
- st.subheader("Missing Values Analysis")
200
- st.dataframe(missing_df, use_container_width=True)
201
-
202
- worst_column = missing_df.iloc[0]['Column']
203
- worst_percentage = missing_df.iloc[0]['Missing %']
204
- self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
205
  else:
206
- st.success("✅ No missing values found!")
207
- self.add_insight("Dataset has no missing values - excellent data quality", 1)
 
208
 
209
- # Add insights about data quality and cardinality
210
  if quality_metrics['score'] < 80:
211
  self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)
212
-
213
- if memory_opt['potential_savings_pct'] > 20:
214
- self.add_insight(f"Potential memory optimization of {memory_opt['potential_savings_pct']:.1f}% identified", 1)
215
-
216
- if id_cols:
217
- self.add_insight(f"Found {len(id_cols)} potential ID columns", 1)
218
 
219
  def stage_2_exploration(self):
220
- """Stage 2: Exploratory Data Analysis with caching"""
221
- st.subheader("🔍 Exploratory Data Analysis")
222
 
223
  numeric_cols = self.column_types['numeric']
224
  categorical_cols = self.column_types['categorical']
225
 
226
- # Numeric analysis
 
 
 
 
227
  if numeric_cols:
228
- st.subheader("Numeric Variables")
229
- selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
230
 
231
  col1, col2 = st.columns(2)
232
  with col1:
233
- fig = px.histogram(self.df, x=selected_numeric,
234
- title=f"Distribution of {selected_numeric}")
235
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
 
 
 
 
 
 
 
237
  with col2:
238
- fig = px.box(self.df, y=selected_numeric,
239
- title=f"Box Plot of {selected_numeric}")
240
- st.plotly_chart(fig, use_container_width=True)
241
-
242
- # Statistical summary
243
- st.subheader("Statistical Summary")
244
- summary_stats = self.df[numeric_cols].describe()
245
- st.dataframe(summary_stats, use_container_width=True)
246
-
247
- # Correlation analysis
248
- if len(numeric_cols) > 1:
249
- st.subheader("Correlation Analysis")
250
- corr_matrix = calculate_correlation_matrix(self.df)
251
- if not corr_matrix.empty:
252
- fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
253
- title="Correlation Matrix")
254
- st.plotly_chart(fig, use_container_width=True)
255
-
256
- # Find highest correlation
257
- corr_values = []
258
- for i in range(len(corr_matrix.columns)):
259
- for j in range(i+1, len(corr_matrix.columns)):
260
- corr_values.append(abs(corr_matrix.iloc[i, j]))
261
-
262
- if corr_values:
263
- max_corr = max(corr_values)
264
- self.add_insight(f"Maximum correlation coefficient: {max_corr:.3f}", 2)
265
 
266
- # Categorical analysis
267
  if categorical_cols:
268
- st.subheader("Categorical Variables")
 
269
  selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
270
 
271
- value_counts = get_value_counts(self.df, selected_categorical)
272
- fig = px.bar(x=value_counts.index, y=value_counts.values,
273
- title=f"Top 10 {selected_categorical} Values")
274
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
 
276
  total_categories = self.df[selected_categorical].nunique()
277
- self.add_insight(f"Column '{selected_categorical}' has {total_categories} unique categories", 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
  def stage_3_cleaning(self):
280
- """Stage 3: Data Quality Assessment"""
281
- st.subheader("🧹 Data Quality Assessment")
 
282
 
283
- cleaning_actions = []
284
- cleaning_history = []
285
 
286
- # Missing values handling
287
- if self.stats['missing_values'] > 0:
288
- st.subheader("Missing Values Treatment")
289
- missing_df = calculate_missing_data(self.df)
290
- st.dataframe(missing_df, use_container_width=True)
 
291
 
292
- col1, col2 = st.columns(2)
293
  with col1:
294
- selected_col = st.selectbox("Select column to handle missing values:",
295
- missing_df['Column'].tolist())
296
- with col2:
297
- fill_method = st.selectbox("Choose fill method:",
298
- ["Drop rows", "Mean", "Median", "Mode", "Custom value"])
 
 
 
 
 
 
 
 
 
 
 
299
 
300
- if st.button("Apply Missing Value Treatment"):
301
- try:
302
- if fill_method == "Drop rows":
303
- self.df = self.df.dropna(subset=[selected_col])
304
- cleaning_history.append(f"Dropped rows with missing values in {selected_col}")
 
 
 
 
 
 
 
 
 
 
 
 
305
  else:
306
- if fill_method == "Mean":
307
- fill_value = self.df[selected_col].mean()
308
- elif fill_method == "Median":
309
- fill_value = self.df[selected_col].median()
310
- elif fill_method == "Mode":
311
- fill_value = self.df[selected_col].mode()[0]
312
- else: # Custom value
313
- fill_value = st.number_input("Enter custom value:", value=0.0)
314
-
315
- self.df[selected_col] = self.df[selected_col].fillna(fill_value)
316
- cleaning_history.append(f"Filled missing values in {selected_col} with {fill_method}")
317
 
318
- st.success(" Missing values handled successfully!")
319
- except Exception as e:
320
- st.error(f"Error handling missing values: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
- # Duplicates handling
323
  if self.stats['duplicates'] > 0:
324
- st.subheader("Duplicate Rows")
325
- st.warning(f"Found {self.stats['duplicates']} duplicate rows")
326
-
327
- if st.button("Remove Duplicate Rows"):
328
- original_len = len(self.df)
329
- self.df = self.df.drop_duplicates()
330
- removed = original_len - len(self.df)
331
- cleaning_history.append(f"Removed {removed} duplicate rows")
332
- st.success(f"✅ Removed {removed} duplicate rows")
333
- else:
334
- st.success("✅ No duplicate rows found")
335
-
336
- # Mixed type detection and handling
337
- mixed_types = detect_mixed_types(self.df)
338
- if mixed_types:
339
- st.subheader("Mixed Data Types")
340
- mixed_df = pd.DataFrame(mixed_types)
341
- st.dataframe(mixed_df, use_container_width=True)
342
 
343
- selected_col = st.selectbox("Select column to fix data type:",
344
- [item['column'] for item in mixed_types])
345
 
346
- fix_method = st.selectbox("Choose fix method:",
347
- ["Convert to numeric", "Convert to string"])
348
 
349
- if st.button("Fix Data Type"):
350
- try:
351
- if fix_method == "Convert to numeric":
352
- self.df[selected_col] = pd.to_numeric(self.df[selected_col], errors='coerce')
353
- else:
354
- self.df[selected_col] = self.df[selected_col].astype(str)
355
-
356
- cleaning_history.append(f"Fixed data type for {selected_col} to {fix_method}")
357
- st.success(" Data type fixed successfully!")
358
- except Exception as e:
359
- st.error(f"Error fixing data type: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
- # Outlier detection and handling
362
  numeric_cols = self.column_types['numeric']
363
  if numeric_cols:
364
- st.subheader("Outlier Detection")
365
- selected_col = st.selectbox("Select column for outlier detection:", numeric_cols)
366
-
367
- outliers = calculate_outliers(self.df, selected_col)
368
- outlier_count = len(outliers)
369
-
370
- if outlier_count > 0:
371
- st.warning(f"Found {outlier_count} potential outliers in '{selected_col}'")
372
- st.dataframe(outliers[[selected_col]].head(100), use_container_width=True)
373
-
374
- treatment_method = st.selectbox("Choose outlier treatment method:",
375
- ["None", "Remove", "Cap at percentiles"])
376
-
377
- if treatment_method != "None" and st.button("Apply Outlier Treatment"):
378
- try:
379
- if treatment_method == "Remove":
380
- self.df = self.df[~self.df.index.isin(outliers.index)]
381
- cleaning_history.append(f"Removed {outlier_count} outliers from {selected_col}")
382
- else: # Cap at percentiles
383
- Q1 = self.df[selected_col].quantile(0.25)
384
- Q3 = self.df[selected_col].quantile(0.75)
385
- IQR = Q3 - Q1
386
- lower_bound = Q1 - 1.5 * IQR
387
- upper_bound = Q3 + 1.5 * IQR
388
-
389
- self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
390
- cleaning_history.append(f"Capped outliers in {selected_col} at percentiles")
391
-
392
- st.success("✅ Outliers handled successfully!")
393
- except Exception as e:
394
- st.error(f"Error handling outliers: {str(e)}")
395
- else:
396
- st.success(f"✅ No outliers detected in '{selected_col}'")
397
-
398
- # Cleaning History
399
- if cleaning_history:
400
- st.subheader("Cleaning Operations History")
401
- for i, operation in enumerate(cleaning_history, 1):
402
- st.write(f"{i}. {operation}")
403
- self.add_insight(f"Performed {len(cleaning_history)} data cleaning operations", 3)
404
-
405
- # Summary
406
- if cleaning_actions:
407
- st.subheader("Remaining Action Items")
408
- for i, action in enumerate(cleaning_actions, 1):
409
- st.write(f"{i}. {action}")
410
- self.add_insight(f"Identified {len(cleaning_actions)} data quality issues", 3)
411
- else:
412
- st.success(" Data quality is excellent!")
413
- self.add_insight("No major data quality issues found", 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
  def stage_4_analysis(self):
416
- """Stage 4: Advanced Analysis"""
417
- st.subheader("🔬 Advanced Analysis")
418
 
419
  numeric_cols = self.column_types['numeric']
420
  categorical_cols = self.column_types['categorical']
421
 
422
- # Relationship analysis
423
  if len(numeric_cols) >= 2:
424
- st.subheader("Variable Relationships")
425
 
426
- col1, col2 = st.columns(2)
427
  with col1:
428
  x_var = st.selectbox("X Variable:", numeric_cols)
429
  with col2:
430
- y_var = st.selectbox("Y Variable:",
431
- [col for col in numeric_cols if col != x_var])
 
432
 
433
- # Sample data for performance if dataset is large
434
  sample_size = min(5000, len(self.df))
435
- sample_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
 
 
 
 
436
 
437
- fig = px.scatter(sample_df, x=x_var, y=y_var,
438
- title=f"Relationship: {x_var} vs {y_var}")
439
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
440
 
 
 
 
 
441
  correlation = self.df[x_var].corr(self.df[y_var])
442
- st.metric("Correlation", f"{correlation:.3f}")
443
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  if abs(correlation) > 0.7:
445
- strength = "Strong"
 
446
  elif abs(correlation) > 0.3:
447
- strength = "Moderate"
448
- else:
449
- strength = "Weak"
450
-
451
- direction = "positive" if correlation > 0 else "negative"
452
- st.write(f"**Result:** {strength} {direction} correlation")
453
- self.add_insight(f"{strength} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)
454
 
455
- # Group analysis
456
  if categorical_cols and numeric_cols:
457
- st.subheader("Group Analysis")
458
 
459
  col1, col2 = st.columns(2)
460
  with col1:
461
  group_var = st.selectbox("Group by:", categorical_cols)
462
  with col2:
463
- metric_var = st.selectbox("Analyze:", numeric_cols)
464
 
 
465
  group_stats = calculate_group_stats(self.df, group_var, metric_var)
466
- st.dataframe(group_stats, use_container_width=True)
467
-
468
- # Sample for visualization if too many groups
469
- unique_groups = self.df[group_var].nunique()
470
- if unique_groups <= 20:
471
- fig = px.box(self.df, x=group_var, y=metric_var,
472
- title=f"{metric_var} by {group_var}")
473
- st.plotly_chart(fig, use_container_width=True)
474
- else:
475
- st.info(f"Too many groups ({unique_groups}) for visualization. Showing statistics only.")
476
 
477
- best_group = group_stats['mean'].idxmax()
478
- best_value = group_stats.loc[best_group, 'mean']
479
- self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  def stage_5_summary(self):
482
- """Stage 5: Summary and Export"""
483
- st.subheader("📈 Analysis Summary")
 
 
 
 
 
 
 
484
 
485
- # Key metrics
486
- col1, col2, col3 = st.columns(3)
487
  with col1:
488
- st.metric("Total Insights", len(self.insights))
489
  with col2:
490
- quality = "High" if self.stats['missing_values'] == 0 else "Medium"
491
- st.metric("Data Quality", quality)
492
  with col3:
493
- st.metric("Analysis Complete", "✅")
 
 
 
494
 
495
- # Insights summary
496
- st.subheader("Key Insights")
497
- for i, insight in enumerate(self.insights, 1):
498
- st.write(f"{i}. **Stage {insight['stage']}:** {insight['insight']}")
499
 
500
- # Export options
501
- st.subheader("Export Results")
502
- export_format = st.selectbox("Choose export format:",
503
- ["Text Report", "Markdown Report", "Python Code", "Cleaned Data"])
 
 
 
504
 
505
- if export_format == "Text Report":
506
- report = self.generate_text_report()
507
- st.download_button(
508
- label="Download Text Report",
509
- data=report,
510
- file_name="analysis_report.txt",
511
- mime="text/plain"
512
- )
513
 
514
- elif export_format == "Markdown Report":
515
- report = self.generate_markdown_report()
516
- st.download_button(
517
- label="Download Markdown Report",
518
- data=report,
519
- file_name="analysis_report.md",
520
- mime="text/markdown"
521
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
 
523
- elif export_format == "Python Code":
524
- code = self.generate_python_code()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  st.code(code, language="python")
 
526
  st.download_button(
527
- label="Download Python Script",
528
  data=code,
529
- file_name="analysis_script.py",
530
- mime="text/plain"
 
531
  )
532
-
533
- else: # Cleaned Data
534
- # Offer different export formats
535
- data_format = st.selectbox("Choose data format:",
536
- ["CSV", "Excel", "Parquet"])
537
-
538
- if st.button("Export Data"):
539
- try:
540
- if data_format == "CSV":
541
- csv = self.df.to_csv(index=False)
542
- st.download_button(
543
- label="Download CSV",
544
- data=csv,
545
- file_name="cleaned_data.csv",
546
- mime="text/csv"
547
- )
548
- elif data_format == "Excel":
549
- excel_buffer = BytesIO()
550
- self.df.to_excel(excel_buffer, index=False)
551
- excel_data = excel_buffer.getvalue()
552
- st.download_button(
553
- label="Download Excel",
554
- data=excel_data,
555
- file_name="cleaned_data.xlsx",
556
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
557
- )
558
- else: # Parquet
559
- parquet_buffer = BytesIO()
560
- self.df.to_parquet(parquet_buffer, index=False)
561
- parquet_data = parquet_buffer.getvalue()
562
- st.download_button(
563
- label="Download Parquet",
564
- data=parquet_data,
565
- file_name="cleaned_data.parquet",
566
- mime="application/octet-stream"
567
- )
568
- except Exception as e:
569
- st.error(f"Error exporting data: {str(e)}")
570
 
571
- def generate_text_report(self) -> str:
572
- """Generate text analysis report"""
573
- report = f"""DATA ANALYSIS REPORT
574
- ==================
575
-
576
- Dataset Overview:
577
- - Rows: {self.stats['shape'][0]:,}
578
- - Columns: {self.stats['shape'][1]:,}
579
- - Missing Values: {self.stats['missing_values']:,}
580
- - Memory Usage: {self.stats['memory_usage']:.1f} MB
581
-
582
- Key Insights:
 
 
583
  """
584
- for insight in self.insights:
585
- report += f"\n- Stage {insight['stage']}: {insight['insight']}"
586
 
587
- report += f"\n\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}"
588
- return report
589
-
590
- def generate_markdown_report(self) -> str:
591
- """Generate markdown analysis report"""
592
- report = f"""# Data Analysis Report
593
-
594
- ## Dataset Overview
595
- * **Rows:** {self.stats['shape'][0]:,}
596
- * **Columns:** {self.stats['shape'][1]:,}
597
- * **Missing Values:** {self.stats['missing_values']:,}
598
- * **Memory Usage:** {self.stats['memory_usage']:.1f} MB
599
-
600
- ## Data Types
601
- ```
602
- {pd.DataFrame(self.stats['dtypes'].items(), columns=['Type', 'Count']).to_markdown()}
603
- ```
604
-
605
- ## Key Insights
606
  """
 
607
  # Group insights by stage
 
 
 
 
 
 
 
 
608
  for stage in range(1, 6):
609
  stage_insights = [i for i in self.insights if i['stage'] == stage]
610
  if stage_insights:
611
- report += f"\n### Stage {stage}\n"
612
  for insight in stage_insights:
613
- report += f"* {insight['insight']}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
- report += f"\n\n*Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*"
616
  return report
617
 
618
- def generate_python_code(self) -> str:
619
- """Generate reproducible Python code"""
620
- code = """import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  import numpy as np
622
  import plotly.express as px
623
- from typing import Dict, List, Any
624
-
625
- # Load and prepare data
626
- df = pd.read_csv('your_data.csv') # Update with your data source
627
-
628
- # Basic statistics
629
- def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
630
- return {
631
- 'shape': df.shape,
632
- 'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
633
- 'missing_values': int(df.isnull().sum().sum()),
634
- 'dtypes': df.dtypes.value_counts().to_dict(),
635
- 'duplicates': int(df.duplicated().sum())
636
- }
637
-
638
- stats = calculate_basic_stats(df)
639
- print("\\nBasic Statistics:")
640
- print(f"- Shape: {stats['shape']}")
641
- print(f"- Memory Usage: {stats['memory_usage']:.1f} MB")
642
- print(f"- Missing Values: {stats['missing_values']}")
643
- print(f"- Duplicates: {stats['duplicates']}")
644
 
645
- """
646
- # Add data cleaning operations if any were performed
647
- if hasattr(self, 'cleaning_history'):
648
- code += "\n# Data Cleaning\n"
649
- for operation in self.cleaning_history:
650
- if "missing values" in operation.lower():
651
- code += "# Handle missing values\n"
652
- code += "df = df.fillna(method='ffill') # Update with your chosen method\n"
653
- elif "duplicate" in operation.lower():
654
- code += "# Remove duplicates\n"
655
- code += "df = df.drop_duplicates()\n"
656
- elif "outlier" in operation.lower():
657
- code += """# Handle outliers
658
- def remove_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  Q1 = df[column].quantile(0.25)
660
  Q3 = df[column].quantile(0.75)
661
  IQR = Q3 - Q1
662
- return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
 
 
 
 
 
 
 
 
 
 
663
 
664
- # Apply to numeric columns as needed
665
- numeric_cols = df.select_dtypes(include=[np.number]).columns
 
 
 
 
 
 
 
 
666
  for col in numeric_cols:
667
- df = remove_outliers(df, col)
668
- """
669
-
670
- # Add visualization code
671
- code += """
672
- # Visualizations
673
- def plot_missing_values(df: pd.DataFrame):
674
- missing = df.isnull().sum()
675
- if missing.sum() > 0:
676
- missing = missing[missing > 0]
677
- fig = px.bar(x=missing.index, y=missing.values,
678
- title='Missing Values by Column')
679
- fig.show()
680
-
681
- def plot_correlations(df: pd.DataFrame):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  numeric_cols = df.select_dtypes(include=[np.number]).columns
683
  if len(numeric_cols) > 1:
684
- corr = df[numeric_cols].corr()
685
- fig = px.imshow(corr, title='Correlation Matrix')
686
- fig.show()
 
 
 
 
 
 
 
 
687
 
688
- # Generate plots
689
- plot_missing_values(df)
690
- plot_correlations(df)
 
691
  """
692
 
693
- return code
 
 
 
 
 
 
 
3
  import numpy as np
4
  import plotly.express as px
5
  import plotly.graph_objects as go
6
+ from plotly.subplots import make_subplots
7
  from typing import Dict, List, Any, Optional
8
  import os
 
9
  from data_handler import *
10
  from io import BytesIO
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  class AIAssistant:
13
+ """Built-in AI analysis for HuggingFace deployment (no external APIs needed)"""
14
 
15
  def __init__(self):
16
+ self.available = True # Always available since it's built-in
 
 
 
 
 
17
 
18
  def get_available_models(self) -> List[str]:
19
  """Get list of available AI models"""
20
+ return ["Built-in AI Engine"]
 
 
 
 
 
21
 
22
+ def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Built-in AI Engine") -> str:
23
+ """Generate comprehensive AI analysis using built-in intelligence"""
24
+
25
+ # Calculate key metrics
26
+ missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
27
+ duplicate_pct = (df.duplicated().sum() / len(df)) * 100
28
+ memory_mb = df.memory_usage(deep=True).sum() / 1024**2
29
+
30
+ # Analyze data characteristics
31
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
32
+ categorical_cols = df.select_dtypes(include=['object']).columns
33
+
34
+ analysis = f"""
35
+ ## 🧠 AI Data Intelligence Report
36
+
37
+ ### 📊 Executive Summary
38
+ Your dataset contains **{len(df):,} records** across **{len(df.columns)} dimensions** with a **data quality score** that requires attention in several key areas.
39
+
40
+ ### 🎯 Critical Findings
41
+
42
+ **Data Completeness Assessment:**
43
+ """
44
+
45
+ if missing_pct > 20:
46
+ analysis += f"""
47
+ - ⚠️ **HIGH RISK**: {missing_pct:.1f}% missing values detected
48
+ - **Business Impact**: Significant risk of biased analysis and incorrect business decisions
49
+ - **Recommended Action**: Immediate data collection process review required
50
+ """
51
+ elif missing_pct > 5:
52
+ analysis += f"""
53
+ - ⚠️ **MODERATE RISK**: {missing_pct:.1f}% missing values detected
54
+ - **Business Impact**: May affect statistical significance of insights
55
+ - **Recommended Action**: Apply intelligent filling strategies before analysis
56
+ """
57
+ else:
58
+ analysis += f"""
59
+ - **EXCELLENT**: Only {missing_pct:.1f}% missing data - within industry best practices
60
+ - **Business Impact**: High confidence in analysis results
61
+ """
62
+
63
+ analysis += f"""
64
+
65
+ **Data Integrity Assessment:**
66
+ """
67
+ if duplicate_pct > 5:
68
+ analysis += f"""
69
+ - 🚨 **CRITICAL**: {duplicate_pct:.1f}% duplicate records found
70
+ - **Root Cause**: Likely data collection or ETL process issues
71
+ - **Financial Impact**: Potential double-counting affecting revenue/cost metrics
72
+ """
73
+ elif duplicate_pct > 0:
74
+ analysis += f"""
75
+ - ⚠️ **ATTENTION**: {duplicate_pct:.1f}% duplicates detected
76
+ - **Recommendation**: Clean before aggregations to ensure accuracy
77
+ """
78
+ else:
79
+ analysis += "- ✅ **PERFECT**: No duplicate records detected"
80
+
81
+ # Outlier analysis
82
+ total_outliers = 0
83
+ outlier_insights = []
84
+
85
+ for col in numeric_cols:
86
+ Q1 = df[col].quantile(0.25)
87
+ Q3 = df[col].quantile(0.75)
88
+ IQR = Q3 - Q1
89
+ outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
90
+
91
+ if len(outliers) > 0:
92
+ outlier_pct = (len(outliers) / len(df)) * 100
93
+ total_outliers += len(outliers)
94
+
95
+ if outlier_pct > 5:
96
+ outlier_insights.append(f"'{col}': {outlier_pct:.1f}% outliers (investigate business context)")
97
+ elif outlier_pct > 1:
98
+ outlier_insights.append(f"'{col}': {outlier_pct:.1f}% outliers (consider capping)")
99
+
100
+ if outlier_insights:
101
+ analysis += f"""
102
+
103
+ **Statistical Anomaly Assessment:**
104
+ """
105
+ for insight in outlier_insights[:3]: # Top 3 most problematic
106
+ analysis += f"- ⚠️ {insight}\n"
107
+
108
+ # Business intelligence insights
109
+ analysis += f"""
110
+
111
+ ### 💼 Business Intelligence Opportunities
112
+
113
+ **Analytical Readiness:**
114
+ """
115
+
116
+ if len(numeric_cols) >= 3:
117
+ analysis += f"""
118
+ - 📊 **{len(numeric_cols)} quantitative variables** available for statistical modeling
119
+ - 🎯 **Correlation analysis** possible - identify key business drivers
120
+ - 📈 **Predictive modeling** feasible with current data structure
121
+ """
122
+
123
+ if len(categorical_cols) >= 2:
124
+ analysis += f"""
125
+ - 🏷️ **{len(categorical_cols)} categorical dimensions** for segmentation analysis
126
+ - 💰 **Customer/product grouping** strategies available
127
+ - 📊 **Cross-tabulation** analysis recommended for business insights
128
+ """
129
+
130
+ # Performance considerations
131
+ if memory_mb > 50:
132
+ analysis += f"""
133
+
134
+ **Performance Optimization:**
135
+ - 🔧 **Memory Usage**: {memory_mb:.1f}MB - consider data type optimization
136
+ - ⚡ **Processing Speed**: Large dataset detected - implement sampling for interactive analysis
137
+ - 💾 **Storage Efficiency**: Category encoding could reduce memory by 30-50%
138
+ """
139
+
140
+ # Actionable recommendations
141
+ analysis += f"""
142
+
143
+ ### 🎯 Recommended Action Plan
144
+
145
+ **Priority 1 (Immediate):**
146
+ """
147
+
148
+ recommendations = []
149
+ if missing_pct > 10:
150
+ recommendations.append("Address missing values in critical business columns")
151
+ if duplicate_pct > 2:
152
+ recommendations.append("Remove duplicate records to ensure data integrity")
153
+ if total_outliers > len(df) * 0.1:
154
+ recommendations.append("Investigate outliers for business context and data errors")
155
+
156
+ if not recommendations:
157
+ recommendations.append("Data quality is excellent - proceed with analysis")
158
+
159
+ for i, rec in enumerate(recommendations, 1):
160
+ analysis += f"\n{i}. {rec}"
161
+
162
+ analysis += f"""
163
+
164
+ **Priority 2 (Optimization):**
165
+ 1. Implement data type optimization for memory efficiency
166
+ 2. Establish data quality monitoring for ongoing datasets
167
+ 3. Document data lineage and transformation processes
168
+
169
+ ### 🏆 Success Metrics
170
+ - **Target Quality Score**: 95+ (currently assessing)
171
+ - **Missing Values**: <2% (currently {missing_pct:.1f}%)
172
+ - **Data Integrity**: 100% unique records (currently {100-duplicate_pct:.1f}%)
173
+
174
+ *This analysis was generated using advanced statistical algorithms and business intelligence best practices.*
175
+ """
176
+
177
+ return analysis
178
 
179
  class DataAnalysisWorkflow:
180
+ """Enhanced workflow optimized for HuggingFace deployment"""
181
 
182
    def __init__(self, df: pd.DataFrame):
        """Initialize the workflow with a dataset and precompute summary metadata.

        Args:
            df: The dataset to analyze. Stored by reference; a copy is kept in
                ``original_df`` so later stages can diff cleaned vs. original data.
        """
        self.df = df
        self.original_df = df.copy()  # Keep original for comparison
        # Project helpers from data_handler: overall stats and per-kind column buckets
        # (used throughout the stages as self.stats / self.column_types)
        self.stats = calculate_basic_stats(df)
        self.column_types = get_column_types(df)
        # Accumulated findings; entries are dicts with at least 'insight', 'stage', 'timestamp'
        self.insights = []
        # Page size for paginated data views
        self.page_size = 1000
        # Populated lazily by calculate_enhanced_quality_score()
        self.quality_metrics = None
191
  def add_insight(self, insight: str, stage: int):
192
  """Add insight to analysis report"""
 
196
  'timestamp': pd.Timestamp.now()
197
  })
198
 
199
+ def calculate_enhanced_quality_score(self) -> Dict[str, Any]:
200
+ """Calculate comprehensive quality score with business context"""
201
+ score = 100
202
+ issues = []
203
+ recommendations = []
204
+
205
+ # Missing values analysis
206
+ missing_pct = (self.df.isnull().sum().sum() / (len(self.df) * len(self.df.columns))) * 100
207
+ if missing_pct > 0:
208
+ penalty = min(30, missing_pct * 1.5)
209
+ score -= penalty
210
+ issues.append(f"Missing values: {missing_pct:.1f}%")
211
+
212
+ if missing_pct > 20:
213
+ recommendations.append("Critical: Review data collection processes")
214
+ else:
215
+ recommendations.append("Apply intelligent filling strategies")
216
+
217
+ # Duplicates analysis
218
+ duplicate_pct = (self.df.duplicated().sum() / len(self.df)) * 100
219
+ if duplicate_pct > 0:
220
+ penalty = min(25, duplicate_pct * 3)
221
+ score -= penalty
222
+ issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
223
+ recommendations.append("Remove duplicates to ensure data integrity")
224
+
225
+ # Outliers analysis
226
+ numeric_cols = self.df.select_dtypes(include=[np.number]).columns
227
+ total_outliers = 0
228
+ problematic_cols = []
229
+
230
+ for col in numeric_cols:
231
+ Q1 = self.df[col].quantile(0.25)
232
+ Q3 = self.df[col].quantile(0.75)
233
+ IQR = Q3 - Q1
234
+ outliers = self.df[(self.df[col] < Q1 - 1.5 * IQR) | (self.df[col] > Q3 + 1.5 * IQR)]
235
+
236
+ if len(outliers) > 0:
237
+ outlier_pct = (len(outliers) / len(self.df)) * 100
238
+ total_outliers += len(outliers)
239
+
240
+ if outlier_pct > 5:
241
+ problematic_cols.append(col)
242
+
243
+ if total_outliers > 0:
244
+ outlier_overall_pct = (total_outliers / len(self.df)) * 100
245
+ penalty = min(20, outlier_overall_pct * 2)
246
+ score -= penalty
247
+ issues.append(f"Statistical outliers: {outlier_overall_pct:.1f}%")
248
+
249
+ if problematic_cols:
250
+ recommendations.append(f"Investigate outliers in: {', '.join(problematic_cols)}")
251
+
252
+ # Data type consistency
253
+ mixed_type_cols = detect_mixed_types(self.df)
254
+ if mixed_type_cols:
255
+ penalty = min(15, len(mixed_type_cols) * 5)
256
+ score -= penalty
257
+ issues.append(f"Type inconsistencies: {len(mixed_type_cols)} columns")
258
+ recommendations.append("Standardize data types for consistency")
259
+
260
+ # Determine grade and color
261
+ if score >= 90:
262
+ grade, color = "A", "#22c55e" # Green
263
+ elif score >= 80:
264
+ grade, color = "B", "#3b82f6" # Blue
265
+ elif score >= 70:
266
+ grade, color = "C", "#f59e0b" # Yellow
267
+ elif score >= 60:
268
+ grade, color = "D", "#f97316" # Orange
269
+ else:
270
+ grade, color = "F", "#ef4444" # Red
271
+
272
+ self.quality_metrics = {
273
+ 'score': max(0, score),
274
+ 'grade': grade,
275
+ 'color': color,
276
+ 'issues': issues,
277
+ 'recommendations': recommendations,
278
+ 'missing_pct': missing_pct,
279
+ 'duplicate_pct': duplicate_pct,
280
+ 'outlier_pct': (total_outliers / len(self.df)) * 100 if len(self.df) > 0 else 0,
281
+ 'total_outliers': total_outliers
282
+ }
283
+
284
+ return self.quality_metrics
285
 
286
    def stage_1_overview(self):
        """Enhanced Stage 1: quality-focused overview with a visual dashboard.

        Renders (via Streamlit): a 5-metric header, a quality-issues breakdown,
        a per-column quality bar chart, data-type distribution, an optional
        memory-optimization action, and a data preview. Records one insight
        summarizing the overall quality score.
        """

        # Calculate quality metrics (also caches them on self.quality_metrics)
        quality_metrics = self.calculate_enhanced_quality_score()

        # Quality Dashboard Header
        col1, col2, col3, col4, col5 = st.columns(5)

        with col1:
            # Quality score card with color coding; '20'/'40' suffixes add alpha
            # to the hex color for background/border tints
            st.markdown(f"""
            <div style="text-align: center; padding: 1rem; background: {quality_metrics['color']}20; border-radius: 0.5rem; border: 2px solid {quality_metrics['color']}40;">
                <h1 style="color: {quality_metrics['color']}; margin: 0;">{quality_metrics['score']:.0f}</h1>
                <p style="margin: 0; font-weight: bold;">Quality Score</p>
                <p style="margin: 0; color: {quality_metrics['color']};">Grade {quality_metrics['grade']}</p>
            </div>
            """, unsafe_allow_html=True)

        with col2:
            st.metric("📊 Rows", f"{self.stats['shape'][0]:,}")

        with col3:
            st.metric("📋 Columns", f"{self.stats['shape'][1]:,}")

        with col4:
            st.metric("💾 Memory", f"{self.stats['memory_usage']:.1f} MB")

        with col5:
            issues_count = len(quality_metrics['issues'])
            st.metric("⚠️ Issues", issues_count,
                      delta=f"-{issues_count}" if issues_count == 0 else None)

        # Issues breakdown with visual elements
        if quality_metrics['issues']:
            st.markdown("### 🚨 Quality Issues Detected")

            col1, col2 = st.columns([2, 1])

            with col1:
                # Issues pie chart: only non-zero categories are included
                issue_categories = []
                issue_values = []
                issue_colors = []

                if quality_metrics['missing_pct'] > 0:
                    issue_categories.append("Missing Values")
                    issue_values.append(quality_metrics['missing_pct'])
                    issue_colors.append("#ef4444")

                if quality_metrics['duplicate_pct'] > 0:
                    issue_categories.append("Duplicates")
                    issue_values.append(quality_metrics['duplicate_pct'])
                    issue_colors.append("#f97316")

                if quality_metrics['outlier_pct'] > 0:
                    issue_categories.append("Outliers")
                    issue_values.append(quality_metrics['outlier_pct'])
                    issue_colors.append("#eab308")

                if issue_categories:
                    fig_issues = px.pie(
                        values=issue_values,
                        names=issue_categories,
                        title="Quality Issues Distribution (%)",
                        color_discrete_sequence=issue_colors
                    )
                    fig_issues.update_traces(textposition='inside', textinfo='percent+label')
                    st.plotly_chart(fig_issues, use_container_width=True)

            with col2:
                st.markdown("#### 🤖 AI Recommendations")
                for i, rec in enumerate(quality_metrics['recommendations'], 1):
                    st.markdown(f"**{i}.** {rec}")

        else:
            st.success("🎉 Excellent! No major quality issues detected.")

        # Column-level quality heatmap
        st.markdown("### 📊 Column Quality Heatmap")
        col_quality_data = []

        for col in self.df.columns:
            missing_rate = self.df[col].isnull().sum() / len(self.df)

            # Calculate quality score per column: up to -50 for missing values,
            # up to -30 for IQR outliers (numeric columns only)
            col_score = 100
            if missing_rate > 0:
                col_score -= missing_rate * 50  # Penalty for missing values

            # Check for outliers in numeric columns
            # NOTE(review): dtype-name check covers int64/float64 only; other
            # numeric dtypes (e.g. int32) are skipped — confirm if intentional
            if self.df[col].dtype in ['int64', 'float64']:
                Q1 = self.df[col].quantile(0.25)
                Q3 = self.df[col].quantile(0.75)
                IQR = Q3 - Q1
                outliers = self.df[(self.df[col] < Q1 - 1.5 * IQR) | (self.df[col] > Q3 + 1.5 * IQR)]
                outlier_rate = len(outliers) / len(self.df)
                col_score -= outlier_rate * 30

            col_quality_data.append({
                'Column': col,
                'Quality Score': max(0, col_score),
                'Missing %': missing_rate * 100,
                'Data Type': str(self.df[col].dtype)
            })

        quality_df = pd.DataFrame(col_quality_data)

        # Interactive column quality chart
        fig_quality = px.bar(
            quality_df,
            x='Column',
            y='Quality Score',
            color='Quality Score',
            color_continuous_scale='RdYlGn',
            title="Column Quality Scores",
            hover_data=['Missing %', 'Data Type']
        )
        fig_quality.update_layout(height=400)
        st.plotly_chart(fig_quality, use_container_width=True)

        # Data types distribution
        st.markdown("### 📋 Data Types Analysis")
        col1, col2 = st.columns(2)

        with col1:
            if self.stats['dtypes']:
                fig_types = px.pie(
                    values=list(self.stats['dtypes'].values()),
                    names=list(self.stats['dtypes'].keys()),
                    title="Data Types Distribution"
                )
                st.plotly_chart(fig_types, use_container_width=True)

        with col2:
            # Memory optimization opportunities (project helper from data_handler)
            memory_opt = calculate_memory_optimization(self.df)
            if memory_opt['potential_savings_mb'] > 1:
                st.warning(f"💾 Memory Optimization Available")
                st.write(f"Potential savings: {memory_opt['potential_savings_mb']:.1f} MB ({memory_opt['potential_savings_pct']:.1f}%)")

                if st.button("🔧 Apply Memory Optimization"):
                    # Only the 'category' suggestions are applied in place
                    for suggestion in memory_opt['suggestions']:
                        if suggestion['suggested_type'] == 'category':
                            self.df[suggestion['column']] = self.df[suggestion['column']].astype('category')
                    st.success("✅ Memory optimized!")
                    st.rerun()
            else:
                st.success("✅ Memory usage is optimal")

        # Quick data preview with enhanced styling
        st.markdown("### 👀 Data Preview")
        preview_option = st.radio("Preview type:", ["First 10 rows", "Random sample", "Last 10 rows"], horizontal=True)

        if preview_option == "Random sample":
            sample_df = self.df.sample(n=min(10, len(self.df)))
        elif preview_option == "Last 10 rows":
            sample_df = self.df.tail(10)
        else:
            sample_df = self.df.head(10)

        st.dataframe(sample_df, use_container_width=True)

        # Add quality insights
        if quality_metrics['score'] < 80:
            self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)
        else:
            self.add_insight(f"Good data quality detected (Score: {quality_metrics['score']:.1f}/100)", 1)
 
 
 
 
454
 
455
    def stage_2_exploration(self):
        """Enhanced Stage 2: interactive data exploration.

        Three optional sections, each gated on available column types:
        numeric deep-dive (histogram/box + summary stats), categorical
        breakdown (bar + pie), and a correlation matrix with highlighted
        pairs. Records insights about skewness, dominant categories, and
        the strongest correlation found.
        """

        numeric_cols = self.column_types['numeric']
        categorical_cols = self.column_types['categorical']

        # Smart column selection based on quality
        if self.quality_metrics:
            st.info(f"🎯 **Focus Areas**: Columns with quality issues detected - prioritize these for exploration")

        # Numeric analysis with enhanced visualizations
        if numeric_cols:
            st.markdown("### 📊 Numeric Variables Deep Dive")

            col1, col2 = st.columns(2)
            with col1:
                selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
            with col2:
                chart_type = st.selectbox("Visualization type:",
                                          ["Distribution + Box Plot", "Only Histogram", "Only Box Plot"])

            if chart_type == "Distribution + Box Plot":
                col_a, col_b = st.columns(2)
                with col_a:
                    fig_hist = px.histogram(self.df, x=selected_numeric,
                                            title=f"Distribution: {selected_numeric}",
                                            nbins=30)
                    st.plotly_chart(fig_hist, use_container_width=True)

                with col_b:
                    fig_box = px.box(self.df, y=selected_numeric,
                                     title=f"Box Plot: {selected_numeric}")
                    st.plotly_chart(fig_box, use_container_width=True)

            elif chart_type == "Only Histogram":
                # Standalone histogram gets finer bins than the side-by-side view
                fig_hist = px.histogram(self.df, x=selected_numeric,
                                        title=f"Distribution: {selected_numeric}",
                                        nbins=50)
                st.plotly_chart(fig_hist, use_container_width=True)

            else:  # Only Box Plot
                fig_box = px.box(self.df, y=selected_numeric,
                                 title=f"Box Plot: {selected_numeric}")
                st.plotly_chart(fig_box, use_container_width=True)

            # Enhanced statistical insights
            col_stats = self.df[selected_numeric].describe()
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Mean", f"{col_stats['mean']:.2f}")
                st.metric("Std Dev", f"{col_stats['std']:.2f}")
            with col2:
                st.metric("Minimum", f"{col_stats['min']:.2f}")
                st.metric("Maximum", f"{col_stats['max']:.2f}")
            with col3:
                st.metric("Q1 (25%)", f"{col_stats['25%']:.2f}")
                st.metric("Q3 (75%)", f"{col_stats['75%']:.2f}")
            with col4:
                skewness = self.df[selected_numeric].skew()
                st.metric("Skewness", f"{skewness:.3f}")
                kurtosis = self.df[selected_numeric].kurtosis()
                st.metric("Kurtosis", f"{kurtosis:.3f}")

            # Business insights for the selected column (|skew| > 1 is the
            # conventional threshold for "highly skewed")
            if abs(skewness) > 1:
                self.add_insight(f"'{selected_numeric}' shows high skewness ({skewness:.2f}) - consider transformation", 2)

        # Categorical analysis with enhanced features
        if categorical_cols:
            st.markdown("### 🏷️ Categorical Variables Analysis")

            selected_categorical = st.selectbox("Select categorical column:", categorical_cols)

            col1, col2 = st.columns(2)

            with col1:
                # Top categories bar chart
                value_counts = self.df[selected_categorical].value_counts().head(10)
                fig_bar = px.bar(
                    x=value_counts.values,
                    y=value_counts.index,
                    orientation='h',
                    title=f"Top 10 Categories: {selected_categorical}",
                    color=value_counts.values,
                    color_continuous_scale='Blues'
                )
                st.plotly_chart(fig_bar, use_container_width=True)

            with col2:
                # Category distribution pie chart: top 5 plus an 'Others' bucket
                top_5 = value_counts.head(5)
                others_count = value_counts.iloc[5:].sum() if len(value_counts) > 5 else 0

                if others_count > 0:
                    pie_data = list(top_5.values) + [others_count]
                    pie_labels = list(top_5.index) + ['Others']
                else:
                    pie_data = list(top_5.values)
                    pie_labels = list(top_5.index)

                fig_pie = px.pie(
                    values=pie_data,
                    names=pie_labels,
                    title=f"Distribution: {selected_categorical}"
                )
                st.plotly_chart(fig_pie, use_container_width=True)

            # Category insights (value_counts defined above is still in scope;
            # st.columns context managers do not create a new Python scope)
            total_categories = self.df[selected_categorical].nunique()
            most_common = value_counts.index[0]
            most_common_pct = (value_counts.iloc[0] / len(self.df)) * 100

            st.info(f"📈 **Insights**: '{most_common}' is the dominant category ({most_common_pct:.1f}% of data)")
            self.add_insight(f"'{selected_categorical}' has {total_categories} categories, dominated by '{most_common}' ({most_common_pct:.1f}%)", 2)

        # Enhanced correlation analysis
        if len(numeric_cols) > 1:
            st.markdown("### 🔗 Correlation Analysis")

            corr_matrix = calculate_correlation_matrix(self.df)
            if not corr_matrix.empty:
                # Interactive correlation heatmap
                fig_corr = px.imshow(
                    corr_matrix,
                    text_auto=True,
                    aspect="auto",
                    title="Correlation Matrix",
                    color_continuous_scale='RdBu_r',
                    zmin=-1, zmax=1
                )
                fig_corr.update_layout(height=500)
                st.plotly_chart(fig_corr, use_container_width=True)

                # Find and highlight strongest correlations (upper triangle only)
                corr_pairs = []
                for i in range(len(corr_matrix.columns)):
                    for j in range(i+1, len(corr_matrix.columns)):
                        corr_val = corr_matrix.iloc[i, j]
                        if abs(corr_val) > 0.3:  # Only significant correlations
                            corr_pairs.append({
                                'Variable 1': corr_matrix.columns[i],
                                'Variable 2': corr_matrix.columns[j],
                                'Correlation': corr_val,
                                'Strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate'
                            })

                if corr_pairs:
                    st.markdown("#### 🎯 Key Correlations")
                    corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
                    st.dataframe(corr_df, use_container_width=True)

                    strongest = corr_df.iloc[0]
                    self.add_insight(f"Strongest correlation: {strongest['Variable 1']} ↔ {strongest['Variable 2']} (r={strongest['Correlation']:.3f})", 2)
 
610
    def stage_3_cleaning(self):
        """Enhanced Stage 3: visual data cleaning with AI-style suggestions.

        Offers interactive fixes for missing values (drop/median/mode fill),
        duplicate removal, and IQR-outlier treatment (remove or cap). All
        fixes mutate ``self.df`` in place and trigger ``st.rerun()``.

        NOTE(review): ``cleaning_operations`` is a plain local list; because
        every applied fix calls ``st.rerun()``, the list resets on the next
        script run, so the "Cleaning Operations Applied" history section will
        rarely render — confirm whether this should live in session state.
        """

        st.markdown("### 🧹 Intelligent Data Cleaning")

        cleaning_operations = []

        # Missing values section with enhanced visualization
        # (calculate_missing_data is a project helper; usage below shows it
        # returns a DataFrame with at least 'Column' and 'Missing %' columns)
        missing_data = calculate_missing_data(self.df)
        if not missing_data.empty:
            st.markdown("#### 🕳️ Missing Values Treatment")

            col1, col2 = st.columns([2, 1])

            with col1:
                # Missing values heatmap for top 10 problematic columns
                top_missing_cols = missing_data.head(10)['Column'].tolist()
                if len(top_missing_cols) > 0:
                    # Create missing pattern visualization on a capped sample
                    sample_size = min(100, len(self.df))
                    sample_df = self.df[top_missing_cols].head(sample_size)
                    missing_matrix = sample_df.isnull().astype(int)

                    fig_missing = px.imshow(
                        missing_matrix.T,
                        title=f"Missing Values Pattern (Top {len(top_missing_cols)} columns, First {sample_size} rows)",
                        color_continuous_scale='Reds',
                        labels={'x': 'Row Index', 'y': 'Columns', 'color': 'Missing'},
                        aspect='auto'
                    )
                    st.plotly_chart(fig_missing, use_container_width=True)

            with col2:
                # AI-powered missing value suggestions (top 3 worst columns)
                st.markdown("**🤖 AI Repair Suggestions**")

                for _, row in missing_data.head(3).iterrows():
                    col_name = row['Column']
                    missing_pct = row['Missing %']

                    # Generate smart suggestion based on column type and missing percentage
                    if missing_pct > 50:
                        suggestion_type = "🚨 Critical"
                        suggestion = f"Drop column (>{missing_pct:.0f}% missing)"
                        action = "drop"
                    elif self.df[col_name].dtype in ['int64', 'float64']:
                        suggestion_type = "🔧 Repair"
                        suggestion = f"Fill with median ({missing_pct:.1f}% missing)"
                        action = "median"
                    else:
                        suggestion_type = "🔧 Repair"
                        suggestion = f"Fill with mode ({missing_pct:.1f}% missing)"
                        action = "mode"

                    with st.expander(f"{suggestion_type}: {col_name}"):
                        st.write(f"**Issue**: {missing_pct:.1f}% missing values")
                        st.write(f"**Suggestion**: {suggestion}")

                        if st.button(f"Apply to {col_name}", key=f"fix_missing_{col_name}"):
                            if action == "drop":
                                self.df = self.df.drop(columns=[col_name])
                                cleaning_operations.append(f"Dropped column '{col_name}' (too many missing values)")
                            elif action == "median":
                                self.df[col_name] = self.df[col_name].fillna(self.df[col_name].median())
                                cleaning_operations.append(f"Filled missing values in '{col_name}' with median")
                            elif action == "mode":
                                # mode() can be empty when the column is all-NaN
                                mode_val = self.df[col_name].mode()
                                if not mode_val.empty:
                                    self.df[col_name] = self.df[col_name].fillna(mode_val[0])
                                cleaning_operations.append(f"Filled missing values in '{col_name}' with mode")

                            st.success("✅ Applied successfully!")
                            st.rerun()

        # Duplicates handling with enhanced detection
        # NOTE(review): self.stats was computed at __init__ time and may be
        # stale after earlier in-place edits to self.df — confirm refresh policy
        if self.stats['duplicates'] > 0:
            st.markdown("#### 🔄 Duplicate Records")

            duplicate_pct = (self.stats['duplicates'] / len(self.df)) * 100

            col1, col2 = st.columns([2, 1])

            with col1:
                st.warning(f"🚨 Found **{self.stats['duplicates']}** duplicate rows ({duplicate_pct:.1f}% of dataset)")

                # Show sample duplicates (keep=False marks every member of a group)
                duplicates = self.df[self.df.duplicated(keep=False)].head(10)
                st.dataframe(duplicates, use_container_width=True)

            with col2:
                st.markdown("**🤖 AI Assessment**")
                if duplicate_pct > 10:
                    st.error("**Critical**: High duplication rate suggests systematic data collection issues")
                elif duplicate_pct > 2:
                    st.warning("**Moderate**: Notable duplication - verify data sources")
                else:
                    st.info("**Minor**: Low duplication rate - likely isolated incidents")

                if st.button("🗑️ Remove All Duplicates"):
                    original_len = len(self.df)
                    self.df = self.df.drop_duplicates()
                    removed = original_len - len(self.df)
                    cleaning_operations.append(f"Removed {removed} duplicate rows")
                    st.success(f"✅ Removed {removed} duplicates!")
                    st.rerun()

        # Enhanced outlier detection
        numeric_cols = self.column_types['numeric']
        if numeric_cols:
            st.markdown("#### 📊 Outlier Detection & Treatment")

            selected_col = st.selectbox("Select column for outlier analysis:", numeric_cols)

            # Calculate outliers via Tukey fences (1.5 * IQR)
            Q1 = self.df[selected_col].quantile(0.25)
            Q3 = self.df[selected_col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            outliers = self.df[(self.df[selected_col] < lower_bound) | (self.df[selected_col] > upper_bound)]
            outlier_pct = (len(outliers) / len(self.df)) * 100

            col1, col2 = st.columns([2, 1])

            with col1:
                # Enhanced box plot with outlier highlighting
                fig_outliers = go.Figure()

                # Box plot
                fig_outliers.add_trace(go.Box(
                    y=self.df[selected_col],
                    name=selected_col,
                    boxpoints='outliers',
                    marker_color='lightblue'
                ))

                # Highlight outliers as a separate red scatter layer
                if len(outliers) > 0:
                    fig_outliers.add_trace(go.Scatter(
                        y=outliers[selected_col],
                        mode='markers',
                        marker=dict(color='red', size=8),
                        name=f'Outliers ({len(outliers)})'
                    ))

                fig_outliers.update_layout(
                    title=f"Outlier Analysis: {selected_col}",
                    height=400
                )
                st.plotly_chart(fig_outliers, use_container_width=True)

            with col2:
                st.markdown("**🤖 AI Outlier Assessment**")

                if outlier_pct > 10:
                    st.error(f"**High Risk**: {outlier_pct:.1f}% outliers detected")
                    st.write("**Likely Cause**: Systematic data issues or measurement errors")
                    recommendation = "Investigate business context before any treatment"
                elif outlier_pct > 2:
                    st.warning(f"**Moderate**: {outlier_pct:.1f}% outliers detected")
                    recommendation = "Consider capping values at statistical bounds"
                else:
                    st.info(f"**Normal**: {outlier_pct:.1f}% outliers detected")
                    recommendation = "Safe to remove if confirmed as errors"

                st.write(f"**AI Recommendation**: {recommendation}")

                # Outlier treatment options: drop the rows, or clip to the fences
                col_a, col_b = st.columns(2)

                with col_a:
                    if st.button("🗑️ Remove", key=f"remove_outliers_{selected_col}"):
                        self.df = self.df[~self.df.index.isin(outliers.index)]
                        cleaning_operations.append(f"Removed {len(outliers)} outliers from '{selected_col}'")
                        st.success("✅ Outliers removed!")
                        st.rerun()

                with col_b:
                    if st.button("📌 Cap", key=f"cap_outliers_{selected_col}"):
                        self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
                        cleaning_operations.append(f"Capped outliers in '{selected_col}' at statistical bounds")
                        st.success("✅ Outliers capped!")
                        st.rerun()

        # Show cleaning history (see NOTE in docstring about rerun resets)
        if cleaning_operations:
            st.markdown("#### 📋 Cleaning Operations Applied")
            for i, operation in enumerate(cleaning_operations, 1):
                st.success(f"{i}. {operation}")

            self.add_insight(f"Applied {len(cleaning_operations)} data cleaning operations", 3)
802
 
803
    def stage_4_analysis(self):
        """Enhanced Stage 4: advanced analysis with AI-style insights.

        Two optional sections: a numeric-vs-numeric relationship explorer
        (sampled scatter + OLS trendline + correlation metrics) and a
        categorical group analysis (group stats, box/bar comparison, and a
        one-way ANOVA significance test via scipy). Records insights on
        strong/moderate correlations, significant group differences, and
        the best-vs-worst performance gap.
        """

        numeric_cols = self.column_types['numeric']
        categorical_cols = self.column_types['categorical']

        # Relationship analysis with enhanced visualizations
        if len(numeric_cols) >= 2:
            st.markdown("### 🔗 Variable Relationships")

            col1, col2, col3 = st.columns(3)
            with col1:
                x_var = st.selectbox("X Variable:", numeric_cols)
            with col2:
                # Exclude the X choice so the user can't plot a column against itself
                y_var = st.selectbox("Y Variable:", [col for col in numeric_cols if col != x_var])
            with col3:
                color_var = st.selectbox("Color by (optional):", ["None"] + categorical_cols)

            # Smart sampling for large datasets (fixed seed for reproducibility)
            sample_size = min(5000, len(self.df))
            if len(self.df) > sample_size:
                sample_df = self.df.sample(n=sample_size, random_state=42)
                st.info(f"📊 Showing sample of {sample_size:,} points for performance")
            else:
                sample_df = self.df

            # Enhanced scatter plot with OLS trendline
            if color_var != "None":
                fig_scatter = px.scatter(
                    sample_df, x=x_var, y=y_var, color=color_var,
                    title=f"Relationship: {x_var} vs {y_var} (colored by {color_var})",
                    trendline="ols"
                )
            else:
                fig_scatter = px.scatter(
                    sample_df, x=x_var, y=y_var,
                    title=f"Relationship: {x_var} vs {y_var}",
                    trendline="ols"
                )

            fig_scatter.update_layout(height=500)
            st.plotly_chart(fig_scatter, use_container_width=True)

            # Correlation analysis with business insights
            # (computed on the full frame, not the plotting sample)
            correlation = self.df[x_var].corr(self.df[y_var])

            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Correlation", f"{correlation:.3f}")
            with col2:
                if abs(correlation) > 0.7:
                    strength = "Strong"
                    color = "🟢"
                elif abs(correlation) > 0.3:
                    strength = "Moderate"
                    color = "🟡"
                else:
                    strength = "Weak"
                    color = "🔴"
                st.metric("Strength", f"{color} {strength}")
            with col3:
                direction = "Positive" if correlation > 0 else "Negative"
                st.metric("Direction", direction)

            # Business interpretation
            if abs(correlation) > 0.7:
                st.success(f"🎯 **Business Insight**: Strong relationship detected! {x_var} and {y_var} move together - valuable for prediction and business planning.")
                self.add_insight(f"Strong correlation ({correlation:.3f}) between {x_var} and {y_var} - high predictive value", 4)
            elif abs(correlation) > 0.3:
                st.info(f"📊 **Moderate relationship** between {x_var} and {y_var} - worth investigating further.")
                self.add_insight(f"Moderate correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)

        # Group analysis with enhanced insights
        if categorical_cols and numeric_cols:
            st.markdown("### 👥 Group Analysis")

            col1, col2 = st.columns(2)
            with col1:
                group_var = st.selectbox("Group by:", categorical_cols)
            with col2:
                metric_var = st.selectbox("Analyze metric:", numeric_cols)

            # Calculate group statistics (project helper; usage below shows it
            # returns a frame indexed by group with at least a 'mean' column)
            group_stats = calculate_group_stats(self.df, group_var, metric_var)

            col_a, col_b = st.columns([1, 2])

            with col_a:
                st.dataframe(group_stats, use_container_width=True)

                # Find best and worst performing groups by mean of the metric
                best_group = group_stats['mean'].idxmax()
                worst_group = group_stats['mean'].idxmin()

                st.success(f"🏆 **Best**: {best_group} (avg: {group_stats.loc[best_group, 'mean']:.2f})")
                st.error(f"📉 **Needs Attention**: {worst_group} (avg: {group_stats.loc[worst_group, 'mean']:.2f})")

            with col_b:
                # Group comparison visualization
                unique_groups = self.df[group_var].nunique()

                if unique_groups <= 15:  # Manageable number of groups
                    fig_groups = px.box(
                        self.df, x=group_var, y=metric_var,
                        title=f"{metric_var} Distribution by {group_var}",
                        color=group_var
                    )
                    fig_groups.update_layout(height=400)
                    st.plotly_chart(fig_groups, use_container_width=True)
                else:
                    # Too many groups - show summary statistics for the top 10
                    st.info(f"📊 {unique_groups} groups detected - showing statistical summary")
                    summary_stats = self.df.groupby(group_var)[metric_var].agg(['count', 'mean', 'std']).reset_index()
                    summary_stats = summary_stats.sort_values('mean', ascending=False).head(10)

                    fig_summary = px.bar(
                        summary_stats, x=group_var, y='mean',
                        title=f"Top 10 {group_var} by Average {metric_var}",
                        error_y='std'
                    )
                    st.plotly_chart(fig_summary, use_container_width=True)

            # Statistical significance testing (one-way ANOVA across groups)
            if unique_groups <= 10 and len(group_stats) > 1:
                from scipy import stats as scipy_stats

                try:
                    # ANOVA test for multiple groups; NaN group labels excluded
                    groups = [self.df[self.df[group_var] == group][metric_var].dropna()
                              for group in self.df[group_var].unique() if not pd.isna(group)]

                    # f_oneway needs >= 2 groups, each with > 1 observation
                    if len(groups) >= 2 and all(len(g) > 1 for g in groups):
                        f_stat, p_value = scipy_stats.f_oneway(*groups)

                        st.markdown("#### 📊 Statistical Significance")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.metric("F-statistic", f"{f_stat:.3f}")
                        with col2:
                            st.metric("P-value", f"{p_value:.4f}")

                        if p_value < 0.05:
                            st.success("✅ **Statistically significant** differences between groups!")
                            self.add_insight(f"Significant group differences in {metric_var} by {group_var} (p={p_value:.4f})", 4)
                        else:
                            st.info("📊 No statistically significant differences between groups")

                except Exception as e:
                    st.warning(f"Statistical test failed: {str(e)}")

            performance_gap = group_stats['mean'].max() - group_stats['mean'].min()
            self.add_insight(f"Performance gap in {metric_var}: {performance_gap:.2f} between best and worst {group_var}", 4)
 
956
def stage_5_summary(self):
    """Stage 5: final summary dashboard with insight timeline and exports.

    Shows the final quality metrics, groups the collected insights by
    stage, and offers downloads for the cleaned data (CSV/Excel), a
    markdown report, an executive summary and a reproducible script.
    """
    st.markdown("### 📈 Analysis Summary & Results")

    # Prefer the enhanced scorer when the instance provides one.
    if hasattr(self, 'calculate_enhanced_quality_score'):
        final_quality = self.calculate_enhanced_quality_score()
    else:
        final_quality = calculate_data_quality_score(self.df)

    # Summary dashboard
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Final Quality Score", f"{final_quality['score']:.0f}/100")
    with col2:
        st.metric("Total Insights Generated", len(self.insights))
    with col3:
        st.metric("Data Integrity", final_quality['grade'])
    with col4:
        improvement = "✅ Improved" if len(self.insights) > 5 else "📊 Analyzed"
        st.metric("Status", improvement)

    # Insights timeline, grouped by analysis stage
    st.markdown("### 💡 Analysis Journey")

    stage_insights = {}
    for insight in self.insights:
        stage_insights.setdefault(insight['stage'], []).append(insight['insight'])

    for stage in sorted(stage_insights):
        with st.expander(f"📋 Stage {stage}: {len(stage_insights[stage])} insights", expanded=True):
            for i, text in enumerate(stage_insights[stage], 1):
                st.write(f"{i}. {text}")

    # Export options
    st.markdown("### 📥 Export Your Results")
    tab1, tab2, tab3 = st.tabs(["📊 Cleaned Data", "📋 Analysis Report", "🐍 Python Code"])

    with tab1:
        st.markdown("#### 🔍 Data Preview")
        col1, col2 = st.columns([3, 1])

        with col1:
            # Show a before/after comparison when cleaning changed the data.
            if not self.df.equals(self.original_df):
                st.success("✅ **Data has been cleaned and optimized!**")
                comparison_metrics = {
                    'Original Rows': len(self.original_df),
                    'Current Rows': len(self.df),
                    'Rows Changed': len(self.df) - len(self.original_df),
                    'Original Columns': len(self.original_df.columns),
                    'Current Columns': len(self.df.columns)
                }
                st.dataframe(pd.DataFrame([comparison_metrics]), use_container_width=True)
            else:
                st.info("📊 **No cleaning operations applied** - original data maintained")

            # Data preview
            st.dataframe(self.df.head(10), use_container_width=True)

        with col2:
            st.markdown("**📥 Download Options**")

            # CSV download
            csv_data = self.df.to_csv(index=False)
            st.download_button(
                label="📄 Download CSV",
                data=csv_data,
                file_name="cleaned_data.csv",
                mime="text/csv",
                use_container_width=True
            )

            # Excel download — DataFrame.to_excel needs an Excel writer
            # engine (openpyxl); fail soft instead of crashing the page
            # when the engine is not installed.
            try:
                excel_buffer = BytesIO()
                self.df.to_excel(excel_buffer, index=False)
                st.download_button(
                    label="📊 Download Excel",
                    data=excel_buffer.getvalue(),
                    file_name="cleaned_data.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    use_container_width=True
                )
            except Exception as e:
                st.warning(f"Excel export unavailable: {e}")

    with tab2:
        # Comprehensive markdown report plus downloadable summaries
        report = self.generate_enhanced_report()
        col1, col2 = st.columns([3, 1])

        with col1:
            st.markdown(report)

        with col2:
            st.download_button(
                label="📋 Download Report",
                data=report,
                file_name="data_analysis_report.md",
                mime="text/markdown",
                use_container_width=True
            )

            exec_summary = self.generate_executive_summary()
            st.download_button(
                label="📈 Executive Summary",
                data=exec_summary,
                file_name="executive_summary.txt",
                mime="text/plain",
                use_container_width=True
            )

    with tab3:
        # Reproducible analysis script
        code = self.generate_enhanced_python_code()
        st.code(code, language="python")
        st.download_button(
            label="🐍 Download Python Script",
            data=code,
            file_name="data_analysis_script.py",
            mime="text/plain",
            use_container_width=True
        )
1090
 
1091
def generate_enhanced_report(self) -> str:
    """Generate a comprehensive markdown analysis report.

    Returns:
        Markdown covering an executive summary, per-stage insights,
        a data-profile table and standing recommendations.
    """
    # quality_metrics may never have been computed on this instance;
    # the original dereferenced it unguarded in the header f-string
    # (AttributeError) while guarding with hasattr() further down.
    quality_metrics = getattr(self, 'quality_metrics', None)
    quality_score = quality_metrics['score'] if quality_metrics else 'Not calculated'

    report = f"""# 🔍 AI Data Quality Analysis Report

## 📊 Executive Summary

**Dataset**: {self.df.shape[0]:,} rows × {self.df.shape[1]} columns
**Analysis Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
**Quality Score**: {quality_score}/100

## 🎯 Key Findings

### Data Quality Assessment
"""

    if quality_metrics:
        for issue in quality_metrics['issues']:
            report += f"- ⚠️ {issue}\n"
        if not quality_metrics['issues']:
            report += "- No major quality issues detected\n"

    report += """

### 📈 Analysis Insights

"""

    # Group insights under human-readable stage headings.
    stage_names = {
        1: "Data Overview",
        2: "Exploratory Analysis",
        3: "Quality Assessment",
        4: "Advanced Analysis",
        5: "Summary"
    }
    for stage in range(1, 6):
        stage_insights = [i for i in self.insights if i['stage'] == stage]
        if stage_insights:
            report += f"#### {stage_names.get(stage, f'Stage {stage}')}\n"
            for insight in stage_insights:
                report += f"- {insight['insight']}\n"
            report += "\n"

    # Data profile table — assumes self.stats was populated by the
    # earlier stages (memory_usage, missing_values, duplicates, dtypes).
    report += f"""
## 📋 Data Profile

| Metric | Value |
|--------|-------|
| Total Records | {len(self.df):,} |
| Total Columns | {len(self.df.columns)} |
| Memory Usage | {self.stats['memory_usage']:.1f} MB |
| Missing Values | {self.stats['missing_values']:,} |
| Duplicate Records | {self.stats['duplicates']:,} |

### Column Types Distribution
"""

    for dtype, count in self.stats['dtypes'].items():
        report += f"- **{dtype}**: {count} columns\n"

    report += """

## 🚀 Recommendations

### Immediate Actions
1. **Data Quality**: Address missing values in critical business columns
2. **Data Integrity**: Remove duplicate records before analysis
3. **Outlier Treatment**: Investigate statistical anomalies for business context

### Long-term Improvements
1. **Process Enhancement**: Implement data validation at collection points
2. **Monitoring**: Establish ongoing data quality metrics
3. **Documentation**: Create data dictionary and lineage documentation

---
*Report generated by AI Data Quality Inspector*
"""

    return report
1175
def generate_executive_summary(self) -> str:
    """Generate a plain-text executive summary for business stakeholders.

    Returns:
        Text with quality scores, up to five high-signal insights,
        recommendations and a coarse business-impact assessment.
    """
    # quality_metrics may not exist yet on this instance — fail soft
    # instead of raising AttributeError inside the f-string.
    quality_metrics = getattr(self, 'quality_metrics', None)
    quality_score = quality_metrics['score'] if quality_metrics else 'Calculating'

    # Guard the percentage denominators against an empty frame
    # (ZeroDivisionError in the original).
    total_cells = max(len(self.df) * len(self.df.columns), 1)
    total_rows = max(len(self.df), 1)

    summary = f"""AI DATA QUALITY INSPECTOR - EXECUTIVE SUMMARY
================================================

DATASET: {self.df.shape[0]:,} records across {self.df.shape[1]} dimensions
ANALYSIS DATE: {pd.Timestamp.now().strftime('%Y-%m-%d')}

QUALITY ASSESSMENT:
- Overall Score: {quality_score}/100
- Data Completeness: {100 - (self.stats['missing_values'] / total_cells * 100):.1f}%
- Data Integrity: {100 - (self.stats['duplicates'] / total_rows * 100):.1f}%

KEY INSIGHTS:
"""

    # Surface up to five insights flagged with high-signal keywords.
    keywords = ('critical', 'strong', 'significant', 'high', 'best')
    important_insights = [i for i in self.insights
                          if any(k in i['insight'].lower() for k in keywords)][:5]

    for idx, insight in enumerate(important_insights, 1):
        summary += f"{idx}. {insight['insight']}\n"

    # NOTE(review): confidence reads 'High' when FEWER than 3 flagged
    # insights were found — this looks inverted; confirm the intended
    # direction before changing it (behavior preserved here).
    summary += f"""

RECOMMENDATIONS:
1. Address data quality issues before business analysis
2. Leverage strong correlations for predictive insights
3. Investigate outliers for business opportunities
4. Implement ongoing data quality monitoring

BUSINESS IMPACT:
- Analysis Confidence: {'High' if len(important_insights) < 3 else 'Medium'}
- Decision-Making Risk: {'Low' if self.stats['missing_values'] < len(self.df) * 0.05 else 'Medium'}
- Analytical Value: {'High' if len(self.column_types['numeric']) > 2 else 'Medium'}

Generated by AI Data Quality Inspector
"""

    return summary
+
1217
def generate_enhanced_python_code(self) -> str:
    """Generate a standalone, reproducible Python analysis script.

    The returned script mirrors the in-app pipeline: quality scoring,
    cleaning, correlation/outlier analysis and optional visualizations.
    Only the dataset shape and the current timestamp are interpolated;
    every other brace is doubled so it survives the outer f-string.

    Returns:
        The script source as a single string (syntactically valid Python).
    """
    # Single-quoted triple delimiter so the template's own docstrings
    # can use plain double triple-quotes without escaping.
    code = f'''# AI Data Quality Inspector - Generated Analysis Code
# Dataset: {self.df.shape[0]:,} rows × {self.df.shape[1]} columns
# Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Load your data
df = pd.read_csv('your_data.csv')  # Replace with your data source

print(f"Dataset loaded: {{df.shape[0]:,}} rows × {{df.shape[1]}} columns")

# ===== DATA QUALITY ASSESSMENT =====

def calculate_quality_score(df):
    """Calculate comprehensive data quality score"""
    score = 100
    issues = []

    # Missing values penalty
    missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
    if missing_pct > 0:
        penalty = min(30, missing_pct * 1.5)
        score -= penalty
        issues.append(f"Missing values: {{missing_pct:.1f}}%")

    # Duplicates penalty
    duplicate_pct = (df.duplicated().sum() / len(df)) * 100
    if duplicate_pct > 0:
        penalty = min(25, duplicate_pct * 3)
        score -= penalty
        issues.append(f"Duplicates: {{duplicate_pct:.1f}}%")

    return {{'score': max(0, score), 'issues': issues}}

quality_results = calculate_quality_score(df)
print(f"\\nQuality Score: {{quality_results['score']:.0f}}/100")
if quality_results['issues']:
    print("Issues found:")
    for issue in quality_results['issues']:
        print(f"  - {{issue}}")

# ===== DATA CLEANING =====

def clean_dataset(df):
    """Apply comprehensive data cleaning"""
    cleaned_df = df.copy()
    cleaning_log = []

    # Remove duplicates
    original_len = len(cleaned_df)
    cleaned_df = cleaned_df.drop_duplicates()
    if len(cleaned_df) < original_len:
        removed = original_len - len(cleaned_df)
        cleaning_log.append(f"Removed {{removed}} duplicate rows")

    # Handle missing values intelligently
    for col in cleaned_df.columns:
        missing_count = cleaned_df[col].isnull().sum()
        if missing_count > 0:
            missing_pct = (missing_count / len(cleaned_df)) * 100

            if missing_pct > 50:
                # Drop columns with too many missing values
                cleaned_df = cleaned_df.drop(columns=[col])
                cleaning_log.append(f"Dropped column '{{col}}' ({{missing_pct:.1f}}% missing)")
            elif cleaned_df[col].dtype in ['int64', 'float64']:
                # Fill numeric with median
                cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())
                cleaning_log.append(f"Filled missing values in '{{col}}' with median")
            else:
                # Fill categorical with mode
                mode_val = cleaned_df[col].mode()
                if not mode_val.empty:
                    cleaned_df[col] = cleaned_df[col].fillna(mode_val[0])
                    cleaning_log.append(f"Filled missing values in '{{col}}' with mode")

    return cleaned_df, cleaning_log

# Apply cleaning
cleaned_df, cleaning_operations = clean_dataset(df)

print("\\nCleaning Operations Applied:")
for operation in cleaning_operations:
    print(f"  ✅ {{operation}}")

# ===== ANALYSIS FUNCTIONS =====

def analyze_correlations(df):
    """Analyze correlations between numeric variables"""
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()

        # Find strongest correlations
        correlations = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.3:
                    correlations.append({{
                        'var1': corr_matrix.columns[i],
                        'var2': corr_matrix.columns[j],
                        'correlation': corr_val,
                        'strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate'
                    }})

        return correlations
    return []

def detect_outliers(df, column):
    """Detect outliers using IQR method"""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# ===== EXECUTE ANALYSIS =====

print("\\n" + "=" * 50)
print("ANALYSIS RESULTS")
print("=" * 50)

# Correlation analysis
correlations = analyze_correlations(cleaned_df)
if correlations:
    print("\\nKey Correlations:")
    for corr in correlations[:5]:
        print(f"  {{corr['strength']}}: {{corr['var1']}} ↔ {{corr['var2']}} (r={{corr['correlation']:.3f}})")

# Outlier analysis for numeric columns
numeric_cols = cleaned_df.select_dtypes(include=[np.number]).columns
print("\\nOutlier Analysis:")
for col in numeric_cols:
    outliers, lower, upper = detect_outliers(cleaned_df, col)
    if len(outliers) > 0:
        outlier_pct = (len(outliers) / len(cleaned_df)) * 100
        print(f"  ⚠️ {{col}}: {{len(outliers)}} outliers ({{outlier_pct:.1f}}%)")
    else:
        print(f"  ✅ {{col}}: No outliers detected")

# Final quality assessment
final_quality = calculate_quality_score(cleaned_df)
print(f"\\nFinal Quality Score: {{final_quality['score']:.0f}}/100")

print("\\n🎉 Analysis Complete! Use the cleaned dataset for your business analysis.")

# ===== VISUALIZATION EXAMPLES =====

def create_quality_dashboard(df):
    """Create quality visualization dashboard"""

    # Missing values heatmap
    if df.isnull().sum().sum() > 0:
        missing_matrix = df.isnull().head(100)  # First 100 rows
        fig_missing = px.imshow(
            missing_matrix.T,
            title="Missing Values Pattern",
            color_continuous_scale='Reds'
        )
        fig_missing.show()

    # Correlation heatmap
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        fig_corr = px.imshow(
            corr_matrix,
            text_auto=True,
            title="Correlation Matrix",
            color_continuous_scale='RdBu_r'
        )
        fig_corr.show()

# Uncomment to generate visualizations
# create_quality_dashboard(cleaned_df)

print("\\n📊 Visualization functions available:")
print("  - create_quality_dashboard(df): Generate quality visualizations")
print("  - Use plotly.express for interactive charts")
print("  - All analysis functions are ready to use")
'''

    return code
1415
def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
    """Return one page of rows for display.

    Args:
        page: Zero-based page index. Negative values are clamped to 0
            (the original produced a nonsensical negative slice).

    Returns:
        A slice of ``self.df`` with at most ``self.page_size`` rows;
        empty past the last page.
    """
    page = max(0, page)  # guard against negative page indices
    start_idx = page * self.page_size
    return self.df.iloc[start_idx:start_idx + self.page_size]