Spaces:
Sleeping
Sleeping
Update analyzer.py
Browse files- analyzer.py +977 -512
analyzer.py
CHANGED
|
@@ -1,37 +1,28 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Data Analysis Platform
|
| 3 |
-
Copyright (c) 2025 JEAN YOUNG
|
| 4 |
-
All rights reserved.
|
| 5 |
-
|
| 6 |
-
This software is proprietary and confidential.
|
| 7 |
-
Unauthorized copying, distribution, or use is prohibited.
|
| 8 |
-
"""
|
| 9 |
import streamlit as st
|
| 10 |
import pandas as pd
|
| 11 |
import numpy as np
|
| 12 |
import plotly.express as px
|
| 13 |
import plotly.graph_objects as go
|
|
|
|
|
|
|
| 14 |
from typing import Dict, List, Any, Optional
|
| 15 |
import os
|
| 16 |
from dotenv import load_dotenv
|
| 17 |
from data_handler import *
|
| 18 |
-
from io import BytesIO
|
| 19 |
|
| 20 |
-
#
|
| 21 |
-
load_dotenv()
|
| 22 |
-
|
| 23 |
-
# Optional AI Integration
|
| 24 |
try:
|
| 25 |
-
import
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
except ImportError:
|
| 28 |
-
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
GEMINI_AVAILABLE = True
|
| 33 |
-
except ImportError:
|
| 34 |
-
GEMINI_AVAILABLE = False
|
| 35 |
|
| 36 |
class AIAssistant:
|
| 37 |
"""AI-powered analysis assistant"""
|
|
@@ -40,30 +31,22 @@ class AIAssistant:
|
|
| 40 |
self.openai_key = os.getenv('OPENAI_API_KEY')
|
| 41 |
self.gemini_key = os.getenv('GOOGLE_API_KEY')
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
self.
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
if self.openai_key and OPENAI_AVAILABLE:
|
| 51 |
-
models.append("OpenAI GPT")
|
| 52 |
-
if self.gemini_key and GEMINI_AVAILABLE:
|
| 53 |
-
models.append("Google Gemini")
|
| 54 |
-
return models
|
| 55 |
|
| 56 |
def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
|
| 57 |
"""Get AI analysis of insights"""
|
| 58 |
-
|
| 59 |
-
# Prepare data summary
|
| 60 |
summary = f"""
|
| 61 |
Dataset Summary:
|
| 62 |
- Shape: {df.shape}
|
| 63 |
- Columns: {list(df.columns)}
|
| 64 |
-
- Data types: {df.dtypes.value_counts().to_dict()}
|
| 65 |
|
| 66 |
-
Key Insights
|
| 67 |
"""
|
| 68 |
|
| 69 |
for insight in insights:
|
|
@@ -71,42 +54,33 @@ class AIAssistant:
|
|
| 71 |
|
| 72 |
prompt = f"""
|
| 73 |
As a senior data scientist, analyze this dataset and provide:
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
3. Recommendations for decision-making
|
| 78 |
4. Suggestions for further analysis
|
| 79 |
|
| 80 |
{summary}
|
| 81 |
-
|
| 82 |
-
Provide actionable insights in a professional format.
|
| 83 |
"""
|
| 84 |
|
| 85 |
try:
|
| 86 |
-
if
|
| 87 |
response = self.gemini_model.generate_content(prompt)
|
| 88 |
return response.text
|
| 89 |
-
elif model == "OpenAI GPT" and self.openai_key:
|
| 90 |
-
client = openai.OpenAI(api_key=self.openai_key)
|
| 91 |
-
response = client.chat.completions.create(
|
| 92 |
-
model="gpt-3.5-turbo",
|
| 93 |
-
messages=[{"role": "user", "content": prompt}]
|
| 94 |
-
)
|
| 95 |
-
return response.choices[0].message.content
|
| 96 |
else:
|
| 97 |
-
return "AI analysis not available. Please configure API keys."
|
| 98 |
except Exception as e:
|
| 99 |
return f"AI Analysis Error: {str(e)}"
|
| 100 |
|
| 101 |
class DataAnalysisWorkflow:
|
| 102 |
-
"""
|
| 103 |
|
| 104 |
def __init__(self, df: pd.DataFrame):
|
| 105 |
self.df = df
|
|
|
|
| 106 |
self.stats = calculate_basic_stats(df)
|
| 107 |
self.column_types = get_column_types(df)
|
| 108 |
self.insights = []
|
| 109 |
-
self.
|
| 110 |
|
| 111 |
def add_insight(self, insight: str, stage: int):
|
| 112 |
"""Add insight to analysis report"""
|
|
@@ -116,586 +90,1077 @@ class DataAnalysisWorkflow:
|
|
| 116 |
'timestamp': pd.Timestamp.now()
|
| 117 |
})
|
| 118 |
|
| 119 |
-
def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
|
| 120 |
-
"""Get paginated data for display"""
|
| 121 |
-
start_idx = page * self.page_size
|
| 122 |
-
end_idx = start_idx + self.page_size
|
| 123 |
-
return self.df.iloc[start_idx:end_idx]
|
| 124 |
-
|
| 125 |
def stage_1_overview(self):
|
| 126 |
-
"""Stage 1: Data Overview
|
| 127 |
st.subheader("📊 Data Overview")
|
| 128 |
|
| 129 |
-
#
|
| 130 |
-
quality_metrics = calculate_data_quality_score(self.df)
|
| 131 |
col1, col2, col3, col4 = st.columns(4)
|
| 132 |
with col1:
|
| 133 |
-
st.metric("Rows", f"{self.stats['shape'][0]:,}")
|
| 134 |
with col2:
|
| 135 |
-
st.metric("Columns", f"{self.stats['shape'][1]:,}")
|
| 136 |
with col3:
|
| 137 |
-
|
|
|
|
| 138 |
with col4:
|
| 139 |
-
st.metric("
|
| 140 |
|
| 141 |
-
|
| 142 |
-
st.warning("Quality Issues Found:")
|
| 143 |
-
for issue in quality_metrics['issues']:
|
| 144 |
-
st.write(f"• {issue}")
|
| 145 |
-
|
| 146 |
-
# Memory Usage and Optimization
|
| 147 |
-
st.subheader("Memory Analysis")
|
| 148 |
-
memory_opt = calculate_memory_optimization(self.df)
|
| 149 |
-
col1, col2 = st.columns(2)
|
| 150 |
-
with col1:
|
| 151 |
-
st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
|
| 152 |
-
with col2:
|
| 153 |
-
if memory_opt['potential_savings_mb'] > 0:
|
| 154 |
-
st.metric("Potential Savings",
|
| 155 |
-
f"{memory_opt['potential_savings_mb']:.1f} MB",
|
| 156 |
-
f"{memory_opt['potential_savings_pct']:.1f}%")
|
| 157 |
-
|
| 158 |
-
if st.button("Show Optimization Details"):
|
| 159 |
-
st.dataframe(pd.DataFrame(memory_opt['suggestions']))
|
| 160 |
-
|
| 161 |
-
# Column Cardinality Analysis
|
| 162 |
-
st.subheader("Column Cardinality Analysis")
|
| 163 |
-
cardinality_df = calculate_column_cardinality(self.df)
|
| 164 |
-
|
| 165 |
-
# Filter options
|
| 166 |
-
col_types = cardinality_df['Type'].unique()
|
| 167 |
-
selected_types = st.multiselect("Filter by Column Type",
|
| 168 |
-
col_types,
|
| 169 |
-
default=col_types)
|
| 170 |
-
|
| 171 |
-
filtered_df = cardinality_df[cardinality_df['Type'].isin(selected_types)]
|
| 172 |
-
st.dataframe(filtered_df, use_container_width=True)
|
| 173 |
-
|
| 174 |
-
# Highlight important findings
|
| 175 |
-
id_cols = filtered_df[filtered_df['Type'] == 'Unique Identifier']['Column'].tolist()
|
| 176 |
-
if id_cols:
|
| 177 |
-
st.info(f"📌 Potential ID columns found: {', '.join(id_cols)}")
|
| 178 |
-
|
| 179 |
-
const_cols = filtered_df[filtered_df['Type'] == 'Constant']['Column'].tolist()
|
| 180 |
-
if const_cols:
|
| 181 |
-
st.warning(f"⚠️ Constant columns found: {', '.join(const_cols)}")
|
| 182 |
-
|
| 183 |
-
# Data types visualization
|
| 184 |
if self.stats['dtypes']:
|
| 185 |
-
st.
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
-
#
|
| 192 |
-
st.subheader("
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
-
if
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
st.
|
|
|
|
|
|
|
|
|
|
| 199 |
else:
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
# Missing values analysis
|
| 205 |
missing_df = calculate_missing_data(self.df)
|
| 206 |
if not missing_df.empty:
|
| 207 |
st.subheader("Missing Values Analysis")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
st.dataframe(missing_df, use_container_width=True)
|
| 209 |
|
| 210 |
worst_column = missing_df.iloc[0]['Column']
|
| 211 |
worst_percentage = missing_df.iloc[0]['Missing %']
|
| 212 |
self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
|
| 213 |
else:
|
| 214 |
-
st.success("✅ No missing values found!")
|
| 215 |
-
self.add_insight("Dataset has
|
| 216 |
-
|
| 217 |
-
# Add insights about data quality and cardinality
|
| 218 |
-
if quality_metrics['score'] < 80:
|
| 219 |
-
self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)
|
| 220 |
-
|
| 221 |
-
if memory_opt['potential_savings_pct'] > 20:
|
| 222 |
-
self.add_insight(f"Potential memory optimization of {memory_opt['potential_savings_pct']:.1f}% identified", 1)
|
| 223 |
-
|
| 224 |
-
if id_cols:
|
| 225 |
-
self.add_insight(f"Found {len(id_cols)} potential ID columns", 1)
|
| 226 |
|
| 227 |
def stage_2_exploration(self):
|
| 228 |
-
"""Stage 2: Exploratory Data Analysis
|
| 229 |
st.subheader("🔍 Exploratory Data Analysis")
|
| 230 |
|
| 231 |
numeric_cols = self.column_types['numeric']
|
| 232 |
categorical_cols = self.column_types['categorical']
|
| 233 |
|
| 234 |
-
# Numeric analysis
|
| 235 |
if numeric_cols:
|
| 236 |
-
st.subheader("Numeric Variables")
|
| 237 |
-
selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
with col2:
|
| 246 |
-
fig = px.box(self.df, y=selected_numeric,
|
| 247 |
-
title=f"Box Plot of {selected_numeric}")
|
| 248 |
-
st.plotly_chart(fig, use_container_width=True)
|
| 249 |
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
-
#
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
for i in range(len(corr_matrix.columns)):
|
| 267 |
for j in range(i+1, len(corr_matrix.columns)):
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
-
if
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
-
#
|
| 275 |
if categorical_cols:
|
| 276 |
-
st.subheader("Categorical Variables")
|
| 277 |
selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
|
| 278 |
|
| 279 |
-
value_counts = get_value_counts(self.df, selected_categorical)
|
| 280 |
-
fig = px.bar(x=value_counts.index, y=value_counts.values,
|
| 281 |
-
title=f"Top 10 {selected_categorical} Values")
|
| 282 |
-
st.plotly_chart(fig, use_container_width=True)
|
| 283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
total_categories = self.df[selected_categorical].nunique()
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
-
def
|
| 288 |
-
"""Stage 3: Data Quality Assessment"""
|
| 289 |
st.subheader("🧹 Data Quality Assessment")
|
| 290 |
|
| 291 |
-
|
| 292 |
-
|
| 293 |
|
| 294 |
-
# Missing values
|
| 295 |
if self.stats['missing_values'] > 0:
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
selected_col = st.selectbox("Select column to handle missing values:",
|
| 303 |
-
missing_df['Column'].tolist())
|
| 304 |
-
with col2:
|
| 305 |
-
fill_method = st.selectbox("Choose fill method:",
|
| 306 |
-
["Drop rows", "Mean", "Median", "Mode", "Custom value"])
|
| 307 |
-
|
| 308 |
-
if st.button("Apply Missing Value Treatment"):
|
| 309 |
-
try:
|
| 310 |
-
if fill_method == "Drop rows":
|
| 311 |
-
self.df = self.df.dropna(subset=[selected_col])
|
| 312 |
-
cleaning_history.append(f"Dropped rows with missing values in {selected_col}")
|
| 313 |
-
else:
|
| 314 |
-
if fill_method == "Mean":
|
| 315 |
-
fill_value = self.df[selected_col].mean()
|
| 316 |
-
elif fill_method == "Median":
|
| 317 |
-
fill_value = self.df[selected_col].median()
|
| 318 |
-
elif fill_method == "Mode":
|
| 319 |
-
fill_value = self.df[selected_col].mode()[0]
|
| 320 |
-
else: # Custom value
|
| 321 |
-
fill_value = st.number_input("Enter custom value:", value=0.0)
|
| 322 |
-
|
| 323 |
-
self.df[selected_col] = self.df[selected_col].fillna(fill_value)
|
| 324 |
-
cleaning_history.append(f"Filled missing values in {selected_col} with {fill_method}")
|
| 325 |
-
|
| 326 |
-
st.success("✅ Missing values handled successfully!")
|
| 327 |
-
except Exception as e:
|
| 328 |
-
st.error(f"Error handling missing values: {str(e)}")
|
| 329 |
|
| 330 |
-
# Duplicates
|
| 331 |
if self.stats['duplicates'] > 0:
|
| 332 |
-
|
| 333 |
-
st.warning(f"Found {self.stats['duplicates']} duplicate rows")
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
original_len = len(self.df)
|
| 337 |
-
self.df = self.df.drop_duplicates()
|
| 338 |
-
removed = original_len - len(self.df)
|
| 339 |
-
cleaning_history.append(f"Removed {removed} duplicate rows")
|
| 340 |
-
st.success(f"✅ Removed {removed} duplicate rows")
|
| 341 |
else:
|
| 342 |
-
st.success("✅ No duplicate rows
|
| 343 |
-
|
| 344 |
-
# Mixed type detection and handling
|
| 345 |
-
mixed_types = detect_mixed_types(self.df)
|
| 346 |
-
if mixed_types:
|
| 347 |
-
st.subheader("Mixed Data Types")
|
| 348 |
-
mixed_df = pd.DataFrame(mixed_types)
|
| 349 |
-
st.dataframe(mixed_df, use_container_width=True)
|
| 350 |
-
|
| 351 |
-
selected_col = st.selectbox("Select column to fix data type:",
|
| 352 |
-
[item['column'] for item in mixed_types])
|
| 353 |
-
|
| 354 |
-
fix_method = st.selectbox("Choose fix method:",
|
| 355 |
-
["Convert to numeric", "Convert to string"])
|
| 356 |
-
|
| 357 |
-
if st.button("Fix Data Type"):
|
| 358 |
-
try:
|
| 359 |
-
if fix_method == "Convert to numeric":
|
| 360 |
-
self.df[selected_col] = pd.to_numeric(self.df[selected_col], errors='coerce')
|
| 361 |
-
else:
|
| 362 |
-
self.df[selected_col] = self.df[selected_col].astype(str)
|
| 363 |
-
|
| 364 |
-
cleaning_history.append(f"Fixed data type for {selected_col} to {fix_method}")
|
| 365 |
-
st.success("✅ Data type fixed successfully!")
|
| 366 |
-
except Exception as e:
|
| 367 |
-
st.error(f"Error fixing data type: {str(e)}")
|
| 368 |
|
| 369 |
-
# Outlier detection
|
| 370 |
numeric_cols = self.column_types['numeric']
|
| 371 |
if numeric_cols:
|
| 372 |
st.subheader("Outlier Detection")
|
| 373 |
-
selected_col = st.selectbox("Select column for outlier detection:", numeric_cols)
|
| 374 |
|
| 375 |
-
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
else:
|
| 404 |
-
st.
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
if cleaning_actions:
|
| 415 |
-
st.subheader("Remaining Action Items")
|
| 416 |
-
for i, action in enumerate(cleaning_actions, 1):
|
| 417 |
-
st.write(f"{i}. {action}")
|
| 418 |
-
self.add_insight(f"Identified {len(cleaning_actions)} data quality issues", 3)
|
| 419 |
else:
|
| 420 |
-
st.success("
|
| 421 |
-
self.add_insight("
|
| 422 |
|
| 423 |
-
def
|
| 424 |
-
"""Stage 4: Advanced Analysis"""
|
| 425 |
st.subheader("🔬 Advanced Analysis")
|
| 426 |
|
| 427 |
numeric_cols = self.column_types['numeric']
|
| 428 |
categorical_cols = self.column_types['categorical']
|
| 429 |
|
| 430 |
-
#
|
| 431 |
if len(numeric_cols) >= 2:
|
| 432 |
-
st.subheader("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
|
|
|
|
|
|
|
| 434 |
col1, col2 = st.columns(2)
|
| 435 |
with col1:
|
| 436 |
-
x_var = st.selectbox("X Variable:", numeric_cols)
|
| 437 |
with col2:
|
| 438 |
-
y_var = st.selectbox("Y Variable:",
|
| 439 |
-
[col for col in numeric_cols if col != x_var])
|
| 440 |
|
| 441 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
sample_size = min(5000, len(self.df))
|
| 443 |
-
|
| 444 |
|
| 445 |
-
fig = px.scatter(
|
| 446 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
st.plotly_chart(fig, use_container_width=True)
|
| 448 |
|
|
|
|
| 449 |
correlation = self.df[x_var].corr(self.df[y_var])
|
| 450 |
-
st.metric("Correlation", f"{correlation:.3f}")
|
| 451 |
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
-
|
| 460 |
-
st.write(f"**Result:** {strength} {direction} correlation")
|
| 461 |
-
self.add_insight(f"{strength} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)
|
| 462 |
|
| 463 |
-
# Group analysis
|
| 464 |
if categorical_cols and numeric_cols:
|
| 465 |
-
st.subheader("Group Analysis")
|
| 466 |
|
| 467 |
col1, col2 = st.columns(2)
|
| 468 |
with col1:
|
| 469 |
-
group_var = st.selectbox("Group by:", categorical_cols)
|
| 470 |
with col2:
|
| 471 |
-
metric_var = st.selectbox("Analyze:", numeric_cols)
|
| 472 |
|
|
|
|
| 473 |
group_stats = calculate_group_stats(self.df, group_var, metric_var)
|
| 474 |
-
st.dataframe(group_stats, use_container_width=True)
|
| 475 |
|
| 476 |
-
#
|
| 477 |
unique_groups = self.df[group_var].nunique()
|
|
|
|
| 478 |
if unique_groups <= 20:
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
else:
|
| 483 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
|
| 489 |
-
def
|
| 490 |
-
"""
|
| 491 |
-
st.subheader("
|
| 492 |
|
| 493 |
-
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
with col1:
|
| 496 |
-
st.metric("Total Insights", len(self.insights))
|
| 497 |
with col2:
|
| 498 |
-
quality = "High" if self.stats['missing_values'] == 0 else "Medium"
|
| 499 |
st.metric("Data Quality", quality)
|
| 500 |
with col3:
|
| 501 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
-
# Insights
|
| 504 |
-
st.subheader("Key Insights")
|
| 505 |
-
|
| 506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
|
| 508 |
# Export options
|
| 509 |
-
st.subheader("Export Results")
|
| 510 |
-
export_format = st.selectbox("Choose export format:",
|
| 511 |
-
["Text Report", "Markdown Report", "Python Code", "Cleaned Data"])
|
| 512 |
-
|
| 513 |
-
if export_format == "Text Report":
|
| 514 |
-
report = self.generate_text_report()
|
| 515 |
-
st.download_button(
|
| 516 |
-
label="Download Text Report",
|
| 517 |
-
data=report,
|
| 518 |
-
file_name="analysis_report.txt",
|
| 519 |
-
mime="text/plain"
|
| 520 |
-
)
|
| 521 |
|
| 522 |
-
|
| 523 |
-
report = self.generate_markdown_report()
|
| 524 |
-
st.download_button(
|
| 525 |
-
label="Download Markdown Report",
|
| 526 |
-
data=report,
|
| 527 |
-
file_name="analysis_report.md",
|
| 528 |
-
mime="text/markdown"
|
| 529 |
-
)
|
| 530 |
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
else: # Parquet
|
| 567 |
-
parquet_buffer = BytesIO()
|
| 568 |
-
self.df.to_parquet(parquet_buffer, index=False)
|
| 569 |
-
parquet_data = parquet_buffer.getvalue()
|
| 570 |
-
st.download_button(
|
| 571 |
-
label="Download Parquet",
|
| 572 |
-
data=parquet_data,
|
| 573 |
-
file_name="cleaned_data.parquet",
|
| 574 |
-
mime="application/octet-stream"
|
| 575 |
-
)
|
| 576 |
-
except Exception as e:
|
| 577 |
-
st.error(f"Error exporting data: {str(e)}")
|
| 578 |
|
| 579 |
-
def
|
| 580 |
-
"""Generate
|
| 581 |
-
report = f"""
|
| 582 |
-
|
|
|
|
| 583 |
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
|
|
|
| 589 |
|
| 590 |
-
|
|
|
|
| 591 |
"""
|
| 592 |
-
for
|
| 593 |
-
report += f"
|
| 594 |
|
| 595 |
-
report += f"
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
|
|
|
| 607 |
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 612 |
|
| 613 |
-
|
|
|
|
|
|
|
| 614 |
"""
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
stage_insights = [i for i in self.insights if i['stage'] == stage]
|
| 618 |
-
if stage_insights:
|
| 619 |
-
report += f"\n### Stage {stage}\n"
|
| 620 |
-
for insight in stage_insights:
|
| 621 |
-
report += f"* {insight['insight']}\n"
|
| 622 |
-
|
| 623 |
-
report += f"\n\n*Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*"
|
| 624 |
-
return report
|
| 625 |
-
|
| 626 |
-
def generate_python_code(self) -> str:
|
| 627 |
-
"""Generate reproducible Python code"""
|
| 628 |
-
code = """import pandas as pd
|
| 629 |
-
import numpy as np
|
| 630 |
-
import plotly.express as px
|
| 631 |
-
from typing import Dict, List, Any
|
| 632 |
|
| 633 |
-
|
| 634 |
-
|
|
|
|
|
|
|
|
|
|
| 635 |
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
|
|
|
| 652 |
|
|
|
|
|
|
|
| 653 |
"""
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
for
|
| 658 |
-
if
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
|
| 671 |
-
|
| 672 |
-
# Apply to numeric columns as needed
|
| 673 |
-
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
| 674 |
-
for col in numeric_cols:
|
| 675 |
-
df = remove_outliers(df, col)
|
| 676 |
"""
|
|
|
|
|
|
|
|
|
|
| 677 |
|
| 678 |
-
|
| 679 |
-
code += """
|
| 680 |
-
# Visualizations
|
| 681 |
-
def plot_missing_values(df: pd.DataFrame):
|
| 682 |
-
missing = df.isnull().sum()
|
| 683 |
-
if missing.sum() > 0:
|
| 684 |
-
missing = missing[missing > 0]
|
| 685 |
-
fig = px.bar(x=missing.index, y=missing.values,
|
| 686 |
-
title='Missing Values by Column')
|
| 687 |
-
fig.show()
|
| 688 |
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
plot_correlations(df)
|
| 699 |
"""
|
| 700 |
|
| 701 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
import plotly.express as px
|
| 5 |
import plotly.graph_objects as go
|
| 6 |
+
import plotly.figure_factory as ff
|
| 7 |
+
from plotly.subplots import make_subplots
|
| 8 |
from typing import Dict, List, Any, Optional
|
| 9 |
import os
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
from data_handler import *
|
|
|
|
| 12 |
|
| 13 |
+
# ML imports
|
|
|
|
|
|
|
|
|
|
| 14 |
try:
|
| 15 |
+
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
| 16 |
+
from sklearn.linear_model import LogisticRegression, LinearRegression
|
| 17 |
+
from sklearn.model_selection import train_test_split
|
| 18 |
+
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
|
| 19 |
+
from sklearn.preprocessing import LabelEncoder
|
| 20 |
+
ML_AVAILABLE = True
|
| 21 |
except ImportError:
|
| 22 |
+
ML_AVAILABLE = False
|
| 23 |
|
| 24 |
+
# Load environment variables
|
| 25 |
+
load_dotenv()
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
class AIAssistant:
|
| 28 |
"""AI-powered analysis assistant"""
|
|
|
|
| 31 |
self.openai_key = os.getenv('OPENAI_API_KEY')
|
| 32 |
self.gemini_key = os.getenv('GOOGLE_API_KEY')
|
| 33 |
|
| 34 |
+
try:
|
| 35 |
+
import google.generativeai as genai
|
| 36 |
+
if self.gemini_key:
|
| 37 |
+
genai.configure(api_key=self.gemini_key)
|
| 38 |
+
self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
|
| 39 |
+
except ImportError:
|
| 40 |
+
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
|
| 43 |
"""Get AI analysis of insights"""
|
|
|
|
|
|
|
| 44 |
summary = f"""
|
| 45 |
Dataset Summary:
|
| 46 |
- Shape: {df.shape}
|
| 47 |
- Columns: {list(df.columns)}
|
|
|
|
| 48 |
|
| 49 |
+
Key Insights:
|
| 50 |
"""
|
| 51 |
|
| 52 |
for insight in insights:
|
|
|
|
| 54 |
|
| 55 |
prompt = f"""
|
| 56 |
As a senior data scientist, analyze this dataset and provide:
|
| 57 |
+
1. Business implications
|
| 58 |
+
2. Key opportunities and risks
|
| 59 |
+
3. Actionable recommendations
|
|
|
|
| 60 |
4. Suggestions for further analysis
|
| 61 |
|
| 62 |
{summary}
|
|
|
|
|
|
|
| 63 |
"""
|
| 64 |
|
| 65 |
try:
|
| 66 |
+
if hasattr(self, 'gemini_model'):
|
| 67 |
response = self.gemini_model.generate_content(prompt)
|
| 68 |
return response.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
else:
|
| 70 |
+
return "AI analysis not available. Please configure API keys in .env file."
|
| 71 |
except Exception as e:
|
| 72 |
return f"AI Analysis Error: {str(e)}"
|
| 73 |
|
| 74 |
class DataAnalysisWorkflow:
|
| 75 |
+
"""Enhanced data analysis workflow with ML capabilities"""
|
| 76 |
|
| 77 |
def __init__(self, df: pd.DataFrame):
|
| 78 |
self.df = df
|
| 79 |
+
self.original_df = df.copy() # Keep original for reference
|
| 80 |
self.stats = calculate_basic_stats(df)
|
| 81 |
self.column_types = get_column_types(df)
|
| 82 |
self.insights = []
|
| 83 |
+
self.ml_results = {}
|
| 84 |
|
| 85 |
def add_insight(self, insight: str, stage: int):
|
| 86 |
"""Add insight to analysis report"""
|
|
|
|
| 90 |
'timestamp': pd.Timestamp.now()
|
| 91 |
})
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
def stage_1_overview(self):
|
| 94 |
+
"""Stage 1: Enhanced Data Overview"""
|
| 95 |
st.subheader("📊 Data Overview")
|
| 96 |
|
| 97 |
+
# Key metrics with better formatting
|
|
|
|
| 98 |
col1, col2, col3, col4 = st.columns(4)
|
| 99 |
with col1:
|
| 100 |
+
st.metric("Total Rows", f"{self.stats['shape'][0]:,}")
|
| 101 |
with col2:
|
| 102 |
+
st.metric("Total Columns", f"{self.stats['shape'][1]:,}")
|
| 103 |
with col3:
|
| 104 |
+
missing_pct = (self.stats['missing_values'] / (self.stats['shape'][0] * self.stats['shape'][1])) * 100
|
| 105 |
+
st.metric("Missing Values", f"{self.stats['missing_values']:,}", f"{missing_pct:.1f}%")
|
| 106 |
with col4:
|
| 107 |
+
st.metric("Memory Usage", f"{self.stats['memory_usage']:.1f} MB")
|
| 108 |
|
| 109 |
+
# Enhanced data types visualization
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
if self.stats['dtypes']:
|
| 111 |
+
col1, col2 = st.columns(2)
|
| 112 |
+
|
| 113 |
+
with col1:
|
| 114 |
+
fig = px.pie(
|
| 115 |
+
values=list(self.stats['dtypes'].values()),
|
| 116 |
+
names=list(self.stats['dtypes'].keys()),
|
| 117 |
+
title="Data Types Distribution",
|
| 118 |
+
color_discrete_sequence=px.colors.qualitative.Set3
|
| 119 |
+
)
|
| 120 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 121 |
+
|
| 122 |
+
with col2:
|
| 123 |
+
# Column overview table
|
| 124 |
+
column_info = []
|
| 125 |
+
for col in self.df.columns:
|
| 126 |
+
column_info.append({
|
| 127 |
+
'Column': col,
|
| 128 |
+
'Type': str(self.df[col].dtype),
|
| 129 |
+
'Non-Null': self.df[col].notna().sum(),
|
| 130 |
+
'Unique': self.df[col].nunique()
|
| 131 |
+
})
|
| 132 |
+
|
| 133 |
+
info_df = pd.DataFrame(column_info)
|
| 134 |
+
st.subheader("Column Details")
|
| 135 |
+
st.dataframe(info_df, use_container_width=True, height=300)
|
| 136 |
|
| 137 |
+
# Enhanced data preview
|
| 138 |
+
st.subheader("Data Preview")
|
| 139 |
+
preview_option = st.radio(
|
| 140 |
+
"Preview type:",
|
| 141 |
+
["First 10 rows", "Last 10 rows", "Random sample", "Custom range"],
|
| 142 |
+
horizontal=True
|
| 143 |
+
)
|
| 144 |
|
| 145 |
+
if preview_option == "First 10 rows":
|
| 146 |
+
st.dataframe(self.df.head(10), use_container_width=True)
|
| 147 |
+
elif preview_option == "Last 10 rows":
|
| 148 |
+
st.dataframe(self.df.tail(10), use_container_width=True)
|
| 149 |
+
elif preview_option == "Random sample":
|
| 150 |
+
sample_size = min(10, len(self.df))
|
| 151 |
+
st.dataframe(self.df.sample(n=sample_size), use_container_width=True)
|
| 152 |
else:
|
| 153 |
+
col1, col2 = st.columns(2)
|
| 154 |
+
with col1:
|
| 155 |
+
start_row = st.number_input("Start row", 0, len(self.df)-1, 0)
|
| 156 |
+
with col2:
|
| 157 |
+
end_row = st.number_input("End row", start_row+1, len(self.df), min(start_row+10, len(self.df)))
|
| 158 |
+
st.dataframe(self.df.iloc[start_row:end_row], use_container_width=True)
|
| 159 |
|
| 160 |
# Missing values analysis
|
| 161 |
missing_df = calculate_missing_data(self.df)
|
| 162 |
if not missing_df.empty:
|
| 163 |
st.subheader("Missing Values Analysis")
|
| 164 |
+
|
| 165 |
+
# Visualize missing values
|
| 166 |
+
fig = px.bar(
|
| 167 |
+
missing_df,
|
| 168 |
+
x='Column',
|
| 169 |
+
y='Missing %',
|
| 170 |
+
title="Missing Values by Column",
|
| 171 |
+
color='Missing %',
|
| 172 |
+
color_continuous_scale='Reds'
|
| 173 |
+
)
|
| 174 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 175 |
+
|
| 176 |
st.dataframe(missing_df, use_container_width=True)
|
| 177 |
|
| 178 |
worst_column = missing_df.iloc[0]['Column']
|
| 179 |
worst_percentage = missing_df.iloc[0]['Missing %']
|
| 180 |
self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
|
| 181 |
else:
|
| 182 |
+
st.success("✅ No missing values found - Excellent data quality!")
|
| 183 |
+
self.add_insight("Dataset has perfect completeness with no missing values", 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
def stage_2_exploration(self):
|
| 186 |
+
"""Stage 2: Enhanced Exploratory Data Analysis"""
|
| 187 |
st.subheader("🔍 Exploratory Data Analysis")
|
| 188 |
|
| 189 |
numeric_cols = self.column_types['numeric']
|
| 190 |
categorical_cols = self.column_types['categorical']
|
| 191 |
|
| 192 |
+
# Numeric analysis with enhanced visualizations
|
| 193 |
if numeric_cols:
|
| 194 |
+
st.subheader("Numeric Variables Analysis")
|
|
|
|
| 195 |
|
| 196 |
+
# Multi-column selection
|
| 197 |
+
selected_numerics = st.multiselect(
|
| 198 |
+
"Select numeric columns for analysis:",
|
| 199 |
+
numeric_cols,
|
| 200 |
+
default=numeric_cols[:3] if len(numeric_cols) >= 3 else numeric_cols
|
| 201 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
+
if selected_numerics:
|
| 204 |
+
# Distribution plots
|
| 205 |
+
st.subheader("Distribution Analysis")
|
| 206 |
+
|
| 207 |
+
if len(selected_numerics) == 1:
|
| 208 |
+
col = selected_numerics[0]
|
| 209 |
+
col1, col2 = st.columns(2)
|
| 210 |
+
|
| 211 |
+
with col1:
|
| 212 |
+
fig = px.histogram(
|
| 213 |
+
self.df,
|
| 214 |
+
x=col,
|
| 215 |
+
marginal="box",
|
| 216 |
+
title=f"Distribution of {col}",
|
| 217 |
+
nbins=50
|
| 218 |
+
)
|
| 219 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 220 |
+
|
| 221 |
+
with col2:
|
| 222 |
+
# Q-Q plot
|
| 223 |
+
from scipy import stats
|
| 224 |
+
fig = go.Figure()
|
| 225 |
+
|
| 226 |
+
# Remove NaN values for Q-Q plot
|
| 227 |
+
clean_data = self.df[col].dropna()
|
| 228 |
+
if len(clean_data) > 0:
|
| 229 |
+
qq = stats.probplot(clean_data, dist="norm")
|
| 230 |
+
fig.add_trace(go.Scatter(
|
| 231 |
+
x=qq[0][0],
|
| 232 |
+
y=qq[0][1],
|
| 233 |
+
mode='markers',
|
| 234 |
+
name='Data points'
|
| 235 |
+
))
|
| 236 |
+
fig.add_trace(go.Scatter(
|
| 237 |
+
x=qq[0][0],
|
| 238 |
+
y=qq[1][1] + qq[1][0] * qq[0][0],
|
| 239 |
+
mode='lines',
|
| 240 |
+
name='Normal distribution line',
|
| 241 |
+
line=dict(color='red')
|
| 242 |
+
))
|
| 243 |
+
fig.update_layout(
|
| 244 |
+
title=f"Q-Q Plot: {col}",
|
| 245 |
+
xaxis_title="Theoretical Quantiles",
|
| 246 |
+
yaxis_title="Sample Quantiles"
|
| 247 |
+
)
|
| 248 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 249 |
+
|
| 250 |
+
else:
|
| 251 |
+
# Multiple distributions
|
| 252 |
+
fig = make_subplots(
|
| 253 |
+
rows=len(selected_numerics),
|
| 254 |
+
cols=1,
|
| 255 |
+
subplot_titles=selected_numerics,
|
| 256 |
+
vertical_spacing=0.05
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
for i, col in enumerate(selected_numerics, 1):
|
| 260 |
+
fig.add_trace(
|
| 261 |
+
go.Histogram(x=self.df[col], name=col, nbinsx=30),
|
| 262 |
+
row=i, col=1
|
| 263 |
+
)
|
| 264 |
+
|
| 265 |
+
fig.update_layout(height=200 * len(selected_numerics), showlegend=False)
|
| 266 |
st.plotly_chart(fig, use_container_width=True)
|
| 267 |
+
|
| 268 |
+
# Statistical summary
|
| 269 |
+
st.subheader("Statistical Summary")
|
| 270 |
+
summary_stats = self.df[selected_numerics].describe()
|
| 271 |
+
st.dataframe(summary_stats, use_container_width=True)
|
| 272 |
+
|
| 273 |
+
# Correlation analysis
|
| 274 |
+
if len(selected_numerics) > 1:
|
| 275 |
+
st.subheader("Correlation Analysis")
|
| 276 |
+
corr_matrix = self.df[selected_numerics].corr()
|
| 277 |
|
| 278 |
+
# Enhanced correlation heatmap
|
| 279 |
+
fig = px.imshow(
|
| 280 |
+
corr_matrix,
|
| 281 |
+
text_auto=True,
|
| 282 |
+
aspect="auto",
|
| 283 |
+
title="Correlation Matrix",
|
| 284 |
+
color_continuous_scale='RdBu',
|
| 285 |
+
zmin=-1, zmax=1
|
| 286 |
+
)
|
| 287 |
+
fig.update_layout(height=500)
|
| 288 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 289 |
+
|
| 290 |
+
# Find strongest correlations
|
| 291 |
+
corr_pairs = []
|
| 292 |
for i in range(len(corr_matrix.columns)):
|
| 293 |
for j in range(i+1, len(corr_matrix.columns)):
|
| 294 |
+
corr_val = corr_matrix.iloc[i, j]
|
| 295 |
+
if abs(corr_val) > 0.1: # Only show meaningful correlations
|
| 296 |
+
corr_pairs.append({
|
| 297 |
+
'Variable 1': corr_matrix.columns[i],
|
| 298 |
+
'Variable 2': corr_matrix.columns[j],
|
| 299 |
+
'Correlation': corr_val,
|
| 300 |
+
'Strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate' if abs(corr_val) > 0.3 else 'Weak'
|
| 301 |
+
})
|
| 302 |
|
| 303 |
+
if corr_pairs:
|
| 304 |
+
corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
|
| 305 |
+
st.subheader("Top Correlations")
|
| 306 |
+
st.dataframe(corr_df, use_container_width=True)
|
| 307 |
+
|
| 308 |
+
strongest = corr_df.iloc[0]
|
| 309 |
+
self.add_insight(
|
| 310 |
+
f"Strongest correlation: {strongest['Variable 1']} vs {strongest['Variable 2']} ({strongest['Correlation']:.3f})",
|
| 311 |
+
2
|
| 312 |
+
)
|
| 313 |
|
| 314 |
+
# Enhanced categorical analysis
|
| 315 |
if categorical_cols:
|
| 316 |
+
st.subheader("Categorical Variables Analysis")
|
| 317 |
selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
|
| 318 |
|
| 319 |
+
value_counts = get_value_counts(self.df, selected_categorical, 15) # Top 15
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
+
col1, col2 = st.columns(2)
|
| 322 |
+
|
| 323 |
+
with col1:
|
| 324 |
+
# Bar chart
|
| 325 |
+
fig = px.bar(
|
| 326 |
+
x=value_counts.values,
|
| 327 |
+
y=value_counts.index,
|
| 328 |
+
orientation='h',
|
| 329 |
+
title=f"Top Categories in {selected_categorical}",
|
| 330 |
+
color=value_counts.values,
|
| 331 |
+
color_continuous_scale='viridis'
|
| 332 |
+
)
|
| 333 |
+
fig.update_layout(height=400, yaxis={'categoryorder':'total ascending'})
|
| 334 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 335 |
+
|
| 336 |
+
with col2:
|
| 337 |
+
# Pie chart for top categories
|
| 338 |
+
top_5 = value_counts.head(5)
|
| 339 |
+
others = value_counts.iloc[5:].sum() if len(value_counts) > 5 else 0
|
| 340 |
+
|
| 341 |
+
if others > 0:
|
| 342 |
+
pie_data = list(top_5.values) + [others]
|
| 343 |
+
pie_labels = list(top_5.index) + ['Others']
|
| 344 |
+
else:
|
| 345 |
+
pie_data = list(top_5.values)
|
| 346 |
+
pie_labels = list(top_5.index)
|
| 347 |
+
|
| 348 |
+
fig = px.pie(
|
| 349 |
+
values=pie_data,
|
| 350 |
+
names=pie_labels,
|
| 351 |
+
title=f"Distribution of {selected_categorical}",
|
| 352 |
+
color_discrete_sequence=px.colors.qualitative.Set3
|
| 353 |
+
)
|
| 354 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 355 |
+
|
| 356 |
+
# Category statistics
|
| 357 |
total_categories = self.df[selected_categorical].nunique()
|
| 358 |
+
most_common = value_counts.index[0]
|
| 359 |
+
most_common_pct = (value_counts.iloc[0] / len(self.df)) * 100
|
| 360 |
+
|
| 361 |
+
st.metric("Total Unique Categories", total_categories)
|
| 362 |
+
st.metric("Most Common Category", f"{most_common} ({most_common_pct:.1f}%)")
|
| 363 |
+
|
| 364 |
+
self.add_insight(f"Column '{selected_categorical}' has {total_categories} categories, dominated by '{most_common}' ({most_common_pct:.1f}%)", 2)
|
| 365 |
|
| 366 |
+
def stage_3_quality_check(self):
|
| 367 |
+
"""Stage 3: Enhanced Data Quality Assessment"""
|
| 368 |
st.subheader("🧹 Data Quality Assessment")
|
| 369 |
|
| 370 |
+
quality_score = 100
|
| 371 |
+
issues = []
|
| 372 |
|
| 373 |
+
# Missing values check
|
| 374 |
if self.stats['missing_values'] > 0:
|
| 375 |
+
missing_pct = (self.stats['missing_values'] / (self.stats['shape'][0] * self.stats['shape'][1])) * 100
|
| 376 |
+
st.warning(f"⚠️ Found {self.stats['missing_values']:,} missing values ({missing_pct:.2f}%)")
|
| 377 |
+
quality_score -= min(missing_pct * 2, 30)
|
| 378 |
+
issues.append("Missing values detected")
|
| 379 |
+
else:
|
| 380 |
+
st.success("✅ No missing values")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
+
# Duplicates check
|
| 383 |
if self.stats['duplicates'] > 0:
|
| 384 |
+
dup_pct = (self.stats['duplicates'] / self.stats['shape'][0]) * 100
|
| 385 |
+
st.warning(f"⚠️ Found {self.stats['duplicates']:,} duplicate rows ({dup_pct:.2f}%)")
|
| 386 |
+
quality_score -= min(dup_pct * 3, 25)
|
| 387 |
+
issues.append("Duplicate rows found")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
else:
|
| 389 |
+
st.success("✅ No duplicate rows")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
|
| 391 |
+
# Outlier detection with enhanced visualization
|
| 392 |
numeric_cols = self.column_types['numeric']
|
| 393 |
if numeric_cols:
|
| 394 |
st.subheader("Outlier Detection")
|
|
|
|
| 395 |
|
| 396 |
+
outlier_summary = []
|
| 397 |
+
for col in numeric_cols:
|
| 398 |
+
outliers = calculate_outliers(self.df, col)
|
| 399 |
+
outlier_pct = (len(outliers) / len(self.df)) * 100
|
| 400 |
+
outlier_summary.append({
|
| 401 |
+
'Column': col,
|
| 402 |
+
'Outliers': len(outliers),
|
| 403 |
+
'Percentage': outlier_pct,
|
| 404 |
+
'Status': '⚠️ High' if outlier_pct > 10 else '⚡ Medium' if outlier_pct > 5 else '✅ Low'
|
| 405 |
+
})
|
| 406 |
|
| 407 |
+
outlier_df = pd.DataFrame(outlier_summary)
|
| 408 |
+
st.dataframe(outlier_df, use_container_width=True)
|
| 409 |
+
|
| 410 |
+
# Visualize outliers
|
| 411 |
+
selected_col = st.selectbox("Select column for detailed outlier analysis:", numeric_cols)
|
| 412 |
+
|
| 413 |
+
col1, col2 = st.columns(2)
|
| 414 |
+
|
| 415 |
+
with col1:
|
| 416 |
+
fig = px.box(
|
| 417 |
+
self.df,
|
| 418 |
+
y=selected_col,
|
| 419 |
+
title=f"Box Plot: {selected_col}",
|
| 420 |
+
points="outliers"
|
| 421 |
+
)
|
| 422 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 423 |
+
|
| 424 |
+
with col2:
|
| 425 |
+
# Outlier details
|
| 426 |
+
outliers = calculate_outliers(self.df, selected_col)
|
| 427 |
+
if len(outliers) > 0:
|
| 428 |
+
st.metric("Outliers Found", len(outliers))
|
| 429 |
+
st.metric("Outlier Percentage", f"{len(outliers)/len(self.df)*100:.2f}%")
|
| 430 |
+
|
| 431 |
+
if len(outliers) <= 100: # Show outlier values if not too many
|
| 432 |
+
st.subheader("Outlier Values")
|
| 433 |
+
st.dataframe(outliers[[selected_col]].head(20), use_container_width=True)
|
| 434 |
+
else:
|
| 435 |
+
st.success("✅ No outliers detected")
|
| 436 |
+
|
| 437 |
+
# Adjust quality score based on outliers
|
| 438 |
+
total_outlier_pct = sum([row['Percentage'] for row in outlier_summary]) / len(outlier_summary)
|
| 439 |
+
quality_score -= min(total_outlier_pct, 20)
|
| 440 |
+
|
| 441 |
+
# Data consistency checks
|
| 442 |
+
st.subheader("Data Consistency Analysis")
|
| 443 |
+
|
| 444 |
+
consistency_issues = []
|
| 445 |
+
|
| 446 |
+
# Check for mixed data types in object columns
|
| 447 |
+
for col in self.column_types['categorical']:
|
| 448 |
+
unique_types = set(type(x).__name__ for x in self.df[col].dropna().head(100))
|
| 449 |
+
if len(unique_types) > 1:
|
| 450 |
+
consistency_issues.append(f"Mixed data types in column '{col}': {unique_types}")
|
| 451 |
+
|
| 452 |
+
# Check for unusual string patterns
|
| 453 |
+
for col in self.column_types['categorical']:
|
| 454 |
+
sample_values = self.df[col].dropna().head(50).astype(str)
|
| 455 |
+
if sample_values.str.contains(r'^[0-9]+$').any() and sample_values.str.contains(r'[a-zA-Z]').any():
|
| 456 |
+
consistency_issues.append(f"Mixed numeric/text patterns in column '{col}'")
|
| 457 |
+
|
| 458 |
+
if consistency_issues:
|
| 459 |
+
for issue in consistency_issues:
|
| 460 |
+
st.warning(f"⚠️ {issue}")
|
| 461 |
+
quality_score -= len(consistency_issues) * 5
|
| 462 |
+
else:
|
| 463 |
+
st.success("✅ Data types are consistent")
|
| 464 |
+
|
| 465 |
+
# Overall quality score
|
| 466 |
+
st.subheader("Overall Data Quality Score")
|
| 467 |
+
quality_score = max(0, min(100, quality_score)) # Ensure 0-100 range
|
| 468 |
+
|
| 469 |
+
col1, col2, col3 = st.columns(3)
|
| 470 |
+
with col2:
|
| 471 |
+
if quality_score >= 90:
|
| 472 |
+
st.success(f"🏆 Excellent Quality: {quality_score:.0f}/100")
|
| 473 |
+
quality_level = "Excellent"
|
| 474 |
+
elif quality_score >= 75:
|
| 475 |
+
st.info(f"👍 Good Quality: {quality_score:.0f}/100")
|
| 476 |
+
quality_level = "Good"
|
| 477 |
+
elif quality_score >= 60:
|
| 478 |
+
st.warning(f"⚠️ Fair Quality: {quality_score:.0f}/100")
|
| 479 |
+
quality_level = "Fair"
|
| 480 |
else:
|
| 481 |
+
st.error(f"❌ Poor Quality: {quality_score:.0f}/100")
|
| 482 |
+
quality_level = "Poor"
|
| 483 |
+
|
| 484 |
+
# Action recommendations
|
| 485 |
+
if issues:
|
| 486 |
+
st.subheader("📋 Recommended Actions")
|
| 487 |
+
for i, issue in enumerate(issues, 1):
|
| 488 |
+
st.write(f"{i}. Address {issue}")
|
| 489 |
+
|
| 490 |
+
self.add_insight(f"Data quality: {quality_level} ({quality_score:.0f}/100) - {len(issues)} issues identified", 3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
else:
|
| 492 |
+
st.success("🎉 No major data quality issues found!")
|
| 493 |
+
self.add_insight(f"Excellent data quality ({quality_score:.0f}/100) with no major issues", 3)
|
| 494 |
|
| 495 |
+
def stage_4_advanced_analysis(self):
|
| 496 |
+
"""Stage 4: Advanced Statistical Analysis"""
|
| 497 |
st.subheader("🔬 Advanced Analysis")
|
| 498 |
|
| 499 |
numeric_cols = self.column_types['numeric']
|
| 500 |
categorical_cols = self.column_types['categorical']
|
| 501 |
|
| 502 |
+
# Advanced relationship analysis
|
| 503 |
if len(numeric_cols) >= 2:
|
| 504 |
+
st.subheader("🔗 Advanced Relationship Analysis")
|
| 505 |
+
|
| 506 |
+
# Scatter plot matrix for multiple variables
|
| 507 |
+
if len(numeric_cols) >= 3:
|
| 508 |
+
st.subheader("Scatter Plot Matrix")
|
| 509 |
+
selected_vars = st.multiselect(
|
| 510 |
+
"Select variables for scatter plot matrix:",
|
| 511 |
+
numeric_cols,
|
| 512 |
+
default=numeric_cols[:4] if len(numeric_cols) >= 4 else numeric_cols
|
| 513 |
+
)
|
| 514 |
+
|
| 515 |
+
if len(selected_vars) >= 2:
|
| 516 |
+
# Sample data for performance
|
| 517 |
+
sample_size = min(1000, len(self.df))
|
| 518 |
+
sample_df = self.df[selected_vars].sample(n=sample_size) if len(self.df) > sample_size else self.df[selected_vars]
|
| 519 |
+
|
| 520 |
+
fig = px.scatter_matrix(
|
| 521 |
+
sample_df,
|
| 522 |
+
dimensions=selected_vars,
|
| 523 |
+
title="Scatter Plot Matrix"
|
| 524 |
+
)
|
| 525 |
+
fig.update_layout(height=600)
|
| 526 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 527 |
|
| 528 |
+
# Pairwise analysis
|
| 529 |
+
st.subheader("Detailed Pairwise Analysis")
|
| 530 |
col1, col2 = st.columns(2)
|
| 531 |
with col1:
|
| 532 |
+
x_var = st.selectbox("X Variable:", numeric_cols, key="x_var_advanced")
|
| 533 |
with col2:
|
| 534 |
+
y_var = st.selectbox("Y Variable:", [col for col in numeric_cols if col != x_var], key="y_var_advanced")
|
|
|
|
| 535 |
|
| 536 |
+
# Color by categorical variable option
|
| 537 |
+
color_var = None
|
| 538 |
+
if categorical_cols:
|
| 539 |
+
use_color = st.checkbox("Color by categorical variable")
|
| 540 |
+
if use_color:
|
| 541 |
+
color_var = st.selectbox("Color variable:", categorical_cols)
|
| 542 |
+
|
| 543 |
+
# Create enhanced scatter plot
|
| 544 |
sample_size = min(5000, len(self.df))
|
| 545 |
+
plot_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
|
| 546 |
|
| 547 |
+
fig = px.scatter(
|
| 548 |
+
plot_df,
|
| 549 |
+
x=x_var,
|
| 550 |
+
y=y_var,
|
| 551 |
+
color=color_var,
|
| 552 |
+
title=f"Advanced Analysis: {x_var} vs {y_var}",
|
| 553 |
+
trendline="ols",
|
| 554 |
+
marginal_x="histogram",
|
| 555 |
+
marginal_y="histogram"
|
| 556 |
+
)
|
| 557 |
st.plotly_chart(fig, use_container_width=True)
|
| 558 |
|
| 559 |
+
# Statistical analysis
|
| 560 |
correlation = self.df[x_var].corr(self.df[y_var])
|
|
|
|
| 561 |
|
| 562 |
+
col1, col2, col3 = st.columns(3)
|
| 563 |
+
with col1:
|
| 564 |
+
st.metric("Correlation", f"{correlation:.3f}")
|
| 565 |
+
with col2:
|
| 566 |
+
r_squared = correlation ** 2
|
| 567 |
+
st.metric("R²", f"{r_squared:.3f}")
|
| 568 |
+
with col3:
|
| 569 |
+
if abs(correlation) > 0.7:
|
| 570 |
+
strength = "Strong"
|
| 571 |
+
elif abs(correlation) > 0.3:
|
| 572 |
+
strength = "Moderate"
|
| 573 |
+
else:
|
| 574 |
+
strength = "Weak"
|
| 575 |
+
st.metric("Relationship", strength)
|
| 576 |
|
| 577 |
+
self.add_insight(f"Advanced analysis: {strength} relationship between {x_var} and {y_var} (r={correlation:.3f})", 4)
|
|
|
|
|
|
|
| 578 |
|
| 579 |
+
# Group comparison analysis
|
| 580 |
if categorical_cols and numeric_cols:
|
| 581 |
+
st.subheader("📊 Group Comparison Analysis")
|
| 582 |
|
| 583 |
col1, col2 = st.columns(2)
|
| 584 |
with col1:
|
| 585 |
+
group_var = st.selectbox("Group by:", categorical_cols, key="group_var_advanced")
|
| 586 |
with col2:
|
| 587 |
+
metric_var = st.selectbox("Analyze metric:", numeric_cols, key="metric_var_advanced")
|
| 588 |
|
| 589 |
+
# Calculate group statistics
|
| 590 |
group_stats = calculate_group_stats(self.df, group_var, metric_var)
|
|
|
|
| 591 |
|
| 592 |
+
# Enhanced group visualization
|
| 593 |
unique_groups = self.df[group_var].nunique()
|
| 594 |
+
|
| 595 |
if unique_groups <= 20:
|
| 596 |
+
col1, col2 = st.columns(2)
|
| 597 |
+
|
| 598 |
+
with col1:
|
| 599 |
+
# Box plot
|
| 600 |
+
fig = px.box(
|
| 601 |
+
self.df,
|
| 602 |
+
x=group_var,
|
| 603 |
+
y=metric_var,
|
| 604 |
+
title=f"{metric_var} Distribution by {group_var}",
|
| 605 |
+
points="outliers"
|
| 606 |
+
)
|
| 607 |
+
fig.update_xaxes(tickangle=45)
|
| 608 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 609 |
+
|
| 610 |
+
with col2:
|
| 611 |
+
# Violin plot
|
| 612 |
+
fig = px.violin(
|
| 613 |
+
self.df,
|
| 614 |
+
x=group_var,
|
| 615 |
+
y=metric_var,
|
| 616 |
+
title=f"{metric_var} Density by {group_var}",
|
| 617 |
+
box=True
|
| 618 |
+
)
|
| 619 |
+
fig.update_xaxes(tickangle=45)
|
| 620 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 621 |
+
|
| 622 |
+
# Statistical comparison
|
| 623 |
+
st.subheader("Statistical Comparison")
|
| 624 |
+
st.dataframe(group_stats, use_container_width=True)
|
| 625 |
+
|
| 626 |
+
# Identify best performing group
|
| 627 |
+
best_group = group_stats['mean'].idxmax()
|
| 628 |
+
best_value = group_stats.loc[best_group, 'mean']
|
| 629 |
+
worst_group = group_stats['mean'].idxmin()
|
| 630 |
+
worst_value = group_stats.loc[worst_group, 'mean']
|
| 631 |
+
|
| 632 |
+
col1, col2 = st.columns(2)
|
| 633 |
+
with col1:
|
| 634 |
+
st.metric("Best Performing Group", best_group, f"Avg: {best_value:.2f}")
|
| 635 |
+
with col2:
|
| 636 |
+
st.metric("Lowest Performing Group", worst_group, f"Avg: {worst_value:.2f}")
|
| 637 |
+
|
| 638 |
+
self.add_insight(f"Group analysis: '{best_group}' performs best with average {metric_var} of {best_value:.2f}", 4)
|
| 639 |
+
else:
|
| 640 |
+
st.info(f"Too many groups ({unique_groups}) for detailed visualization. Showing summary statistics only.")
|
| 641 |
+
st.dataframe(group_stats.head(15), use_container_width=True)
|
| 642 |
+
|
| 643 |
+
def stage_5_ml_modeling(self):
|
| 644 |
+
"""Stage 5: Machine Learning Modeling"""
|
| 645 |
+
st.subheader("🤖 Machine Learning Modeling")
|
| 646 |
+
|
| 647 |
+
if not ML_AVAILABLE:
|
| 648 |
+
st.warning("⚠️ Machine Learning libraries not available. Please install scikit-learn to use this feature.")
|
| 649 |
+
st.code("pip install scikit-learn")
|
| 650 |
+
return
|
| 651 |
+
|
| 652 |
+
numeric_cols = self.column_types['numeric']
|
| 653 |
+
categorical_cols = self.column_types['categorical']
|
| 654 |
+
|
| 655 |
+
if len(numeric_cols) < 2:
|
| 656 |
+
st.warning("⚠️ Need at least 2 numeric columns for ML modeling.")
|
| 657 |
+
return
|
| 658 |
+
|
| 659 |
+
st.info("🎯 Automated machine learning model training and evaluation")
|
| 660 |
+
|
| 661 |
+
# Model configuration
|
| 662 |
+
st.subheader("Model Configuration")
|
| 663 |
+
|
| 664 |
+
col1, col2 = st.columns(2)
|
| 665 |
+
with col1:
|
| 666 |
+
target_column = st.selectbox(
|
| 667 |
+
"Select target variable (what to predict):",
|
| 668 |
+
numeric_cols + categorical_cols
|
| 669 |
+
)
|
| 670 |
+
|
| 671 |
+
with col2:
|
| 672 |
+
model_type = st.radio(
|
| 673 |
+
"Problem type:",
|
| 674 |
+
["Auto-detect", "Regression", "Classification"]
|
| 675 |
+
)
|
| 676 |
+
|
| 677 |
+
# Feature selection
|
| 678 |
+
available_features = [col for col in numeric_cols if col != target_column]
|
| 679 |
+
if len(available_features) == 0:
|
| 680 |
+
st.error("❌ No suitable features available for modeling.")
|
| 681 |
+
return
|
| 682 |
+
|
| 683 |
+
selected_features = st.multiselect(
|
| 684 |
+
"Select features (leave empty for auto-selection):",
|
| 685 |
+
available_features,
|
| 686 |
+
default=available_features[:5] if len(available_features) >= 5 else available_features
|
| 687 |
+
)
|
| 688 |
+
|
| 689 |
+
if not selected_features:
|
| 690 |
+
selected_features = available_features[:10] # Auto-select top 10
|
| 691 |
+
|
| 692 |
+
if st.button("🚀 Train Models", type="primary"):
|
| 693 |
+
try:
|
| 694 |
+
with st.spinner("Training machine learning models..."):
|
| 695 |
+
self._train_ml_models(target_column, selected_features, model_type)
|
| 696 |
+
|
| 697 |
+
st.success("✅ Models trained successfully!")
|
| 698 |
+
|
| 699 |
+
except Exception as e:
|
| 700 |
+
st.error(f"❌ Model training failed: {str(e)}")
|
| 701 |
+
|
| 702 |
+
# Display results if available
|
| 703 |
+
if hasattr(self, 'ml_results') and self.ml_results:
|
| 704 |
+
self._display_ml_results()
|
| 705 |
+
|
| 706 |
+
def _train_ml_models(self, target_col: str, feature_cols: List[str], model_type: str):
|
| 707 |
+
"""Train ML models"""
|
| 708 |
+
# Prepare data
|
| 709 |
+
X = self.df[feature_cols].copy()
|
| 710 |
+
y = self.df[target_col].copy()
|
| 711 |
+
|
| 712 |
+
# Handle missing values
|
| 713 |
+
X = X.fillna(X.mean())
|
| 714 |
+
y = y.fillna(y.mean() if y.dtype in ['int64', 'float64'] else y.mode()[0])
|
| 715 |
+
|
| 716 |
+
# Auto-detect problem type
|
| 717 |
+
if model_type == "Auto-detect":
|
| 718 |
+
if y.dtype == 'object' or y.nunique() < 10:
|
| 719 |
+
model_type = "Classification"
|
| 720 |
else:
|
| 721 |
+
model_type = "Regression"
|
| 722 |
+
|
| 723 |
+
# Encode categorical target if needed
|
| 724 |
+
label_encoder = None
|
| 725 |
+
if model_type == "Classification" and y.dtype == 'object':
|
| 726 |
+
label_encoder = LabelEncoder()
|
| 727 |
+
y = label_encoder.fit_transform(y)
|
| 728 |
+
|
| 729 |
+
# Split data
|
| 730 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
| 731 |
+
X, y, test_size=0.2, random_state=42, stratify=y if model_type == "Classification" else None
|
| 732 |
+
)
|
| 733 |
+
|
| 734 |
+
# Train models
|
| 735 |
+
models = {}
|
| 736 |
+
results = {}
|
| 737 |
+
|
| 738 |
+
if model_type == "Regression":
|
| 739 |
+
models = {
|
| 740 |
+
"Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
|
| 741 |
+
"Linear Regression": LinearRegression()
|
| 742 |
+
}
|
| 743 |
+
else:
|
| 744 |
+
models = {
|
| 745 |
+
"Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
|
| 746 |
+
"Logistic Regression": LogisticRegression(random_state=42, max_iter=1000)
|
| 747 |
+
}
|
| 748 |
+
|
| 749 |
+
for name, model in models.items():
|
| 750 |
+
# Train model
|
| 751 |
+
model.fit(X_train, y_train)
|
| 752 |
+
|
| 753 |
+
# Make predictions
|
| 754 |
+
y_pred = model.predict(X_test)
|
| 755 |
+
|
| 756 |
+
# Calculate metrics
|
| 757 |
+
if model_type == "Regression":
|
| 758 |
+
r2 = r2_score(y_test, y_pred)
|
| 759 |
+
mse = mean_squared_error(y_test, y_pred)
|
| 760 |
+
results[name] = {
|
| 761 |
+
"R² Score": r2,
|
| 762 |
+
"MSE": mse,
|
| 763 |
+
"RMSE": np.sqrt(mse)
|
| 764 |
+
}
|
| 765 |
+
else:
|
| 766 |
+
accuracy = accuracy_score(y_test, y_pred)
|
| 767 |
+
results[name] = {
|
| 768 |
+
"Accuracy": accuracy
|
| 769 |
+
}
|
| 770 |
|
| 771 |
+
# Feature importance
|
| 772 |
+
if hasattr(model, 'feature_importances_'):
|
| 773 |
+
feature_importance = pd.DataFrame({
|
| 774 |
+
'feature': feature_cols,
|
| 775 |
+
'importance': model.feature_importances_
|
| 776 |
+
}).sort_values('importance', ascending=False)
|
| 777 |
+
results[name]['feature_importance'] = feature_importance

        # Store results
        self.ml_results = {
            'model_type': model_type,
            'target_column': target_col,
            'feature_columns': feature_cols,
            'results': results,
            'label_encoder': label_encoder,
            'test_size': len(X_test)
        }

        # Add insight for the best model, ranked by its primary metric
        primary_metric = 'R² Score' if model_type == "Regression" else 'Accuracy'
        best_model = max(results, key=lambda name: results[name][primary_metric])
        best_score = results[best_model][primary_metric]

        self.add_insight(f"ML modeling: Best {model_type.lower()} model is {best_model} with score {best_score:.3f}", 5)

    def _display_ml_results(self):
        """Display ML modeling results"""
        st.subheader("🎯 Model Performance Results")

        results = self.ml_results['results']
        model_type = self.ml_results['model_type']

        # Performance comparison
        performance_data = []
        for model_name, metrics in results.items():
            row = {'Model': model_name}
            for metric, value in metrics.items():
                if metric != 'feature_importance':
                    row[metric] = value
            performance_data.append(row)

        performance_df = pd.DataFrame(performance_data)
        st.dataframe(performance_df, use_container_width=True)
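        # The comparison table is built dynamically from whatever metrics each
        # model reported, so regression and classification runs both render.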

        # Visualize performance
        if model_type == "Regression":
            metric_to_plot = "R² Score"
        else:
            metric_to_plot = "Accuracy"

        fig = px.bar(
            performance_df,
            x='Model',
            y=metric_to_plot,
            title=f"Model Performance Comparison ({metric_to_plot})",
            color=metric_to_plot,
            color_continuous_scale='viridis'
        )
        st.plotly_chart(fig, use_container_width=True)

        # Feature importance analysis
        st.subheader("🔍 Feature Importance Analysis")

        # Get feature importance from best model
        best_model = max(results, key=lambda name: results[name][metric_to_plot])

        if 'feature_importance' in results[best_model]:
            importance_df = results[best_model]['feature_importance']

            col1, col2 = st.columns(2)

            with col1:
                # Bar plot
                fig = px.bar(
                    importance_df.head(10),
                    x='importance',
                    y='feature',
                    orientation='h',
                    title=f"Top 10 Feature Importance ({best_model})",
                    color='importance',
                    color_continuous_scale='plasma'
                )
                fig.update_layout(yaxis={'categoryorder': 'total ascending'})
                st.plotly_chart(fig, use_container_width=True)

            with col2:
                # Show importance table
                st.subheader("Feature Rankings")
                st.dataframe(importance_df.head(10), use_container_width=True)

            # Top features insight
            top_feature = importance_df.iloc[0]['feature']
            top_importance = importance_df.iloc[0]['importance']
            self.add_insight(f"Most important feature: '{top_feature}' (importance: {top_importance:.3f})", 5)

        # Model recommendations
        st.subheader("📋 Model Recommendations")

        best_score = results[best_model][metric_to_plot]

        if model_type == "Regression":
            if best_score > 0.8:
                st.success(f"🏆 Excellent model performance! {best_model} explains {best_score*100:.1f}% of the variance.")
            elif best_score > 0.6:
                st.info(f"👍 Good model performance. {best_model} explains {best_score*100:.1f}% of the variance.")
            else:
                st.warning("⚠️ Model performance could be improved. Consider feature engineering or more advanced models.")
        else:
            if best_score > 0.9:
                st.success(f"🏆 Excellent classification accuracy: {best_score*100:.1f}%")
            elif best_score > 0.8:
                st.info(f"👍 Good classification accuracy: {best_score*100:.1f}%")
            else:
                st.warning(f"⚠️ Classification accuracy could be improved: {best_score*100:.1f}%")
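        # The 0.6/0.8 R² and 0.8/0.9 accuracy cutoffs are rough rules of thumb,
        # not domain-calibrated thresholds; adjust them to the use case.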

    def stage_6_summary(self):
        """Stage 6: Enhanced Summary and Export"""
        st.subheader("📈 Analysis Summary & Export")

        # Key metrics overview
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Total Insights Generated", len(self.insights))
        with col2:
            quality = "High" if self.stats['missing_values'] == 0 and self.stats['duplicates'] == 0 else "Medium"
            st.metric("Data Quality", quality)
        with col3:
            analysis_completeness = "100%" if len(self.insights) >= 5 else f"{len(self.insights)*20}%"
            st.metric("Analysis Complete", analysis_completeness)
        with col4:
            ml_status = "✅" if hasattr(self, 'ml_results') and self.ml_results else "➖"
            st.metric("ML Models", ml_status)

        # Insights timeline
        st.subheader("🔍 Key Insights Timeline")

        insights_by_stage = {}
        for insight in self.insights:
            stage = insight['stage']
            if stage not in insights_by_stage:
                insights_by_stage[stage] = []
            insights_by_stage[stage].append(insight)

        stage_names = {
            1: "📊 Data Overview",
            2: "🔍 Exploration",
            3: "🧹 Quality Check",
            4: "🔬 Advanced Analysis",
            5: "🤖 ML Modeling",
            6: "📈 Summary"
        }

        for stage_num in sorted(insights_by_stage.keys()):
            with st.expander(f"{stage_names.get(stage_num, f'Stage {stage_num}')} - {len(insights_by_stage[stage_num])} insights"):
                for i, insight in enumerate(insights_by_stage[stage_num], 1):
                    st.write(f"{i}. {insight['insight']}")
                    st.caption(f"Generated: {insight['timestamp'].strftime('%H:%M:%S')}")

        # Executive summary with AI
        st.subheader("🤖 AI-Powered Executive Summary")

        ai_assistant = AIAssistant()

        if st.button("Generate AI Summary", type="primary"):
            with st.spinner("Generating AI-powered analysis summary..."):
                ai_summary = ai_assistant.analyze_insights(self.df, self.insights)

                st.markdown("### 📋 Executive Summary")
                st.markdown(ai_summary)

                # Store AI summary for export
                self.ai_summary = ai_summary

        # Export options
        st.subheader("📥 Export Results")

        col1, col2, col3 = st.columns(3)

        with col1:
            if st.button("📄 Generate Report"):
                report = self._generate_comprehensive_report()
                st.download_button(
                    label="📥 Download Analysis Report",
                    data=report,
                    file_name=f"analysis_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.txt",
                    mime="text/plain"
                )

        with col2:
            if st.button("📊 Export Data Summary"):
                summary_data = self._generate_data_summary()
                st.download_button(
                    label="📥 Download Data Summary (CSV)",
                    data=summary_data.to_csv(index=False),
                    file_name=f"data_summary_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
                    mime="text/csv"
                )

        with col3:
            if hasattr(self, 'ml_results') and self.ml_results:
                if st.button("🤖 Export ML Results"):
                    ml_report = self._generate_ml_report()
                    st.download_button(
                        label="📥 Download ML Report",
                        data=ml_report,
                        file_name=f"ml_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.txt",
                        mime="text/plain"
                    )
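        # Streamlit reruns the script on every interaction, so a download_button
        # nested inside a regular button's branch vanishes on the next rerun;
        # rendering the download buttons unconditionally would avoid that quirk.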

        # Analysis completion celebration
        if len(self.insights) >= 5:
            st.balloons()
            st.success("🎉 Comprehensive analysis completed successfully!")

    def _generate_comprehensive_report(self) -> str:
        """Generate comprehensive analysis report"""
        report = f"""
COMPREHENSIVE DATA ANALYSIS REPORT
{'='*50}

DATASET OVERVIEW
{'-'*20}
• Dataset Shape: {self.stats['shape'][0]:,} rows × {self.stats['shape'][1]:,} columns
• Memory Usage: {self.stats['memory_usage']:.2f} MB
• Missing Values: {self.stats['missing_values']:,} ({self.stats['missing_values']/(self.stats['shape'][0]*self.stats['shape'][1])*100:.2f}%)
• Duplicate Rows: {self.stats['duplicates']:,}

DATA TYPES DISTRIBUTION
{'-'*25}
"""
        for dtype, count in self.stats['dtypes'].items():
            report += f"• {dtype}: {count} columns\n"

        report += f"""
KEY INSIGHTS BY ANALYSIS STAGE
{'-'*35}
"""

        stage_names = {
            1: "Data Overview",
            2: "Exploratory Analysis",
            3: "Quality Assessment",
            4: "Advanced Analysis",
            5: "Machine Learning",
            6: "Summary"
        }

        for i, insight in enumerate(self.insights, 1):
            stage_name = stage_names.get(insight['stage'], f"Stage {insight['stage']}")
            report += f"\n{i}. [{stage_name}] {insight['insight']}"

        # Add ML results if available
        if hasattr(self, 'ml_results') and self.ml_results:
            report += f"""

MACHINE LEARNING RESULTS
{'-'*25}
• Problem Type: {self.ml_results['model_type']}
• Target Variable: {self.ml_results['target_column']}
• Features Used: {len(self.ml_results['feature_columns'])}
• Test Set Size: {self.ml_results['test_size']} samples

Model Performance:
"""
            for model_name, metrics in self.ml_results['results'].items():
                report += f"\n{model_name}:\n"
                for metric, value in metrics.items():
                    if metric != 'feature_importance':
                        report += f"  • {metric}: {value:.4f}\n"

        # Add AI summary if available
        if hasattr(self, 'ai_summary'):
            report += f"""

AI-POWERED EXECUTIVE SUMMARY
{'-'*30}
{self.ai_summary}
"""

        report += f"""

ANALYSIS METADATA
{'-'*18}
• Total Insights Generated: {len(self.insights)}
• Analysis Completion Time: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
• Platform: Enhanced Data Analysis Platform v2.0

{'-'*50}
Report generated automatically by Enhanced Data Analysis Platform
"""

        return report

    def _generate_data_summary(self) -> pd.DataFrame:
        """Generate per-column data summary for export"""
        summary_data = []

        for col in self.df.columns:
            col_info = {
                'Column': col,
                'Data_Type': str(self.df[col].dtype),
                'Non_Null_Count': self.df[col].notna().sum(),
                'Missing_Count': self.df[col].isna().sum(),
                'Missing_Percentage': (self.df[col].isna().sum() / len(self.df)) * 100,
                'Unique_Values': self.df[col].nunique(),
                'Most_Common_Value': str(self.df[col].mode().iloc[0]) if not self.df[col].mode().empty else 'N/A'
            }

            # Numeric summary stats; is_numeric_dtype covers all numeric dtypes
            # (int32, float32, ...), not just int64/float64
            if pd.api.types.is_numeric_dtype(self.df[col]):
                col_info.update({
                    'Mean': self.df[col].mean(),
                    'Median': self.df[col].median(),
                    'Std_Dev': self.df[col].std(),
                    'Min_Value': self.df[col].min(),
                    'Max_Value': self.df[col].max()
                })

            summary_data.append(col_info)

        return pd.DataFrame(summary_data)

    def _generate_ml_report(self) -> str:
        """Generate ML-specific report"""
        if not hasattr(self, 'ml_results') or not self.ml_results:
            return "No ML results available."

        ml_report = f"""
MACHINE LEARNING ANALYSIS REPORT
{'='*40}

MODEL CONFIGURATION
{'-'*20}
• Problem Type: {self.ml_results['model_type']}
• Target Variable: {self.ml_results['target_column']}
• Number of Features: {len(self.ml_results['feature_columns'])}
• Features Used: {', '.join(self.ml_results['feature_columns'])}
• Test Set Size: {self.ml_results['test_size']} samples

MODEL PERFORMANCE RESULTS
{'-'*27}
"""

        for model_name, metrics in self.ml_results['results'].items():
            ml_report += f"\n{model_name}:\n"
            for metric, value in metrics.items():
                if metric != 'feature_importance':
                    ml_report += f"  • {metric}: {value:.6f}\n"

        # Add feature importance for the best model, ranked by its primary
        # metric (R² for regression, accuracy for classification)
        primary_metric = 'R² Score' if self.ml_results['model_type'] == "Regression" else 'Accuracy'
        best_model = max(self.ml_results['results'],
                         key=lambda name: self.ml_results['results'][name][primary_metric])

        if 'feature_importance' in self.ml_results['results'][best_model]:
            ml_report += f"""
FEATURE IMPORTANCE ANALYSIS ({best_model})
{'-'*35}
"""
            importance_df = self.ml_results['results'][best_model]['feature_importance']
            for _, row in importance_df.head(10).iterrows():
                ml_report += f"• {row['feature']}: {row['importance']:.6f}\n"

        ml_report += f"""

RECOMMENDATIONS
{'-'*15}
"""

        if self.ml_results['model_type'] == "Regression":
            best_score = max(metrics.get('R² Score', 0) for metrics in self.ml_results['results'].values())
            if best_score > 0.8:
                ml_report += "• Excellent model performance - ready for production use\n"
            elif best_score > 0.6:
                ml_report += "• Good model performance - consider feature engineering for improvement\n"
            else:
                ml_report += "• Model performance needs improvement - try advanced algorithms or more features\n"
        else:
            best_score = max(metrics.get('Accuracy', 0) for metrics in self.ml_results['results'].values())
            if best_score > 0.9:
                ml_report += "• Excellent classification accuracy - model ready for deployment\n"
            elif best_score > 0.8:
                ml_report += "• Good classification performance - minor optimizations recommended\n"
            else:
                ml_report += "• Classification accuracy needs improvement - consider ensemble methods\n"

        ml_report += f"""

{'-'*40}
ML Report generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

        return ml_report
|