RamAi2026 commited on
Commit
da8e446
·
verified ·
1 Parent(s): 717a86b

Upload 13 files

Browse files
Files changed (13) hide show
  1. README.MD +0 -0
  2. app.py +1023 -0
  3. chatbot.py +1051 -0
  4. data_preprocessing.py +387 -0
  5. data_quality.py +252 -0
  6. dataset_overview.py +1159 -0
  7. explainability.py +176 -0
  8. insights.py +369 -0
  9. ml_pipeline.py +940 -0
  10. requirements.txt +16 -0
  11. statistical_analysis.py +928 -0
  12. utils.py +208 -0
  13. visualization.py +435 -0
README.MD ADDED
Binary file (7.64 kB). View file
 
app.py ADDED
@@ -0,0 +1,1023 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.io as pio
4
+ import traceback
5
+ import sys
6
+ from datetime import datetime
7
+
8
+ from data_preprocessing import preprocess_data
9
+ from insights import generate_business_insights
10
+ from dataset_overview import eda_analysis # Updated import
11
+ from visualization import auto_visualizations
12
+ from ml_pipeline import run_ml_pipeline
13
+ from statistical_analysis import statistical_analysis
14
+ from data_quality import quality_report
15
+ from chatbot import data_chatbot
16
+
17
+ # Set plotly template
18
+ pio.templates.default = "plotly_white"
19
+
20
+ # ---------------------------------------
21
+ # PAGE CONFIG
22
+ # ---------------------------------------
23
+
24
+ st.set_page_config(
25
+ page_title="AI Data Analyst Pro",
26
+ layout="wide",
27
+ page_icon="📊",
28
+ initial_sidebar_state="expanded"
29
+ )
30
+
31
+ # ---------------------------------------
32
+ # CUSTOM ERROR HANDLER
33
+ # ---------------------------------------
34
+
35
class StreamlitExceptionHandler:
    """Custom exception handler for Streamlit.

    Turns a raised exception into a user-friendly markdown message with
    remediation hints, keyed on the exception's class name.
    """

    @staticmethod
    def handle_exception(e, context="application"):
        """Handle exceptions with user-friendly messages.

        Parameters
        ----------
        e : Exception
            The exception that was raised.
        context : str
            Human-readable label for where the error occurred
            (e.g. "file upload", "EDA").

        Returns
        -------
        str
            Markdown text describing the error and possible solutions,
            suitable for rendering with ``st.markdown``.
        """
        # type(e).__name__ yields the bare class name (e.g. "EmptyDataError",
        # never "pd.errors.EmptyDataError") — the matches below must use the
        # bare names or the pandas-specific branches are unreachable.
        error_type = type(e).__name__
        error_msg = str(e)

        # Create user-friendly error message
        user_message = f"""
### ❌ An error occurred in the {context}

**Error Type:** {error_type}

**What happened:** {error_msg if error_msg else "An unexpected error occurred"}

**Possible solutions:**
"""

        # Add specific solutions based on error type
        if "MemoryError" in error_type:
            user_message += """
- Your dataset might be too large. Try uploading a smaller file.
- Close other applications to free up memory.
- Consider sampling your data before uploading.
"""
        elif "KeyError" in error_type or "IndexError" in error_type:
            user_message += """
- The requested column or index doesn't exist in your dataset.
- Check if you've selected valid columns for the operation.
- Try refreshing the page and uploading your data again.
"""
        elif "ValueError" in error_type:
            user_message += """
- The data values don't match the expected format.
- Check for invalid values in your dataset (e.g., text in numeric columns).
- Ensure your data types are correct for the selected operation.
"""
        elif "TypeError" in error_type:
            user_message += """
- There's a mismatch in data types.
- Check if you're mixing numeric and text data in operations.
- Use the preprocessing tab to convert data types appropriately.
"""
        elif "FileNotFoundError" in error_type:
            user_message += """
- The file couldn't be found. Please upload it again.
- Check if the file path is correct.
"""
        elif "PermissionError" in error_type:
            user_message += """
- Permission denied when accessing the file.
- Make sure the file isn't open in another program.
"""
        # BUGFIX: previously compared against "pd.errors.EmptyDataError" /
        # "pd.errors.ParserError", which never appear in __name__, so these
        # branches could never fire.
        elif "EmptyDataError" in error_type:
            user_message += """
- The uploaded file is empty.
- Please upload a file containing data.
"""
        elif "ParserError" in error_type:
            user_message += """
- Couldn't parse the file. Check if it's a valid CSV or Excel file.
- Ensure the file format matches the selected file type.
"""
        else:
            user_message += """
- Try refreshing the page and uploading your data again.
- Check if your data format is compatible with the operation.
- If the problem persists, try with a smaller sample of your data.
"""

        # Add technical details in an expander for debugging
        user_message += f"""

**Technical Details:**
"""

        return user_message
114
+
115
+ # Initialize session state for error tracking
116
+ if "error_log" not in st.session_state:
117
+ st.session_state.error_log = []
118
+
119
+ if "last_successful_operation" not in st.session_state:
120
+ st.session_state.last_successful_operation = None
121
+
122
+ # ---------------------------------------
123
+ # ADVANCED CSS WITH RESPONSIVE DESIGN
124
+ # ---------------------------------------
125
+
126
+ st.markdown("""
127
+ <style>
128
+ /* Global Styles */
129
+ .main {
130
+ padding: 0rem 1rem;
131
+ }
132
+
133
+ /* Header Styling */
134
+ .header-container {
135
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
136
+ padding: 2rem;
137
+ border-radius: 20px;
138
+ margin-bottom: 2rem;
139
+ color: white;
140
+ text-align: center;
141
+ box-shadow: 0 10px 30px rgba(0,0,0,0.2);
142
+ }
143
+
144
+ .header-title {
145
+ font-size: 2.5rem;
146
+ font-weight: 700;
147
+ margin-bottom: 0.5rem;
148
+ animation: fadeInDown 1s;
149
+ }
150
+
151
+ .header-subtitle {
152
+ font-size: 1.1rem;
153
+ opacity: 0.95;
154
+ animation: fadeInUp 1s;
155
+ }
156
+
157
+ /* Card Styling */
158
+ .custom-card {
159
+ background: white;
160
+ padding: 1.5rem;
161
+ border-radius: 15px;
162
+ box-shadow: 0 5px 15px rgba(0,0,0,0.08);
163
+ margin-bottom: 1.5rem;
164
+ border: 1px solid rgba(0,0,0,0.05);
165
+ transition: transform 0.3s, box-shadow 0.3s;
166
+ }
167
+
168
+ .custom-card:hover {
169
+ transform: translateY(-5px);
170
+ box-shadow: 0 8px 25px rgba(0,0,0,0.15);
171
+ }
172
+
173
+ /* Error Message Styling */
174
+ .error-container {
175
+ background: linear-gradient(135deg, #ff6b6b 0%, #ff4757 100%);
176
+ color: white;
177
+ padding: 1.5rem;
178
+ border-radius: 15px;
179
+ margin: 1rem 0;
180
+ box-shadow: 0 10px 30px rgba(255, 71, 87, 0.3);
181
+ animation: slideInRight 0.5s;
182
+ }
183
+
184
+ .error-title {
185
+ font-size: 1.5rem;
186
+ font-weight: 700;
187
+ margin-bottom: 1rem;
188
+ }
189
+
190
+ .error-solution {
191
+ background: rgba(255, 255, 255, 0.2);
192
+ padding: 1rem;
193
+ border-radius: 10px;
194
+ margin-top: 1rem;
195
+ }
196
+
197
+ /* Success Message Styling */
198
+ .success-container {
199
+ background: linear-gradient(135deg, #51cf66 0%, #37b24d 100%);
200
+ color: white;
201
+ padding: 1rem;
202
+ border-radius: 10px;
203
+ margin: 1rem 0;
204
+ animation: fadeInUp 0.5s;
205
+ }
206
+
207
+ /* Warning Message Styling */
208
+ .warning-container {
209
+ background: linear-gradient(135deg, #ffd43b 0%, #fcc419 100%);
210
+ color: #2c3e50;
211
+ padding: 1rem;
212
+ border-radius: 10px;
213
+ margin: 1rem 0;
214
+ animation: fadeInUp 0.5s;
215
+ }
216
+
217
+ /* Metric Cards */
218
+ .metric-card {
219
+ background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
220
+ padding: 1.2rem;
221
+ border-radius: 12px;
222
+ text-align: center;
223
+ border-left: 4px solid #667eea;
224
+ }
225
+
226
+ .metric-value {
227
+ font-size: 2rem;
228
+ font-weight: 700;
229
+ color: #2c3e50;
230
+ margin: 0.5rem 0;
231
+ }
232
+
233
+ .metric-label {
234
+ font-size: 0.9rem;
235
+ color: #7f8c8d;
236
+ text-transform: uppercase;
237
+ letter-spacing: 1px;
238
+ }
239
+
240
+ /* Chatbot Styling */
241
+ .chat-container {
242
+ max-width: 800px;
243
+ margin: 2rem auto;
244
+ background: #f8f9fa;
245
+ border-radius: 20px;
246
+ padding: 1.5rem;
247
+ box-shadow: 0 5px 20px rgba(0,0,0,0.1);
248
+ }
249
+
250
+ .user-message {
251
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
252
+ color: white;
253
+ padding: 12px 18px;
254
+ border-radius: 20px 20px 5px 20px;
255
+ margin: 10px 0;
256
+ max-width: 80%;
257
+ margin-left: auto;
258
+ animation: slideInRight 0.5s;
259
+ }
260
+
261
+ .bot-message {
262
+ background: white;
263
+ color: #2c3e50;
264
+ padding: 12px 18px;
265
+ border-radius: 20px 20px 20px 5px;
266
+ margin: 10px 0;
267
+ max-width: 80%;
268
+ box-shadow: 0 2px 5px rgba(0,0,0,0.1);
269
+ animation: slideInLeft 0.5s;
270
+ }
271
+
272
+ /* Loading Spinner */
273
+ .custom-spinner {
274
+ border: 4px solid #f3f3f3;
275
+ border-top: 4px solid #667eea;
276
+ border-radius: 50%;
277
+ width: 40px;
278
+ height: 40px;
279
+ animation: spin 1s linear infinite;
280
+ margin: 20px auto;
281
+ }
282
+
283
+ @keyframes spin {
284
+ 0% { transform: rotate(0deg); }
285
+ 100% { transform: rotate(360deg); }
286
+ }
287
+
288
+ /* Animations */
289
+ @keyframes fadeInDown {
290
+ from {
291
+ opacity: 0;
292
+ transform: translateY(-20px);
293
+ }
294
+ to {
295
+ opacity: 1;
296
+ transform: translateY(0);
297
+ }
298
+ }
299
+
300
+ @keyframes fadeInUp {
301
+ from {
302
+ opacity: 0;
303
+ transform: translateY(20px);
304
+ }
305
+ to {
306
+ opacity: 1;
307
+ transform: translateY(0);
308
+ }
309
+ }
310
+
311
+ @keyframes slideInRight {
312
+ from {
313
+ opacity: 0;
314
+ transform: translateX(30px);
315
+ }
316
+ to {
317
+ opacity: 1;
318
+ transform: translateX(0);
319
+ }
320
+ }
321
+
322
+ @keyframes slideInLeft {
323
+ from {
324
+ opacity: 0;
325
+ transform: translateX(-30px);
326
+ }
327
+ to {
328
+ opacity: 1;
329
+ transform: translateX(0);
330
+ }
331
+ }
332
+
333
+ /* Responsive Design */
334
+ @media (max-width: 768px) {
335
+ .header-title {
336
+ font-size: 1.8rem;
337
+ }
338
+
339
+ .metric-value {
340
+ font-size: 1.5rem;
341
+ }
342
+
343
+ .user-message, .bot-message {
344
+ max-width: 95%;
345
+ }
346
+ }
347
+
348
+ /* Sidebar Styling */
349
+ .css-1d391kg {
350
+ background: linear-gradient(180deg, #f8f9fa 0%, #e9ecef 100%);
351
+ }
352
+
353
+ /* Button Styling */
354
+ .stButton > button {
355
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
356
+ color: white;
357
+ border: none;
358
+ padding: 0.5rem 2rem;
359
+ border-radius: 25px;
360
+ font-weight: 600;
361
+ transition: transform 0.2s, box-shadow 0.2s;
362
+ }
363
+
364
+ .stButton > button:hover {
365
+ transform: translateY(-2px);
366
+ box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
367
+ }
368
+
369
+ .stButton > button:disabled {
370
+ opacity: 0.5;
371
+ cursor: not-allowed;
372
+ }
373
+
374
+ /* Progress Bar */
375
+ .stProgress > div > div {
376
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
377
+ }
378
+
379
+ /* Tabs */
380
+ .stTabs [data-baseweb="tab-list"] {
381
+ gap: 2rem;
382
+ background-color: #f8f9fa;
383
+ padding: 0.5rem;
384
+ border-radius: 30px;
385
+ }
386
+
387
+ .stTabs [data-baseweb="tab"] {
388
+ border-radius: 25px;
389
+ padding: 0.5rem 2rem;
390
+ }
391
+
392
+ /* Tooltip */
393
+ .tooltip {
394
+ position: relative;
395
+ display: inline-block;
396
+ cursor: help;
397
+ }
398
+
399
+ .tooltip .tooltiptext {
400
+ visibility: hidden;
401
+ width: 200px;
402
+ background-color: #555;
403
+ color: #fff;
404
+ text-align: center;
405
+ border-radius: 6px;
406
+ padding: 5px;
407
+ position: absolute;
408
+ z-index: 1;
409
+ bottom: 125%;
410
+ left: 50%;
411
+ margin-left: -100px;
412
+ opacity: 0;
413
+ transition: opacity 0.3s;
414
+ }
415
+
416
+ .tooltip:hover .tooltiptext {
417
+ visibility: visible;
418
+ opacity: 1;
419
+ }
420
+ </style>
421
+ """, unsafe_allow_html=True)
422
+
423
+ # ---------------------------------------
424
+ # HEADER WITH ANIMATION
425
+ # ---------------------------------------
426
+
427
+ st.markdown("""
428
+ <div class="header-container">
429
+ <div class="header-title">📊 AI Data Analyst Pro</div>
430
+ <div class="header-subtitle">Intelligent Data Analysis & Visualization Platform</div>
431
+ </div>
432
+ """, unsafe_allow_html=True)
433
+
434
+ # ---------------------------------------
435
+ # SESSION STATE INITIALIZATION
436
+ # ---------------------------------------
437
+
438
+ if "data" not in st.session_state:
439
+ st.session_state.data = None
440
+
441
+ if "processed_data" not in st.session_state:
442
+ st.session_state.processed_data = None
443
+
444
+ if "uploaded_file_name" not in st.session_state:
445
+ st.session_state.uploaded_file_name = None
446
+
447
+ if "upload_error" not in st.session_state:
448
+ st.session_state.upload_error = None
449
+
450
+ if "data_loaded" not in st.session_state:
451
+ st.session_state.data_loaded = False
452
+
453
+ if "operation_status" not in st.session_state:
454
+ st.session_state.operation_status = {}
455
+
456
+ # ---------------------------------------
457
+ # HELPER FUNCTIONS
458
+ # ---------------------------------------
459
+
460
def safe_dataframe_operation(func, df, *args, **kwargs):
    """Execute ``func(df, *args, **kwargs)`` and report errors gracefully.

    Returns a ``(result, error)`` pair: on success ``error`` is ``None``
    and the operation name is recorded in session state; on failure
    ``result`` is ``None`` and ``error`` is a user-friendly markdown
    message built by ``StreamlitExceptionHandler``.
    """
    try:
        outcome = func(df, *args, **kwargs)
        st.session_state.last_successful_operation = func.__name__
        return outcome, None
    except Exception as exc:
        return None, StreamlitExceptionHandler.handle_exception(exc, func.__name__)
469
+
470
def validate_dataset(df):
    """Scan *df* for common problems and return a list of warning strings.

    Flags empty datasets, memory footprints above 1 GB, and object
    columns whose values span more than one Python type. An empty list
    means no issues were detected.
    """
    warnings = []

    # Emptiness checks (an empty frame can trip all three).
    if df.empty:
        warnings.append("The dataset is empty")
    rows, cols = df.shape
    if rows == 0:
        warnings.append("No rows in the dataset")
    if cols == 0:
        warnings.append("No columns in the dataset")

    # Check for memory issues: warn above 1 GB — operations may be slow.
    gb_used = df.memory_usage(deep=True).sum() / 1024**3
    if gb_used > 1:
        warnings.append(f"Large dataset detected ({gb_used:.2f} GB). Some operations may be slow.")

    # Object columns holding values of more than one Python type.
    for name in df.columns:
        if df[name].dtype != 'object':
            continue
        kinds = df[name].apply(type).unique()
        if len(kinds) > 1:
            warnings.append(f"Column '{name}' has mixed data types: {kinds}")

    return warnings
497
+
498
def show_validation_warnings(issues):
    """Display validation warnings.

    Renders each string in *issues* (e.g. the list returned by
    ``validate_dataset``) as a bullet inside a single styled
    "warning-container" HTML banner via ``st.markdown``.
    No-op when *issues* is empty or falsy.
    """
    if issues:
        # Concatenate the warnings into one HTML block; unsafe_allow_html
        # is required for the custom <div> styling to take effect.
        st.markdown("""
<div class="warning-container">
<strong>⚠️ Data Quality Warnings:</strong><br>
""" + "<br>".join([f"• {issue}" for issue in issues]) + """
</div>
""", unsafe_allow_html=True)
507
+
508
+ # ---------------------------------------
509
+ # SIDEBAR WITH ENHANCED NAVIGATION
510
+ # ---------------------------------------
511
+
512
+ with st.sidebar:
513
+ st.markdown("### 🧭 Navigation")
514
+
515
+ # Custom radio buttons styling
516
+ page = st.radio(
517
+ "Select Module",
518
+ ["📤 Upload Dataset", "🛠️ Preprocessing", "🔍 EDA",
519
+ "📈 Visualization", "🤖 Machine Learning", "💡 Insights",
520
+ "💬 Chatbot", "📋 Data Quality", "📐 Statistical Analysis"],
521
+ label_visibility="collapsed"
522
+ )
523
+
524
+ st.markdown("---")
525
+
526
+ # Dataset info in sidebar
527
+ if st.session_state.data is not None:
528
+ st.markdown("### 📂 Current Dataset")
529
+ df = st.session_state.data
530
+ col1, col2 = st.columns(2)
531
+ with col1:
532
+ st.metric("Rows", f"{df.shape[0]:,}")
533
+ with col2:
534
+ st.metric("Columns", df.shape[1])
535
+
536
+ # Show data quality indicator
537
+ missing_pct = (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
538
+ if missing_pct == 0:
539
+ st.success("✅ Data Quality: Excellent")
540
+ elif missing_pct < 5:
541
+ st.info(f"ℹ️ Data Quality: Good ({missing_pct:.1f}% missing)")
542
+ elif missing_pct < 20:
543
+ st.warning(f"⚠️ Data Quality: Fair ({missing_pct:.1f}% missing)")
544
+ else:
545
+ st.error(f"❌ Data Quality: Poor ({missing_pct:.1f}% missing)")
546
+
547
+ # Quick actions
548
+ st.markdown("### ⚡ Quick Actions")
549
+
550
+ col1, col2 = st.columns(2)
551
+ with col1:
552
+ if st.button("🔄 Reset Data", use_container_width=True):
553
+ st.session_state.data = None
554
+ st.session_state.processed_data = None
555
+ st.session_state.data_loaded = False
556
+ st.rerun()
557
+
558
+ with col2:
559
+ if st.button("📥 Download Sample", use_container_width=True):
560
+ # Create sample data download
561
+ sample_df = df.head(100)
562
+ csv = sample_df.to_csv(index=False)
563
+ st.download_button(
564
+ label="Download Sample",
565
+ data=csv,
566
+ file_name="sample_data.csv",
567
+ mime="text/csv"
568
+ )
569
+
570
+ # Show operation history
571
+ if st.session_state.operation_status:
572
+ with st.expander("📋 Operation History"):
573
+ for op, status in st.session_state.operation_status.items():
574
+ if status == "success":
575
+ st.success(f"✅ {op}")
576
+ elif status == "error":
577
+ st.error(f"❌ {op}")
578
+ else:
579
+ st.info(f"⏳ {op}")
580
+ else:
581
+ st.info("👆 Upload a dataset to get started")
582
+
583
+ # ---------------------------------------
584
+ # MAIN CONTENT AREA
585
+ # ---------------------------------------
586
+
587
# Map page names to functions
# Translates the sidebar radio labels (emoji + title) into the short
# internal keys consumed by the if/elif page dispatch below.
page_map = {
    "📤 Upload Dataset": "upload",
    "🛠️ Preprocessing": "preprocess",
    "🔍 EDA": "eda",
    "📈 Visualization": "visualization",
    "🤖 Machine Learning": "ml",
    "💡 Insights": "insights",
    "💬 Chatbot": "chatbot",
    "📋 Data Quality": "quality",
    "📐 Statistical Analysis": "statistical"
}

# `page` is the label selected in the sidebar radio; resolve it to the
# dispatch key. KeyError here would indicate the radio options and
# page_map have drifted apart.
current_page = page_map[page]
601
+
602
+ # ---------------------------------------
603
+ # UPLOAD DATASET PAGE
604
+ # ---------------------------------------
605
+
606
+ if current_page == "upload":
607
+
608
+ col1, col2, col3 = st.columns([1, 2, 1])
609
+
610
+ with col2:
611
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
612
+ st.markdown("### 📂 Upload Your Dataset")
613
+
614
+ # File uploader with size limit warning
615
+ file = st.file_uploader(
616
+ "Choose a CSV or Excel file",
617
+ type=["csv", "xlsx"],
618
+ help="Maximum recommended file size: 200MB. Larger files may cause performance issues."
619
+ )
620
+
621
+ if file:
622
+ try:
623
+ # Check file size
624
+ file_size = file.size / 1024**2 # MB
625
+ if file_size > 200:
626
+ st.warning(f"⚠️ Large file detected ({file_size:.2f} MB). Processing may be slow.")
627
+
628
+ with st.spinner("📂 Loading file..."):
629
+ # Read file based on extension
630
+ if file.name.endswith("csv"):
631
+ # Try different encodings
632
+ encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
633
+ df = None
634
+
635
+ for encoding in encodings:
636
+ try:
637
+ df = pd.read_csv(file, encoding=encoding)
638
+ break
639
+ except UnicodeDecodeError:
640
+ continue
641
+
642
+ if df is None:
643
+ st.error("❌ Could not read CSV file with any common encoding.")
644
+ st.stop()
645
+
646
+ elif file.name.endswith(("xlsx", "xls")):
647
+ try:
648
+ df = pd.read_excel(file)
649
+ except Exception as e:
650
+ st.error(f"❌ Error reading Excel file: {str(e)}")
651
+ st.info("💡 Try saving the file as CSV and uploading again.")
652
+ st.stop()
653
+
654
+ # Validate dataset
655
+ issues = validate_dataset(df)
656
+ show_validation_warnings(issues)
657
+
658
+ if not issues or all("Large dataset" not in issue for issue in issues):
659
+ # Store in session state
660
+ st.session_state.data = df
661
+ st.session_state.uploaded_file_name = file.name
662
+ st.session_state.data_loaded = True
663
+ st.session_state.upload_error = None
664
+
665
+ # Show success message
666
+ st.markdown("""
667
+ <div class="success-container">
668
+ <strong>✅ Successfully loaded:</strong> {}<br>
669
+ <strong>📊 Shape:</strong> {} rows × {} columns
670
+ </div>
671
+ """.format(file.name, df.shape[0], df.shape[1]), unsafe_allow_html=True)
672
+
673
+ # File statistics
674
+ st.markdown("### 📊 File Statistics")
675
+ col1, col2, col3 = st.columns(3)
676
+ with col1:
677
+ st.metric("Total Rows", f"{df.shape[0]:,}")
678
+ with col2:
679
+ st.metric("Total Columns", df.shape[1])
680
+ with col3:
681
+ memory = df.memory_usage(deep=True).sum() / 1024**2
682
+ st.metric("Memory Usage", f"{memory:.2f} MB")
683
+
684
+ # Data preview with scroll
685
+ st.markdown("### 👁️ Data Preview")
686
+ st.dataframe(
687
+ df.head(10),
688
+ use_container_width=True,
689
+ height=300
690
+ )
691
+
692
+ # Column info with sorting
693
+ st.markdown("### 📋 Column Information")
694
+ col_info = pd.DataFrame({
695
+ 'Column': df.columns,
696
+ 'Type': df.dtypes.astype(str),
697
+ 'Non-Null Count': df.count().values,
698
+ 'Null Count': df.isnull().sum().values,
699
+ 'Null %': (df.isnull().sum().values / len(df) * 100).round(2),
700
+ 'Unique Values': [df[col].nunique() for col in df.columns]
701
+ })
702
+
703
+ # Sort by null count
704
+ col_info = col_info.sort_values('Null %', ascending=False)
705
+
706
+ st.dataframe(
707
+ col_info.style.background_gradient(subset=['Null %'], cmap='YlOrRd'),
708
+ use_container_width=True
709
+ )
710
+
711
+ # Quick stats
712
+ st.markdown("### 📈 Quick Statistics")
713
+
714
+ numeric_cols = df.select_dtypes(include=['number']).columns
715
+ if len(numeric_cols) > 0:
716
+ st.dataframe(
717
+ df[numeric_cols].describe(),
718
+ use_container_width=True
719
+ )
720
+
721
+ # Navigation buttons
722
+ st.markdown("### 🚀 Next Steps")
723
+ col1, col2, col3 = st.columns(3)
724
+
725
+ with col1:
726
+ if st.button("🛠️ Go to Preprocessing", use_container_width=True):
727
+ st.session_state.page = "🛠️ Preprocessing"
728
+ st.rerun()
729
+
730
+ with col2:
731
+ if st.button("📊 Go to EDA", use_container_width=True):
732
+ st.session_state.page = "📊 EDA"
733
+ st.rerun()
734
+
735
+ with col3:
736
+ if st.button("📈 Go to Visualization", use_container_width=True):
737
+ st.session_state.page = "📈 Visualization"
738
+ st.rerun()
739
+
740
+ except pd.errors.EmptyDataError:
741
+ st.error("❌ The uploaded file is empty. Please upload a file with data.")
742
+ except pd.errors.ParserError as e:
743
+ st.error(f"❌ Error parsing file: {str(e)}")
744
+ st.info("💡 Check if your CSV file has consistent delimiters and quoting.")
745
+ except MemoryError:
746
+ st.error("❌ Out of memory! The file is too large to process.")
747
+ st.info("💡 Try uploading a smaller file or sampling your data first.")
748
+ except Exception as e:
749
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "file upload")
750
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
751
+
752
+ # Log error
753
+ st.session_state.error_log.append({
754
+ 'timestamp': datetime.now(),
755
+ 'error': str(e),
756
+ 'traceback': traceback.format_exc()
757
+ })
758
+
759
+ st.markdown('</div>', unsafe_allow_html=True)
760
+
761
+ # Sample data option
762
+ with st.expander("🔄 Or use sample data"):
763
+ st.markdown("Don't have a dataset? Try our sample data:")
764
+
765
+ if st.button("Load Sample Dataset", use_container_width=True):
766
+ try:
767
+ from utils import create_sample_dataset
768
+ sample_df = create_sample_dataset()
769
+ st.session_state.data = sample_df
770
+ st.session_state.uploaded_file_name = "sample_dataset.csv"
771
+ st.session_state.data_loaded = True
772
+ st.success("✅ Sample dataset loaded successfully!")
773
+ st.rerun()
774
+ except Exception as e:
775
+ st.error(f"❌ Error loading sample data: {str(e)}")
776
+
777
+ # ---------------------------------------
778
+ # PREPROCESSING PAGE
779
+ # ---------------------------------------
780
+
781
+ elif current_page == "preprocess":
782
+ try:
783
+ if st.session_state.data is not None:
784
+ df = st.session_state.data
785
+
786
+ # Validate data before preprocessing
787
+ issues = validate_dataset(df)
788
+ if issues:
789
+ show_validation_warnings(issues)
790
+
791
+ # Run preprocessing with error handling
792
+ with st.spinner("🔄 Preprocessing data..."):
793
+ processed_df, error = safe_dataframe_operation(preprocess_data, df)
794
+
795
+ if error:
796
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
797
+ st.session_state.operation_status['Preprocessing'] = 'error'
798
+ else:
799
+ st.session_state.processed_data = processed_df
800
+ st.session_state.operation_status['Preprocessing'] = 'success'
801
+
802
+ # Show success message
803
+ st.markdown("""
804
+ <div class="success-container">
805
+ <strong>✅ Preprocessing completed successfully!</strong><br>
806
+ You can now proceed to analysis or visualization.
807
+ </div>
808
+ """, unsafe_allow_html=True)
809
+ else:
810
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
811
+ except Exception as e:
812
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "preprocessing")
813
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
814
+
815
+ # ---------------------------------------
816
+ # EDA PAGE
817
+ # ---------------------------------------
818
+
819
+ elif current_page == "eda":
820
+ try:
821
+ if st.session_state.data is not None:
822
+ df = st.session_state.data
823
+
824
+ # Validate data
825
+ issues = validate_dataset(df)
826
+ if issues:
827
+ show_validation_warnings(issues)
828
+
829
+ # Run EDA with error handling
830
+ with st.spinner("🔍 Performing Exploratory Data Analysis..."):
831
+ result, error = safe_dataframe_operation(eda_analysis, df)
832
+
833
+ if error:
834
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
835
+ st.session_state.operation_status['EDA'] = 'error'
836
+ else:
837
+ st.session_state.operation_status['EDA'] = 'success'
838
+ else:
839
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
840
+ except Exception as e:
841
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "EDA")
842
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
843
+
844
+ # ---------------------------------------
845
+ # VISUALIZATION PAGE
846
+ # ---------------------------------------
847
+
848
+ elif current_page == "visualization":
849
+ try:
850
+ if st.session_state.data is not None:
851
+ df = st.session_state.data
852
+
853
+ # Validate data
854
+ issues = validate_dataset(df)
855
+ if issues:
856
+ show_validation_warnings(issues)
857
+
858
+ # Run visualization with error handling
859
+ with st.spinner("📊 Generating visualizations..."):
860
+ result, error = safe_dataframe_operation(auto_visualizations, df)
861
+
862
+ if error:
863
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
864
+ st.session_state.operation_status['Visualization'] = 'error'
865
+ else:
866
+ st.session_state.operation_status['Visualization'] = 'success'
867
+ else:
868
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
869
+ except Exception as e:
870
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "visualization")
871
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
872
+
873
+ # ---------------------------------------
874
+ # MACHINE LEARNING PAGE
875
+ # ---------------------------------------
876
+
877
+ elif current_page == "ml":
878
+ try:
879
+ if st.session_state.data is not None:
880
+ data_to_use = st.session_state.processed_data if st.session_state.processed_data is not None else st.session_state.data
881
+
882
+ # Validate data for ML
883
+ if data_to_use.shape[0] < 10:
884
+ st.warning("⚠️ Dataset too small for machine learning (need at least 10 rows)")
885
+ else:
886
+ # Run ML pipeline with error handling
887
+ with st.spinner("🤖 Running machine learning pipeline..."):
888
+ result, error = safe_dataframe_operation(run_ml_pipeline, data_to_use)
889
+
890
+ if error:
891
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
892
+ st.session_state.operation_status['ML'] = 'error'
893
+ else:
894
+ st.session_state.operation_status['ML'] = 'success'
895
+ else:
896
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
897
+ except Exception as e:
898
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "machine learning")
899
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
900
+
901
+ # ---------------------------------------
902
+ # INSIGHTS PAGE
903
+ # ---------------------------------------
904
+
905
+ elif current_page == "insights":
906
+ try:
907
+ if st.session_state.data is not None:
908
+ df = st.session_state.data
909
+
910
+ # Generate insights with error handling
911
+ with st.spinner("💡 Generating business insights..."):
912
+ result, error = safe_dataframe_operation(generate_business_insights, df)
913
+
914
+ if error:
915
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
916
+ st.session_state.operation_status['Insights'] = 'error'
917
+ else:
918
+ st.session_state.operation_status['Insights'] = 'success'
919
+ else:
920
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
921
+ except Exception as e:
922
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "insights generation")
923
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
924
+
925
+ # ---------------------------------------
926
+ # CHATBOT PAGE
927
+ # ---------------------------------------
928
+
929
+ elif current_page == "chatbot":
930
+ try:
931
+ if st.session_state.data is not None:
932
+ df = st.session_state.data
933
+
934
+ # Run chatbot with error handling
935
+ with st.spinner("🤖 Initializing chatbot..."):
936
+ result, error = safe_dataframe_operation(data_chatbot, df)
937
+
938
+ if error:
939
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
940
+ st.session_state.operation_status['Chatbot'] = 'error'
941
+ else:
942
+ st.session_state.operation_status['Chatbot'] = 'success'
943
+ else:
944
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
945
+ except Exception as e:
946
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "chatbot")
947
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
948
+
949
+ # ---------------------------------------
950
+ # DATA QUALITY PAGE
951
+ # ---------------------------------------
952
+
953
+ elif current_page == "quality":
954
+ try:
955
+ if st.session_state.data is not None:
956
+ df = st.session_state.data
957
+
958
+ # Run quality report with error handling
959
+ with st.spinner("📋 Generating quality report..."):
960
+ from data_quality import quality_report
961
+ result, error = safe_dataframe_operation(quality_report, df)
962
+
963
+ if error:
964
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
965
+ st.session_state.operation_status['Data Quality'] = 'error'
966
+ else:
967
+ st.session_state.operation_status['Data Quality'] = 'success'
968
+ else:
969
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
970
+ except Exception as e:
971
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "data quality")
972
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
973
+
974
+ # ---------------------------------------
975
+ # STATISTICAL ANALYSIS PAGE
976
+ # ---------------------------------------
977
+
978
+ elif current_page == "statistical":
979
+ try:
980
+ if st.session_state.data is not None:
981
+ df = st.session_state.data
982
+
983
+ # Validate numeric data
984
+ numeric_cols = df.select_dtypes(include=['number']).columns
985
+ if len(numeric_cols) == 0:
986
+ st.warning("⚠️ No numeric columns found. Statistical analysis requires numeric data.")
987
+ else:
988
+ # Run statistical analysis with error handling
989
+ with st.spinner("📐 Performing statistical analysis..."):
990
+ from statistical_analysis import statistical_analysis
991
+ result, error = safe_dataframe_operation(statistical_analysis, df)
992
+
993
+ if error:
994
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
995
+ st.session_state.operation_status['Statistical Analysis'] = 'error'
996
+ else:
997
+ st.session_state.operation_status['Statistical Analysis'] = 'success'
998
+ else:
999
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
1000
+ except Exception as e:
1001
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "statistical analysis")
1002
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
1003
+
1004
+ # ---------------------------------------
1005
+ # ERROR LOG DISPLAY (Hidden by default)
1006
+ # ---------------------------------------
1007
+
1008
+ if st.session_state.error_log and st.checkbox("🔧 Show Error Log (Debug Mode)"):
1009
+ st.markdown("### 📋 Error Log")
1010
+ for i, error_entry in enumerate(st.session_state.error_log[-5:]): # Show last 5 errors
1011
+ with st.expander(f"Error {i+1}: {error_entry['timestamp'].strftime('%Y-%m-%d %H:%M:%S')}"):
1012
+ st.code(error_entry['error'])
1013
+ st.code(error_entry['traceback'])
1014
+
1015
+ # ---------------------------------------
1016
+ # FOOTER
1017
+ # ---------------------------------------
1018
+
1019
+ st.markdown("---")
1020
+ st.markdown(
1021
+ "<p style='text-align: center; color: gray;'>Made with ❤️ using Streamlit | Version 2.0 | Enhanced Error Handling</p>",
1022
+ unsafe_allow_html=True
1023
+ )
chatbot.py ADDED
@@ -0,0 +1,1051 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from plotly.subplots import make_subplots
7
+ import re
8
+ from datetime import datetime, timedelta
9
+
10
def data_chatbot(df):
    """
    Advanced chatbot that provides data access and visualizations based on user questions.

    Renders a two-column Streamlit layout: the left column holds the chat history
    and the input box, the right column shows the most recently generated
    visualization and/or data table. Conversation state is kept in
    ``st.session_state`` under the keys ``chat_messages``, ``last_viz`` and
    ``last_data`` so it survives Streamlit reruns.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset the assistant answers questions about.
    """

    # Inline CSS for the header banner, chat bubbles and result cards.
    st.markdown("""
    <style>
    .chat-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 25px;
        border-radius: 15px;
        color: white;
        text-align: center;
        margin-bottom: 25px;
        box-shadow: 0 10px 30px rgba(102, 126, 234, 0.3);
    }
    .chat-header h2 {
        font-size: 2.2rem;
        margin-bottom: 10px;
    }
    .chat-header p {
        font-size: 1.1rem;
        opacity: 0.95;
    }
    .user-message {
        background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
        padding: 15px 20px;
        border-radius: 20px 20px 5px 20px;
        margin: 10px 0;
        max-width: 80%;
        margin-left: auto;
        border-left: 4px solid #1976d2;
        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
    }
    .bot-message {
        background: white;
        padding: 15px 20px;
        border-radius: 20px 20px 20px 5px;
        margin: 10px 0;
        max-width: 80%;
        border-left: 4px solid #4caf50;
        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
    }
    .metric-card {
        background: white;
        padding: 15px;
        border-radius: 10px;
        text-align: center;
        box-shadow: 0 2px 10px rgba(0,0,0,0.05);
        border-left: 4px solid #667eea;
    }
    .viz-container {
        background: white;
        padding: 20px;
        border-radius: 15px;
        margin: 20px 0;
        box-shadow: 0 5px 20px rgba(0,0,0,0.1);
    }
    .insight-badge {
        background: #4caf50;
        color: white;
        padding: 5px 10px;
        border-radius: 15px;
        font-size: 12px;
        display: inline-block;
        margin-right: 5px;
    }
    </style>

    <div class="chat-header">
        <h2>🤖 Smart Data Assistant</h2>
        <p>Ask questions and get instant visualizations - I'll show you the data!</p>
    </div>
    """, unsafe_allow_html=True)

    # Initialize session state so chat history and the last results persist
    # across Streamlit reruns.
    if "chat_messages" not in st.session_state:
        st.session_state.chat_messages = []

    if "last_viz" not in st.session_state:
        st.session_state.last_viz = None

    if "last_data" not in st.session_state:
        st.session_state.last_data = None

    # Main layout: chat on the left, generated results on the right.
    main_col, viz_col = st.columns([1, 1])

    with main_col:
        # Chat history
        chat_container = st.container()

        with chat_container:
            # Show usage hints only while the conversation is empty.
            if not st.session_state.chat_messages:
                st.info("""
                👋 **Hi! I can show you data and create visualizations. Try asking:**

                **📊 Show Data:**
                • "Show me the first 10 rows"
                • "Show me data where age > 30"
                • "Display top 5 by sales"

                **📈 Create Visualizations:**
                • "Show me a bar chart of category"
                • "Plot histogram of age"
                • "Create scatter plot of price vs quantity"
                • "Show trend of sales over time"

                **🔍 Analyze:**
                • "What's the average of salary?"
                • "Show statistics for all columns"
                • "Find outliers in price"
                """)

            # Render the transcript as styled HTML bubbles.
            for msg in st.session_state.chat_messages:
                if msg["role"] == "user":
                    st.markdown(f'<div class="user-message"><b>👤 You:</b> {msg["content"]}</div>', unsafe_allow_html=True)
                else:
                    st.markdown(f'<div class="bot-message">{msg["content"]}</div>', unsafe_allow_html=True)

        # Input area
        st.markdown("<br>", unsafe_allow_html=True)
        input_col1, input_col2 = st.columns([5, 1])

        with input_col1:
            user_query = st.text_input("", placeholder="💬 Ask a question or request a visualization...",
                                       key="chat_input", label_visibility="collapsed")

        with input_col2:
            send_button = st.button("📤 Ask", use_container_width=True)

        if send_button and user_query:
            # Add user message
            st.session_state.chat_messages.append({"role": "user", "content": user_query})

            # Process query and get response with data/viz
            with st.spinner("🔍 Processing your request..."):
                response, viz_data, table_data = process_query_with_viz(user_query, df)

            # Add bot response
            st.session_state.chat_messages.append({"role": "bot", "content": response})

            # Keep the newest figure/table so the right column can render them
            # after the rerun. ``table_data`` is compared against None because
            # an empty DataFrame is falsy but still worth displaying.
            if viz_data:
                st.session_state.last_viz = viz_data
            if table_data is not None:
                st.session_state.last_data = table_data

            st.rerun()

    with viz_col:
        # Display the most recent visualization and data table, if any.
        if st.session_state.last_viz:
            st.markdown('<div class="viz-container">', unsafe_allow_html=True)
            st.markdown("### 📊 Generated Visualization")
            display_visualization(st.session_state.last_viz)
            st.markdown('</div>', unsafe_allow_html=True)

        if st.session_state.last_data is not None:
            st.markdown('<div class="viz-container">', unsafe_allow_html=True)
            st.markdown("### 📋 Data Result")
            st.dataframe(st.session_state.last_data, use_container_width=True, height=300)
            st.markdown('</div>', unsafe_allow_html=True)

    # Quick action buttons: one-click canned queries routed through the same
    # pipeline as typed questions.
    st.markdown("---")
    st.markdown("### 🔍 Quick Actions")

    col1, col2, col3, col4, col5 = st.columns(5)

    actions = [
        ("📊 First 10 Rows", "Show me first 10 rows", col1),
        ("📈 Bar Chart", "Show bar chart of first categorical column", col2),
        ("📉 Histogram", "Plot histogram of first numeric column", col3),
        ("🔎 Filter", "Show rows where value > average", col4),
        ("📋 Statistics", "Show me statistics", col5)
    ]

    for label, query, col in actions:
        if col.button(label, use_container_width=True):
            st.session_state.chat_messages.append({"role": "user", "content": query})
            response, viz_data, table_data = process_query_with_viz(query, df)
            st.session_state.chat_messages.append({"role": "bot", "content": response})
            if viz_data:
                st.session_state.last_viz = viz_data
            if table_data is not None:
                st.session_state.last_data = table_data
            st.rerun()

    # Clear button resets the full conversation and cached results.
    col1, col2, col3 = st.columns([1, 1, 1])
    with col2:
        if st.button("🗑️ Clear Chat & Visualizations", use_container_width=True):
            st.session_state.chat_messages = []
            st.session_state.last_viz = None
            st.session_state.last_data = None
            st.rerun()
207
+
208
+
209
def process_query_with_viz(query, df):
    """Route a natural-language query to the matching handler.

    Returns a ``(response, figure, table)`` triple where ``response`` is
    markdown text, ``figure`` is a Plotly figure or None, and ``table`` is a
    DataFrame or None.

    The keyword checks below run in a fixed priority order: the FIRST branch
    whose keyword appears in the query wins, so e.g. "show top 5 bar chart"
    is routed to show_first_rows (branch 1), not the bar chart branch.
    """
    query_lower = query.lower().strip()

    # Pre-compute column groups used by the individual handlers.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
    all_cols = df.columns.tolist()

    # First integer in the query is used as a row count, default 10.
    numbers = re.findall(r'\d+', query_lower)
    n = int(numbers[0]) if numbers else 10

    # 1. SHOW DATA - First/Last/Random rows
    if any(word in query_lower for word in ['first', 'head', 'top']):
        return show_first_rows(df, n)

    elif any(word in query_lower for word in ['last', 'tail', 'bottom']):
        return show_last_rows(df, n)

    elif 'random' in query_lower or 'sample' in query_lower:
        return show_random_rows(df, n)

    # 2. FILTER DATA
    # NOTE(review): 'find' and 'with' are broad triggers — "find outliers in
    # price" is routed here, never reaching branch 15. Confirm intended.
    elif any(word in query_lower for word in ['find', 'where', 'filter', 'search', 'with']):
        return filter_data(query_lower, df)

    # 3. SORT DATA
    elif 'sort' in query_lower or 'order by' in query_lower:
        return sort_data(query_lower, df)

    # 4. BAR CHART
    elif any(word in query_lower for word in ['bar chart', 'bar plot', 'bar graph', 'count plot']):
        return create_bar_chart(query_lower, df, categorical_cols)

    # 5. HISTOGRAM
    elif any(word in query_lower for word in ['histogram', 'distribution', 'hist', 'frequency']):
        return create_histogram(query_lower, df, numeric_cols)

    # 6. SCATTER PLOT
    elif any(word in query_lower for word in ['scatter', 'scatter plot', 'scatterplot', 'relationship']):
        return create_scatter_plot(query_lower, df, numeric_cols)

    # 7. LINE CHART / TREND
    elif any(word in query_lower for word in ['line chart', 'line plot', 'trend', 'over time']):
        return create_line_chart(query_lower, df, numeric_cols, datetime_cols)

    # 8. BOX PLOT
    # NOTE(review): 'outliers' here shadows branch 15 — a query containing
    # "outliers" (without 'find') produces a box plot, not the outlier table.
    elif any(word in query_lower for word in ['box plot', 'boxplot', 'box', 'outliers']):
        return create_box_plot(query_lower, df, numeric_cols, categorical_cols)

    # 9. PIE CHART
    elif any(word in query_lower for word in ['pie chart', 'pie', 'proportion', 'percentage']):
        return create_pie_chart(query_lower, df, categorical_cols)

    # 10. HEATMAP / CORRELATION
    elif any(word in query_lower for word in ['heatmap', 'correlation', 'corr', 'heat map']):
        return create_heatmap(df, numeric_cols)

    # 11. VIOLIN PLOT
    elif 'violin' in query_lower:
        return create_violin_plot(query_lower, df, numeric_cols, categorical_cols)

    # 12. STATISTICS
    elif any(word in query_lower for word in ['statistics', 'stats', 'describe', 'summary']):
        return show_statistics(query_lower, df, numeric_cols, all_cols)

    # 13. COLUMN INFORMATION
    elif any(word in query_lower for word in ['column info', 'column details', 'info about']):
        return show_column_info(query_lower, df, all_cols)

    # 14. MISSING VALUES
    elif any(word in query_lower for word in ['missing', 'null', 'na', 'empty']):
        return show_missing_values(df)

    # 15. OUTLIERS (only reachable when branches 2 and 8 did not match)
    elif 'outlier' in query_lower:
        return detect_outliers(query_lower, df, numeric_cols)

    # 16. UNIQUE VALUES
    elif any(word in query_lower for word in ['unique', 'distinct', 'categories']):
        return show_unique_values(query_lower, df, all_cols, categorical_cols)

    # 17. COMPARE COLUMNS
    elif 'compare' in query_lower:
        return compare_columns(query_lower, df, numeric_cols, categorical_cols)

    # 18. HELP
    elif any(word in query_lower for word in ['help', 'what can you do', 'capabilities']):
        return show_help(), None, None

    # 19. DEFAULT - Try to understand if asking about a specific column
    else:
        return handle_general_query(query_lower, df, numeric_cols, categorical_cols, all_cols)
304
+
305
+
306
def show_first_rows(df, n=10):
    """Return a (message, figure, table) triple with the first *n* rows of *df*."""
    preview = df.head(n)
    message = f"### 👁️ First {n} Rows\n\nHere's the data you requested:"
    return message, None, preview
311
+
312
+
313
def show_last_rows(df, n=10):
    """Return a (message, figure, table) triple with the last *n* rows of *df*."""
    preview = df.tail(n)
    message = f"### 👁️ Last {n} Rows\n\nHere's the data you requested:"
    return message, None, preview
318
+
319
+
320
def show_random_rows(df, n=5):
    """Return a (message, figure, table) triple with a random sample of *n* rows.

    The sample size is clamped to the number of rows so short frames never raise.
    """
    sample_size = min(n, len(df))
    sampled = df.sample(sample_size)
    message = f"### 🎲 Random Sample of {n} Rows\n\nHere's a random sample from your data:"
    return message, None, sampled
325
+
326
+
327
def filter_data(query, df):
    """Filter rows of *df* from a natural-language condition in *query*.

    Supported forms: numeric comparisons ("col > 30", "<", ">=", "<=", "="),
    substring match ("col contains value"), and equality ("col is value").
    The first pattern that resolves to a real column name wins.

    Returns a ``(message, figure, table)`` triple; ``figure`` is always None
    and ``table`` holds at most the first 20 matching rows (or None when
    nothing matched / the condition could not be parsed).
    """
    # Pattern order is safe: the '>' / '<' regexes require a digit directly
    # after the operator, so they cannot accidentally match '>=' / '<='.
    patterns = [
        (r'(\w+)\s*>\s*(\d+\.?\d*)', '>'),
        (r'(\w+)\s*<\s*(\d+\.?\d*)', '<'),
        (r'(\w+)\s*>=\s*(\d+\.?\d*)', '>='),
        (r'(\w+)\s*<=\s*(\d+\.?\d*)', '<='),
        (r'(\w+)\s*=\s*(\d+\.?\d*)', '=='),
        (r'(\w+)\s*==\s*(\d+\.?\d*)', '=='),
        (r'(\w+)\s*contains\s*["\']?([^"\']+)["\']?', 'contains'),
        (r'(\w+)\s*is\s*["\']?([^"\']+)["\']?', '=='),
    ]

    for pattern, op in patterns:
        match = re.search(pattern, query.lower())
        if not match:
            continue
        col_token, raw_val = match.group(1), match.group(2)

        # Resolve the token to a real column (case-insensitive exact match).
        for c in df.columns:
            if c.lower() != col_token:
                continue
            try:
                if op in ('>', '<', '>=', '<='):
                    val = float(raw_val)
                    masks = {
                        '>': df[c] > val,
                        '<': df[c] < val,
                        '>=': df[c] >= val,
                        '<=': df[c] <= val,
                    }
                    filtered = df[masks[op]]
                    condition = f"{c} {op} {val}"
                elif op == 'contains':
                    filtered = df[df[c].astype(str).str.contains(raw_val, case=False, na=False)]
                    condition = f"{c} contains '{raw_val}'"
                else:
                    # Equality: compare numerically for numeric dtypes,
                    # case-insensitively as strings otherwise.
                    if df[c].dtype in ['int64', 'float64']:
                        filtered = df[df[c] == float(raw_val)]
                    else:
                        filtered = df[df[c].astype(str).str.lower() == raw_val.lower()]
                    condition = f"{c} = {raw_val}"
            except (ValueError, TypeError):
                # The value does not fit the column's type; try the next
                # column/pattern. (Was a bare ``except: pass`` that silently
                # swallowed every error, including genuine bugs.)
                continue

            if len(filtered) > 0:
                response = f"### 🔍 Found {len(filtered)} rows where {condition}\n\nShowing first 20 results:"
                return response, None, filtered.head(20)
            return f"❌ No rows found where {condition}", None, None

    return "❌ I couldn't understand the filter condition. Try something like: 'show rows where age > 30'", None, None
384
+
385
+
386
def sort_data(query, df):
    """Sort *df* by the first column mentioned in *query*.

    Falls back to the first dataframe column when no name is mentioned.
    Words like "desc", "highest" or "largest" select descending order; the
    first integer in the query caps the number of rows shown (default 20).
    Returns a ``(message, figure, table)`` triple with ``figure`` always None.
    """
    sort_col = next(
        (c for c in df.columns if c.lower() in query),
        df.columns[0] if len(df.columns) > 0 else None,
    )
    if not sort_col:
        return "❌ Please specify a column to sort by", None, None

    descending = any(word in query for word in ('desc', 'highest', 'largest'))
    order = "descending" if descending else "ascending"

    matches = re.findall(r'\d+', query)
    limit = int(matches[0]) if matches else 20

    result = df.sort_values(sort_col, ascending=not descending).head(limit)

    response = f"### 📊 Sorted by {sort_col} ({order})\n\nShowing top {limit} results:"
    return response, None, result
415
+
416
+
417
def create_bar_chart(query, df, categorical_cols):
    """Build a bar chart of value counts for the categorical column named in *query*.

    Falls back to the first categorical column when none is mentioned; only
    the 20 most frequent values are plotted. Returns (message, figure, table).
    """
    col = next((c for c in categorical_cols if c.lower() in query), None)
    if col is None and categorical_cols:
        col = categorical_cols[0]
    if col is None:
        return "❌ No categorical column found for bar chart", None, None

    counts = df[col].value_counts().head(20)

    fig = px.bar(
        x=counts.index,
        y=counts.values,
        title=f"Bar Chart of {col} (Top 20)",
        labels={'x': col, 'y': 'Count'},
        color_discrete_sequence=['#667eea']
    )
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(color='#2c3e50'),
        xaxis_tickangle=-45,
        height=500
    )

    response = f"### 📊 Bar Chart of '{col}'\n\nHere's the distribution of values:"
    return response, fig, None
452
+
453
+
454
def create_histogram(query, df, numeric_cols):
    """Create a histogram (with marginal box plot) for a numeric column.

    The column is the first numeric column whose name appears in *query*,
    falling back to the first numeric column overall. Returns a
    (message, figure, table) triple with ``table`` always None.
    """
    # Find the column the user asked about.
    col = None
    for c in numeric_cols:
        if c.lower() in query:
            col = c
            break

    # Fall back to the first numeric column when none was named.
    if not col and numeric_cols:
        col = numeric_cols[0]

    if col:
        fig = px.histogram(
            df,
            x=col,
            nbins=30,
            title=f"Histogram of {col}",
            marginal="box",
            color_discrete_sequence=['#667eea']
        )

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        # Summary statistics shown in the chat response alongside the figure.
        data = df[col].dropna()
        stats = f"Mean: {data.mean():.2f} | Median: {data.median():.2f} | Std: {data.std():.2f}"

        response = f"### 📊 Histogram of '{col}'\n\n{stats}"
        return response, fig, None

    return "❌ No numeric column found for histogram", None, None
491
+
492
+
493
def create_scatter_plot(query, df, numeric_cols):
    """Create a scatter plot between two numeric columns named in *query*.

    When fewer than two columns are mentioned, the first two numeric columns
    are used. Returns a (message, figure, table) triple; the message includes
    the Pearson correlation of the two columns.
    """
    # Collect every numeric column whose name appears in the query,
    # in numeric_cols order (first two mentioned become x and y).
    cols = []
    for col in numeric_cols:
        if col.lower() in query:
            cols.append(col)

    if len(cols) >= 2:
        x_col, y_col = cols[0], cols[1]
    elif len(numeric_cols) >= 2:
        x_col, y_col = numeric_cols[0], numeric_cols[1]
    else:
        return "❌ Need at least 2 numeric columns for scatter plot", None, None

    # NOTE(review): trendline="ols" requires the statsmodels package at
    # runtime — confirm it is available in the deployment environment.
    fig = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=f"Scatter Plot: {y_col} vs {x_col}",
        trendline="ols",
        opacity=0.6,
        color_discrete_sequence=['#667eea']
    )

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(color='#2c3e50'),
        height=500
    )

    # Pearson correlation reported in the chat message.
    corr = df[x_col].corr(df[y_col])

    response = f"### 📊 Scatter Plot: {y_col} vs {x_col}\n\nCorrelation: {corr:.4f}"
    return response, fig, None
530
+
531
+
532
def create_line_chart(query, df, numeric_cols, datetime_cols):
    """Create a line chart of a numeric column over a datetime column.

    Both columns are resolved from *query* by name, falling back to the first
    datetime / first numeric column. Requires at least one of each; otherwise
    an error message is returned. Returns a (message, figure, table) triple.
    """
    # Resolve the date (x) column.
    date_col = None
    for col in datetime_cols:
        if col.lower() in query:
            date_col = col
            break

    if not date_col and datetime_cols:
        date_col = datetime_cols[0]

    # Resolve the value (y) column.
    val_col = None
    for col in numeric_cols:
        if col.lower() in query:
            val_col = col
            break

    if not val_col and numeric_cols:
        val_col = numeric_cols[0]

    if date_col and val_col:
        # Drop incomplete rows and sort chronologically so the line is drawn
        # left-to-right in time order.
        plot_df = df[[date_col, val_col]].dropna().sort_values(date_col)

        fig = px.line(
            plot_df,
            x=date_col,
            y=val_col,
            title=f"Trend of {val_col} over Time",
            color_discrete_sequence=['#667eea']
        )

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        response = f"### 📈 Line Chart: {val_col} over Time"
        return response, fig, None

    return "❌ Need a datetime column and numeric column for line chart", None, None
577
+
578
+
579
def create_box_plot(query, df, numeric_cols, categorical_cols):
    """Create a box plot of a numeric column, optionally grouped by a category.

    The numeric column is resolved from *query* (fallback: first numeric
    column). If a categorical column is also named, it becomes the x-axis
    grouping; otherwise a single ungrouped box is drawn.
    Returns a (message, figure, table) triple.
    """
    # Resolve the numeric (y) column.
    num_col = None
    for col in numeric_cols:
        if col.lower() in query:
            num_col = col
            break

    if not num_col and numeric_cols:
        num_col = numeric_cols[0]

    # Optional categorical grouping column — no fallback: grouping is only
    # applied when explicitly named in the query.
    cat_col = None
    for col in categorical_cols:
        if col.lower() in query:
            cat_col = col
            break

    if num_col:
        if cat_col:
            fig = px.box(
                df,
                x=cat_col,
                y=num_col,
                title=f"Box Plot of {num_col} by {cat_col}",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 📊 Box Plot: {num_col} grouped by {cat_col}"
        else:
            fig = px.box(
                df,
                y=num_col,
                title=f"Box Plot of {num_col}",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 📊 Box Plot of {num_col}"

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        return response, fig, None

    return "❌ No numeric column found for box plot", None, None
627
+
628
+
629
def create_pie_chart(query, df, categorical_cols):
    """Build a donut-style pie chart for the categorical column named in *query*.

    Falls back to the first categorical column when none is mentioned; only
    the ten most frequent categories are plotted. Returns (message, figure, table).
    """
    col = next((c for c in categorical_cols if c.lower() in query), None)
    if col is None and categorical_cols:
        col = categorical_cols[0]
    if col is None:
        return "❌ No categorical column found for pie chart", None, None

    counts = df[col].value_counts().head(10)

    fig = px.pie(
        values=counts.values,
        names=counts.index,
        title=f"Pie Chart of {col} (Top 10)",
        hole=0.3,
        color_discrete_sequence=px.colors.qualitative.Set3
    )
    fig.update_layout(height=500, showlegend=True)

    response = f"### 🥧 Pie Chart of '{col}'\n\nProportion of values:"
    return response, fig, None
661
+
662
+
663
def create_heatmap(df, numeric_cols):
    """Render a correlation heatmap over all numeric columns of *df*.

    Requires at least two numeric columns; the color scale is fixed to
    [-1, 1] so correlations are comparable across datasets.
    Returns a (message, figure, table) triple with ``table`` always None.
    """
    if len(numeric_cols) < 2:
        return "❌ Need at least 2 numeric columns for correlation heatmap", None, None

    correlations = df[numeric_cols].corr()

    fig = px.imshow(
        correlations,
        text_auto=True,
        aspect="auto",
        color_continuous_scale='RdBu_r',
        title="Correlation Heatmap",
        zmin=-1, zmax=1
    )
    fig.update_layout(
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    message = "### 🔥 Correlation Heatmap\n\nStrong correlations are shown in dark red/blue:"
    return message, fig, None
687
+
688
+
689
def create_violin_plot(query, df, numeric_cols, categorical_cols):
    """Create a violin plot (with inner box and all points) of a numeric column.

    The numeric column is resolved from *query* (fallback: first numeric
    column). A categorical column named in the query becomes the x-axis
    grouping; otherwise a single violin is drawn.
    Returns a (message, figure, table) triple.
    """
    # Resolve the numeric (y) column.
    num_col = None
    for col in numeric_cols:
        if col.lower() in query:
            num_col = col
            break

    if not num_col and numeric_cols:
        num_col = numeric_cols[0]

    # Optional categorical grouping — applied only when explicitly named.
    cat_col = None
    for col in categorical_cols:
        if col.lower() in query:
            cat_col = col
            break

    if num_col:
        if cat_col:
            fig = px.violin(
                df,
                x=cat_col,
                y=num_col,
                title=f"Violin Plot of {num_col} by {cat_col}",
                box=True,
                points="all",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 🎻 Violin Plot: {num_col} grouped by {cat_col}"
        else:
            fig = px.violin(
                df,
                y=num_col,
                title=f"Violin Plot of {num_col}",
                box=True,
                points="all",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 🎻 Violin Plot of {num_col}"

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        return response, fig, None

    return "❌ No numeric column found for violin plot", None, None
741
+
742
+
743
def show_statistics(query, df, numeric_cols, all_cols):
    """Return summary statistics as a (message, figure, table) triple.

    If a numeric column is named in *query*, a 10-row detail table (count,
    mean, spread, quartiles, skew, kurtosis) for that column is returned.
    Otherwise ``describe()`` over all numeric columns, extended with skew
    and kurtosis, is returned.
    """
    # Per-column detail when a numeric column is named in the query.
    for col in all_cols:
        if col.lower() in query and col in numeric_cols:
            values = df[col].dropna()
            labels = ['Count', 'Mean', 'Std Dev', 'Min', '25%', '50%', '75%', 'Max', 'Skewness', 'Kurtosis']
            figures = [
                len(values),
                f"{values.mean():.4f}",
                f"{values.std():.4f}",
                f"{values.min():.4f}",
                f"{values.quantile(0.25):.4f}",
                f"{values.median():.4f}",
                f"{values.quantile(0.75):.4f}",
                f"{values.max():.4f}",
                f"{values.skew():.4f}",
                f"{values.kurtosis():.4f}",
            ]
            stats_data = pd.DataFrame({'Statistic': labels, 'Value': figures})
            return f"### 📊 Statistics for '{col}'", None, stats_data

    # Otherwise: one summary row per numeric column.
    if numeric_cols:
        summary = df[numeric_cols].describe().T
        summary['skew'] = df[numeric_cols].skew()
        summary['kurtosis'] = df[numeric_cols].kurtosis()
        return "### 📈 Summary Statistics for Numeric Columns", None, summary

    return "❌ No numeric columns found for statistics", None, None
779
+
780
+
781
def show_column_info(query, df, all_cols):
    """Describe one column named in *query*, or all columns when none matches.

    Returns a (message, figure, table) triple: a 5-row property table for a
    single column, or a per-column overview (dtype, unique count, missing
    count and percentage) for the whole frame.
    """
    for col in all_cols:
        if col.lower() in query:
            properties = ['Data Type', 'Unique Values', 'Missing Values', 'Missing %', 'Sample Values']
            values = [
                str(df[col].dtype),
                df[col].nunique(),
                df[col].isnull().sum(),
                f"{(df[col].isnull().sum()/len(df)*100):.2f}%",
                str(df[col].dropna().iloc[:3].tolist()),
            ]
            detail = pd.DataFrame({'Property': properties, 'Value': values})
            return f"### 📋 Column Information: '{col}'", None, detail

    overview = pd.DataFrame({
        'Column': df.columns,
        'Data Type': df.dtypes.astype(str),
        'Unique Values': [df[c].nunique() for c in df.columns],
        'Missing Values': df.isnull().sum().values,
        'Missing %': (df.isnull().sum().values / len(df) * 100).round(2),
    })
    return "### 📋 All Columns Information", None, overview
811
+
812
+
813
def show_missing_values(df):
    """Summarize missing values per column.

    Returns a (message, figure, table) triple: the table lists only columns
    that actually contain nulls, sorted by missing percentage descending, or
    is None when the dataset is complete.
    """
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]

    if null_counts.empty:
        return "✅ **Good news!** No missing values found in the dataset.", None, None

    table = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df) * 100).round(2),
    }).sort_values('Missing %', ascending=False)

    total_missing = null_counts.sum()
    total_cells = df.shape[0] * df.shape[1]

    response = f"### 🔍 Missing Values Analysis\n\n**Total Missing:** {total_missing} out of {total_cells} cells ({total_missing/total_cells*100:.2f}%)"
    return response, None, table
832
+
833
+
834
def detect_outliers(query, df, numeric_cols):
    """Detect outliers in numeric columns using the 1.5*IQR rule.

    Columns explicitly named in *query* are analyzed; otherwise the first
    three numeric columns. Columns with no non-null values are skipped —
    previously they caused a ZeroDivisionError when computing the outlier
    percentage.

    Returns a (message, figure, table) triple; the table has one row per
    analyzed column with count, percentage, normal range and a severity flag.
    """
    target_cols = [col for col in numeric_cols if col.lower() in query]
    if not target_cols:
        target_cols = numeric_cols[:3]  # default: check first 3 numeric columns

    rows = []
    for col in target_cols:
        data = df[col].dropna()
        if data.empty:
            # No quartiles for an all-null column; skip instead of dividing by zero.
            continue

        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        outliers = data[(data < lower) | (data > upper)]
        pct = len(outliers) / len(data) * 100

        rows.append({
            'Column': col,
            'Outliers Count': len(outliers),
            'Outliers %': f"{pct:.2f}%",
            'Normal Range': f"[{lower:.4f}, {upper:.4f}]",
            'Severity': 'High' if pct > 10 else 'Medium' if pct > 5 else 'Low',
        })

    outlier_df = pd.DataFrame(rows)

    return "### ⚠️ Outlier Detection Results", None, outlier_df
866
+
867
+
868
def show_unique_values(query, df, all_cols, categorical_cols):
    """List unique values for a column named in *query*, or summarize all
    categorical columns when no column name matches.

    Returns a (message, figure, table) triple; the per-column table shows up
    to 20 values with counts and percentages.
    """
    for col in all_cols:
        if col.lower() in query:
            counts = df[col].value_counts().reset_index()
            counts.columns = [col, 'Count']
            counts['Percentage'] = (counts['Count'] / len(df) * 100).round(2)
            header = f"### 🎯 Unique Values in '{col}'\n\n**Total Unique:** {df[col].nunique()}"
            return header, None, counts.head(20)

    # No specific column mentioned: one summary row per categorical column
    # (capped at 10 columns).
    if categorical_cols:
        summary_rows = []
        for col in categorical_cols[:10]:
            freq = df[col].value_counts()
            summary_rows.append({
                'Column': col,
                'Unique Values': df[col].nunique(),
                'Most Common': freq.index[0] if len(freq) > 0 else 'N/A',
                'Most Common Count': freq.values[0] if len(freq) > 0 else 0,
            })
        return "### 🎯 Unique Values in Categorical Columns", None, pd.DataFrame(summary_rows)

    return "❌ No categorical columns found", None, None
896
+
897
+
898
def compare_columns(query, df, numeric_cols, categorical_cols):
    """Compare two columns named in *query*.

    Behavior by column types:
    - numeric vs numeric  -> side-by-side summary statistics table
    - categorical vs categorical -> cross-tabulation
    - categorical vs numeric -> per-category summary of the numeric column
      (previously this mixed case fell through to the misleading
      "Please specify two columns" error even though both were named)

    Returns a (message, figure, table) triple with ``figure`` always None.
    """
    # Columns are matched by case-insensitive substring, in df.columns order.
    mentioned = [col for col in df.columns if col.lower() in query]
    if len(mentioned) < 2:
        return "❌ Please specify two columns to compare", None, None

    col1, col2 = mentioned[0], mentioned[1]

    if col1 in numeric_cols and col2 in numeric_cols:
        comparison = pd.DataFrame({
            'Metric': ['Mean', 'Median', 'Std Dev', 'Min', 'Max'],
            col1: [
                df[col1].mean(),
                df[col1].median(),
                df[col1].std(),
                df[col1].min(),
                df[col1].max(),
            ],
            col2: [
                df[col2].mean(),
                df[col2].median(),
                df[col2].std(),
                df[col2].min(),
                df[col2].max(),
            ],
        })
        return f"### 🔄 Comparison: {col1} vs {col2}", None, comparison

    if col1 in categorical_cols and col2 in categorical_cols:
        cross_tab = pd.crosstab(df[col1], df[col2])
        return f"### 🔄 Cross-tabulation: {col1} vs {col2}", None, cross_tab

    # Mixed pair: summarize the numeric column within each category.
    cat_col, num_col = (col1, col2) if col1 in categorical_cols else (col2, col1)
    if cat_col in categorical_cols and num_col in numeric_cols:
        grouped = df.groupby(cat_col)[num_col].agg(
            ['count', 'mean', 'median', 'std', 'min', 'max']
        ).reset_index()
        return f"### 🔄 {num_col} by {cat_col}", None, grouped

    return "❌ Please specify two columns to compare", None, None
939
+
940
+
941
def show_help():
    """Return the static markdown text describing the assistant's capabilities."""
    return """
    ### 🤖 I Can Help You With:

    **📊 Show Data:**
    • "Show me first 10 rows"
    • "Show me last 5 rows"
    • "Show random sample of 10 rows"
    • "Find rows where age > 30"
    • "Sort by price descending"
    • "Top 5 by sales"

    **📈 Create Visualizations:**
    • "Show bar chart of category"
    • "Plot histogram of age"
    • "Create scatter plot of price vs quantity"
    • "Show line chart of sales over time"
    • "Create box plot of salary"
    • "Show pie chart of region"
    • "Display correlation heatmap"
    • "Create violin plot of price"

    **🔍 Analyze Data:**
    • "Show statistics for all columns"
    • "Tell me about [column name]"
    • "Any missing values?"
    • "Find outliers in price"
    • "Show unique values in category"
    • "Compare age and income"

    **Just ask naturally and I'll show you the data and visualizations!**
    """
975
+
976
+
977
def handle_general_query(query, df, numeric_cols, categorical_cols, all_cols):
    """Fallback handler for queries no dispatcher branch recognized.

    Answers quick per-column lookups (a one-line summary for any column
    mentioned by name) and dataset-size questions; otherwise returns a
    generic "didn't understand" message. Returns (message, None, None).
    """
    # Column lookup: first column whose name appears in the query wins.
    for col in all_cols:
        if col.lower() not in query:
            continue
        if col in numeric_cols:
            values = df[col].dropna()
            return f"**{col}** - Mean: {values.mean():.2f}, Min: {values.min():.2f}, Max: {values.max():.2f}", None, None
        freq = df[col].value_counts()
        most_common = freq.index[0] if len(freq) > 0 else 'N/A'
        return f"**{col}** - Unique values: {df[col].nunique()}, Most common: {most_common}", None, None

    # Dataset size question.
    if 'size' in query or 'large' in query or 'big' in query:
        size_mb = df.memory_usage(deep=True).sum() / 1024**2
        return f"Dataset size: {size_mb:.2f} MB ({df.shape[0]:,} rows × {df.shape[1]} columns)", None, None

    return "❌ I didn't understand. Try asking for data, visualizations, or type 'help'", None, None
996
+
997
+
998
def display_visualization(fig):
    """Render a Plotly figure full-width inside the current Streamlit container."""
    st.plotly_chart(fig, use_container_width=True)
1001
+
1002
+
1003
+ # Simple version for quick integration
1004
def run_simple_chatbot(df):
    """Simplified chatbot version.

    A minimal keyword-matching chat over *df* for quick integration: answers
    row/column counts, missing-value totals, one mean statistic, and can draw
    a histogram of the first numeric column. History lives in
    ``st.session_state.simple_msgs``.
    """
    st.markdown("### 💬 Simple Data Chat")

    # Persist the transcript across Streamlit reruns.
    if "simple_msgs" not in st.session_state:
        st.session_state.simple_msgs = []

    # Chat display
    for msg in st.session_state.simple_msgs:
        if msg["role"] == "user":
            st.info(f"👤 {msg['content']}")
        else:
            st.success(f"🤖 {msg['content']}")

    # Input
    user_input = st.text_input("Ask:", key="simple_chat_input")

    if st.button("Send") and user_input:
        st.session_state.simple_msgs.append({"role": "user", "content": user_input})

        # Default reply; overwritten when a keyword below matches.
        response = "I don't understand. Try: rows, columns, missing, stats, chart"

        if "row" in user_input.lower():
            response = f"Dataset has {df.shape[0]} rows"
        elif "column" in user_input.lower():
            response = f"Dataset has {df.shape[1]} columns: {', '.join(df.columns[:5])}"
        elif "missing" in user_input.lower():
            missing = df.isnull().sum().sum()
            response = f"Found {missing} missing values" if missing > 0 else "No missing values"
        elif "stat" in user_input.lower():
            numeric = df.select_dtypes(include=[np.number]).columns
            if len(numeric) > 0:
                response = f"Mean of {numeric[0]}: {df[numeric[0]].mean():.2f}"
        elif "chart" in user_input.lower() or "plot" in user_input.lower():
            response = "📊 Creating visualization... (check the plot above)"
            # Simple histogram of the first numeric column, rendered inline.
            numeric = df.select_dtypes(include=[np.number]).columns
            if len(numeric) > 0:
                fig = px.histogram(df, x=numeric[0], title=f"Distribution of {numeric[0]}")
                st.plotly_chart(fig, use_container_width=True)

        st.session_state.simple_msgs.append({"role": "bot", "content": response})
        st.rerun()

    if st.button("Clear Chat"):
        st.session_state.simple_msgs = []
        st.rerun()
data_preprocessing.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
5
+ from sklearn.impute import SimpleImputer, KNNImputer
6
+ from sklearn.ensemble import IsolationForest
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+
10
+ def preprocess_data(df):
11
+
12
+ st.markdown("""
13
+ <div style='text-align: center; margin-bottom: 2rem;'>
14
+ <h2>⚙️ Data Preprocessing Pipeline</h2>
15
+ <p style='color: gray;'>Clean, transform, and prepare your data for analysis</p>
16
+ </div>
17
+ """, unsafe_allow_html=True)
18
+
19
+ # Create tabs for different preprocessing steps
20
+ tab1, tab2, tab3, tab4, tab5 = st.tabs([
21
+ "📊 Overview", "🧹 Clean Data", "🔄 Transform",
22
+ "📏 Scale & Encode", "📈 Feature Engineering"
23
+ ])
24
+
25
+ with tab1:
26
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
27
+
28
+ col1, col2, col3 = st.columns(3)
29
+
30
+ with col1:
31
+ st.metric("Original Rows", df.shape[0])
32
+ with col2:
33
+ st.metric("Original Columns", df.shape[1])
34
+ with col3:
35
+ missing_pct = (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
36
+ st.metric("Missing Data", f"{missing_pct:.1f}%")
37
+
38
+ # Data quality before preprocessing
39
+ st.subheader("Data Quality Check")
40
+
41
+ quality_df = pd.DataFrame({
42
+ 'Column': df.columns,
43
+ 'Data Type': df.dtypes,
44
+ 'Missing Values': df.isnull().sum(),
45
+ 'Missing %': (df.isnull().sum() / len(df) * 100).round(2),
46
+ 'Unique Values': [df[col].nunique() for col in df.columns]
47
+ })
48
+
49
+ st.dataframe(quality_df, use_container_width=True)
50
+
51
+ # Visualize missing values
52
+ if df.isnull().sum().sum() > 0:
53
+ st.subheader("Missing Value Heatmap")
54
+ missing_df = df.isnull().astype(int)
55
+ fig = px.imshow(missing_df.T,
56
+ color_continuous_scale='reds',
57
+ aspect="auto",
58
+ title="Missing Values Pattern")
59
+ st.plotly_chart(fig, use_container_width=True)
60
+
61
+ st.markdown('</div>', unsafe_allow_html=True)
62
+
63
+ with tab2:
64
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
65
+ st.subheader("🧹 Data Cleaning Options")
66
+
67
+ # Create a copy for processing
68
+ processed_df = df.copy()
69
+
70
+ # Remove duplicates
71
+ st.markdown("### Duplicate Removal")
72
+ duplicates = processed_df.duplicated().sum()
73
+ st.write(f"Duplicate rows found: **{duplicates}**")
74
+
75
+ if duplicates > 0:
76
+ if st.button("Remove Duplicates", use_container_width=True):
77
+ processed_df = processed_df.drop_duplicates()
78
+ st.success(f"✅ Removed {duplicates} duplicate rows")
79
+
80
+ # Handle missing values
81
+ st.markdown("### Missing Value Handling")
82
+
83
+ missing_cols = processed_df.columns[processed_df.isnull().any()].tolist()
84
+
85
+ if missing_cols:
86
+ selected_col = st.selectbox("Select column to handle missing values", missing_cols)
87
+
88
+ col_type = processed_df[selected_col].dtype
89
+
90
+ if pd.api.types.is_numeric_dtype(processed_df[selected_col]):
91
+ method = st.radio(
92
+ "Choose imputation method",
93
+ ["Mean", "Median", "Mode", "KNN Imputer", "Drop rows", "Fill with value"]
94
+ )
95
+
96
+ if method == "Mean":
97
+ processed_df[selected_col].fillna(processed_df[selected_col].mean(), inplace=True)
98
+ elif method == "Median":
99
+ processed_df[selected_col].fillna(processed_df[selected_col].median(), inplace=True)
100
+ elif method == "Mode":
101
+ processed_df[selected_col].fillna(processed_df[selected_col].mode()[0], inplace=True)
102
+ elif method == "KNN Imputer":
103
+ st.info("KNN Imputer will be applied to all numeric columns")
104
+ if st.button("Apply KNN Imputer"):
105
+ numeric_cols = processed_df.select_dtypes(include=[np.number]).columns
106
+ imputer = KNNImputer(n_neighbors=5)
107
+ processed_df[numeric_cols] = imputer.fit_transform(processed_df[numeric_cols])
108
+ elif method == "Drop rows":
109
+ if st.button(f"Drop rows with missing values in {selected_col}"):
110
+ processed_df = processed_df.dropna(subset=[selected_col])
111
+ else:
112
+ fill_value = st.text_input("Enter fill value")
113
+ if fill_value:
114
+ if pd.api.types.is_numeric_dtype(processed_df[selected_col]):
115
+ processed_df[selected_col].fillna(float(fill_value), inplace=True)
116
+ else:
117
+ processed_df[selected_col].fillna(fill_value, inplace=True)
118
+
119
+ else: # Categorical column
120
+ method = st.radio(
121
+ "Choose imputation method",
122
+ ["Mode", "Drop rows", "Fill with value"]
123
+ )
124
+
125
+ if method == "Mode":
126
+ processed_df[selected_col].fillna(processed_df[selected_col].mode()[0], inplace=True)
127
+ elif method == "Drop rows":
128
+ if st.button(f"Drop rows with missing values in {selected_col}"):
129
+ processed_df = processed_df.dropna(subset=[selected_col])
130
+ else:
131
+ fill_value = st.text_input("Enter fill value")
132
+ if fill_value:
133
+ processed_df[selected_col].fillna(fill_value, inplace=True)
134
+ else:
135
+ st.success("✅ No missing values found!")
136
+
137
+ # Outlier detection
138
+ st.markdown("### Outlier Detection")
139
+ numeric_cols = processed_df.select_dtypes(include=[np.number]).columns
140
+
141
+ if len(numeric_cols) > 0:
142
+ selected_num = st.selectbox("Select numeric column for outlier detection", numeric_cols)
143
+
144
+ # Calculate IQR
145
+ Q1 = processed_df[selected_num].quantile(0.25)
146
+ Q3 = processed_df[selected_num].quantile(0.75)
147
+ IQR = Q3 - Q1
148
+
149
+ outliers = processed_df[
150
+ (processed_df[selected_num] < Q1 - 1.5 * IQR) |
151
+ (processed_df[selected_num] > Q3 + 1.5 * IQR)
152
+ ]
153
+
154
+ st.write(f"Outliers detected: **{len(outliers)}** rows")
155
+
156
+ if len(outliers) > 0:
157
+ if st.button(f"Remove outliers from {selected_num}"):
158
+ processed_df = processed_df[
159
+ (processed_df[selected_num] >= Q1 - 1.5 * IQR) &
160
+ (processed_df[selected_num] <= Q3 + 1.5 * IQR)
161
+ ]
162
+ st.success(f"✅ Removed {len(outliers)} outliers")
163
+
164
+ st.markdown('</div>', unsafe_allow_html=True)
165
+
166
+ # Update session state
167
+ st.session_state.data = processed_df
168
+
169
+ with tab3:
170
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
171
+ st.subheader("🔄 Data Transformations")
172
+
173
+ processed_df = st.session_state.data.copy() if 'processed_df' not in locals() else processed_df
174
+
175
+ # Column operations
176
+ st.markdown("### Column Operations")
177
+
178
+ operation = st.selectbox(
179
+ "Choose operation",
180
+ ["Create new column", "Rename column", "Drop column", "Change data type"]
181
+ )
182
+
183
+ if operation == "Create new column":
184
+ col1, col2, col3 = st.columns(3)
185
+ with col1:
186
+ new_col_name = st.text_input("New column name")
187
+ with col2:
188
+ col_to_use = st.selectbox("Based on column", processed_df.columns)
189
+ with col3:
190
+ operation_type = st.selectbox(
191
+ "Operation",
192
+ ["Square", "Square Root", "Log", "Absolute", "Round", "Binary encode"]
193
+ )
194
+
195
+ if st.button("Create column") and new_col_name:
196
+ if operation_type == "Square":
197
+ processed_df[new_col_name] = processed_df[col_to_use] ** 2
198
+ elif operation_type == "Square Root":
199
+ processed_df[new_col_name] = np.sqrt(processed_df[col_to_use])
200
+ elif operation_type == "Log":
201
+ processed_df[new_col_name] = np.log1p(processed_df[col_to_use])
202
+ elif operation_type == "Absolute":
203
+ processed_df[new_col_name] = np.abs(processed_df[col_to_use])
204
+ elif operation_type == "Round":
205
+ processed_df[new_col_name] = np.round(processed_df[col_to_use])
206
+ elif operation_type == "Binary encode":
207
+ threshold = st.number_input("Threshold for binary encoding")
208
+ processed_df[new_col_name] = (processed_df[col_to_use] > threshold).astype(int)
209
+
210
+ st.success(f"✅ Created column: {new_col_name}")
211
+
212
+ elif operation == "Rename column":
213
+ col_to_rename = st.selectbox("Select column to rename", processed_df.columns)
214
+ new_name = st.text_input("New column name")
215
+
216
+ if st.button("Rename") and new_name:
217
+ processed_df.rename(columns={col_to_rename: new_name}, inplace=True)
218
+ st.success(f"✅ Renamed {col_to_rename} to {new_name}")
219
+
220
+ elif operation == "Drop column":
221
+ cols_to_drop = st.multiselect("Select columns to drop", processed_df.columns)
222
+
223
+ if st.button("Drop columns") and cols_to_drop:
224
+ processed_df = processed_df.drop(columns=cols_to_drop)
225
+ st.success(f"✅ Dropped columns: {', '.join(cols_to_drop)}")
226
+
227
+ elif operation == "Change data type":
228
+ col_to_change = st.selectbox("Select column", processed_df.columns)
229
+ new_type = st.selectbox(
230
+ "New data type",
231
+ ["int", "float", "str", "datetime", "category"]
232
+ )
233
+
234
+ if st.button("Change type"):
235
+ try:
236
+ if new_type == "int":
237
+ processed_df[col_to_change] = processed_df[col_to_change].astype(int)
238
+ elif new_type == "float":
239
+ processed_df[col_to_change] = processed_df[col_to_change].astype(float)
240
+ elif new_type == "str":
241
+ processed_df[col_to_change] = processed_df[col_to_change].astype(str)
242
+ elif new_type == "datetime":
243
+ processed_df[col_to_change] = pd.to_datetime(processed_df[col_to_change])
244
+ elif new_type == "category":
245
+ processed_df[col_to_change] = processed_df[col_to_change].astype('category')
246
+
247
+ st.success(f"✅ Changed {col_to_change} to {new_type}")
248
+ except Exception as e:
249
+ st.error(f"Error: {str(e)}")
250
+
251
+ st.markdown('</div>', unsafe_allow_html=True)
252
+
253
+ # Update session state
254
+ st.session_state.data = processed_df
255
+
256
+ with tab4:
257
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
258
+ st.subheader("📏 Feature Scaling & Encoding")
259
+
260
+ processed_df = st.session_state.data.copy() if 'processed_df' not in locals() else processed_df
261
+
262
+ col1, col2 = st.columns(2)
263
+
264
+ with col1:
265
+ st.markdown("### Feature Scaling")
266
+ numeric_cols = processed_df.select_dtypes(include=[np.number]).columns.tolist()
267
+
268
+ if numeric_cols:
269
+ scale_cols = st.multiselect("Select columns to scale", numeric_cols)
270
+ scale_method = st.radio("Scaling method", ["StandardScaler", "MinMaxScaler"])
271
+
272
+ if st.button("Apply Scaling") and scale_cols:
273
+ if scale_method == "StandardScaler":
274
+ scaler = StandardScaler()
275
+ else:
276
+ scaler = MinMaxScaler()
277
+
278
+ processed_df[scale_cols] = scaler.fit_transform(processed_df[scale_cols])
279
+ st.success(f"✅ Applied {scale_method} to {len(scale_cols)} columns")
280
+
281
+ with col2:
282
+ st.markdown("### Categorical Encoding")
283
+ cat_cols = processed_df.select_dtypes(include=['object', 'category']).columns.tolist()
284
+
285
+ if cat_cols:
286
+ encode_cols = st.multiselect("Select columns to encode", cat_cols)
287
+ encode_method = st.radio("Encoding method", ["Label Encoding", "One-Hot Encoding"])
288
+
289
+ if st.button("Apply Encoding") and encode_cols:
290
+ if encode_method == "Label Encoding":
291
+ for col in encode_cols:
292
+ le = LabelEncoder()
293
+ processed_df[col + '_encoded'] = le.fit_transform(processed_df[col])
294
+ st.success(f"✅ Applied Label Encoding to {len(encode_cols)} columns")
295
+ else:
296
+ processed_df = pd.get_dummies(processed_df, columns=encode_cols)
297
+ st.success(f"✅ Applied One-Hot Encoding to {len(encode_cols)} columns")
298
+
299
+ st.markdown('</div>', unsafe_allow_html=True)
300
+
301
+ # Update session state
302
+ st.session_state.data = processed_df
303
+
304
+ with tab5:
305
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
306
+ st.subheader("📈 Feature Engineering")
307
+
308
+ processed_df = st.session_state.data.copy() if 'processed_df' not in locals() else processed_df
309
+
310
+ # Feature interactions
311
+ st.markdown("### Feature Interactions")
312
+ numeric_cols = processed_df.select_dtypes(include=[np.number]).columns.tolist()
313
+
314
+ if len(numeric_cols) >= 2:
315
+ col1, col2 = st.columns(2)
316
+ with col1:
317
+ feat1 = st.selectbox("First feature", numeric_cols)
318
+ with col2:
319
+ feat2 = st.selectbox("Second feature", [c for c in numeric_cols if c != feat1])
320
+
321
+ interaction_type = st.selectbox(
322
+ "Interaction type",
323
+ ["Multiplication", "Addition", "Subtraction", "Division", "Ratio"]
324
+ )
325
+
326
+ new_col_name = st.text_input("New column name", f"{feat1}_{interaction_type}_{feat2}")
327
+
328
+ if st.button("Create Interaction Feature"):
329
+ if interaction_type == "Multiplication":
330
+ processed_df[new_col_name] = processed_df[feat1] * processed_df[feat2]
331
+ elif interaction_type == "Addition":
332
+ processed_df[new_col_name] = processed_df[feat1] + processed_df[feat2]
333
+ elif interaction_type == "Subtraction":
334
+ processed_df[new_col_name] = processed_df[feat1] - processed_df[feat2]
335
+ elif interaction_type == "Division":
336
+ processed_df[new_col_name] = processed_df[feat1] / (processed_df[feat2] + 1e-8)
337
+ elif interaction_type == "Ratio":
338
+ processed_df[new_col_name] = processed_df[feat1] / (processed_df[feat2].sum() + 1e-8)
339
+
340
+ st.success(f"✅ Created feature: {new_col_name}")
341
+
342
+ # Binning
343
+ st.markdown("### Feature Binning")
344
+ if numeric_cols:
345
+ bin_col = st.selectbox("Select column for binning", numeric_cols)
346
+ n_bins = st.slider("Number of bins", 2, 20, 5)
347
+ bin_labels = [f"Bin_{i}" for i in range(n_bins)]
348
+
349
+ if st.button("Create Binned Feature"):
350
+ processed_df[bin_col + '_binned'] = pd.cut(processed_df[bin_col],
351
+ bins=n_bins,
352
+ labels=bin_labels)
353
+ st.success(f"✅ Created binned feature: {bin_col}_binned")
354
+
355
+ st.markdown('</div>', unsafe_allow_html=True)
356
+
357
+ # Update session state
358
+ st.session_state.data = processed_df
359
+
360
+ # Preview processed data
361
+ st.markdown("---")
362
+ st.subheader("📋 Processed Data Preview")
363
+
364
+ data_to_show = st.session_state.data
365
+
366
+ col1, col2, col3 = st.columns(3)
367
+ with col1:
368
+ st.metric("Final Rows", data_to_show.shape[0])
369
+ with col2:
370
+ st.metric("Final Columns", data_to_show.shape[1])
371
+ with col3:
372
+ final_missing = data_to_show.isnull().sum().sum()
373
+ st.metric("Remaining Missing", final_missing)
374
+
375
+ st.dataframe(data_to_show.head(10), use_container_width=True)
376
+
377
+ # Download processed data
378
+ csv = data_to_show.to_csv(index=False)
379
+ st.download_button(
380
+ label="📥 Download Processed Data",
381
+ data=csv,
382
+ file_name="processed_data.csv",
383
+ mime="text/csv",
384
+ use_container_width=True
385
+ )
386
+
387
+ return data_to_show
data_quality.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from sklearn.ensemble import IsolationForest
7
+
8
+ def quality_report(df):
9
+
10
+ st.markdown("""
11
+ <div style='text-align: center; margin-bottom: 2rem;'>
12
+ <h2>📋 Data Quality Report</h2>
13
+ <p style='color: gray;'>Comprehensive data quality assessment</p>
14
+ </div>
15
+ """, unsafe_allow_html=True)
16
+
17
+ # Overall quality score
18
+ st.subheader("📊 Overall Data Quality Score")
19
+
20
+ # Calculate various quality metrics
21
+ completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
22
+ uniqueness = (1 - df.duplicated().sum() / df.shape[0]) * 100
23
+
24
+ # Data type consistency
25
+ type_consistency = 100
26
+ for col in df.columns:
27
+ if df[col].dtype == 'object':
28
+ # Check if column has consistent types
29
+ try:
30
+ pd.to_numeric(df[col], errors='raise')
31
+ # If convertible to numeric, it might be inconsistent
32
+ type_consistency -= 5
33
+ except:
34
+ pass
35
+
36
+ # Outlier impact
37
+ outlier_impact = 100
38
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
39
+ if len(numeric_cols) > 0:
40
+ for col in numeric_cols:
41
+ Q1 = df[col].quantile(0.25)
42
+ Q3 = df[col].quantile(0.75)
43
+ IQR = Q3 - Q1
44
+ outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
45
+ outlier_pct = len(outliers) / len(df) * 100
46
+ if outlier_pct > 10:
47
+ outlier_impact -= 10
48
+
49
+ quality_score = (completeness + uniqueness + type_consistency + outlier_impact) / 4
50
+
51
+ # Display gauge
52
+ fig = go.Figure(go.Indicator(
53
+ mode="gauge+number",
54
+ value=quality_score,
55
+ domain={'x': [0, 1], 'y': [0, 1]},
56
+ title={'text': "Quality Score"},
57
+ gauge={
58
+ 'axis': {'range': [None, 100]},
59
+ 'bar': {'color': "#2E86AB"},
60
+ 'steps': [
61
+ {'range': [0, 50], 'color': "#FF6B6B"},
62
+ {'range': [50, 70], 'color': "#FFD93D"},
63
+ {'range': [70, 85], 'color': "#6BCB77"},
64
+ {'range': [85, 100], 'color': "#4CAF50"}
65
+ ],
66
+ 'threshold': {
67
+ 'line': {'color': "red", 'width': 4},
68
+ 'thickness': 0.75,
69
+ 'value': 90
70
+ }
71
+ }))
72
+
73
+ st.plotly_chart(fig, use_container_width=True)
74
+
75
+ # Quality metrics cards
76
+ col1, col2, col3, col4 = st.columns(4)
77
+
78
+ with col1:
79
+ st.metric("Completeness", f"{completeness:.1f}%",
80
+ delta=None, delta_color="normal")
81
+
82
+ with col2:
83
+ st.metric("Uniqueness", f"{uniqueness:.1f}%",
84
+ delta=None, delta_color="normal")
85
+
86
+ with col3:
87
+ st.metric("Type Consistency", f"{type_consistency:.1f}%",
88
+ delta=None, delta_color="normal")
89
+
90
+ with col4:
91
+ st.metric("Outlier Impact", f"{outlier_impact:.1f}%",
92
+ delta=None, delta_color="inverse")
93
+
94
+ # Detailed quality report
95
+ st.subheader("🔍 Detailed Quality Report")
96
+
97
+ quality_df = pd.DataFrame({
98
+ 'Column': df.columns,
99
+ 'Data Type': df.dtypes,
100
+ 'Missing Count': df.isnull().sum().values,
101
+ 'Missing %': (df.isnull().sum().values / len(df) * 100).round(2),
102
+ 'Unique Values': [df[col].nunique() for col in df.columns],
103
+ 'Unique %': [round((df[col].nunique() / len(df) * 100),2) for col in df.columns],
104
+ 'Duplicate Values?': [df[col].duplicated().any() for col in df.columns]
105
+ })
106
+
107
+ # Add outlier info for numeric columns
108
+ outlier_info = []
109
+ for col in df.columns:
110
+ if df[col].dtype in ['int64', 'float64']:
111
+ Q1 = df[col].quantile(0.25)
112
+ Q3 = df[col].quantile(0.75)
113
+ IQR = Q3 - Q1
114
+ outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
115
+ outlier_info.append(len(outliers))
116
+ else:
117
+ outlier_info.append(0)
118
+
119
+ quality_df['Outliers'] = outlier_info
120
+
121
+ st.dataframe(quality_df.style.background_gradient(subset=['Missing %', 'Outliers'], cmap='YlOrRd'),
122
+ use_container_width=True)
123
+
124
+ # Visualizations
125
+ st.subheader("📊 Quality Visualizations")
126
+
127
+ col1, col2 = st.columns(2)
128
+
129
+ with col1:
130
+ # Missing values bar chart
131
+ missing_cols = df.isnull().sum()[df.isnull().sum() > 0]
132
+ if len(missing_cols) > 0:
133
+ fig = px.bar(x=missing_cols.index, y=missing_cols.values,
134
+ title="Missing Values by Column",
135
+ labels={'x': 'Column', 'y': 'Missing Count'})
136
+ st.plotly_chart(fig, use_container_width=True)
137
+ else:
138
+ st.success("No missing values found!")
139
+
140
+ with col2:
141
+ # Data type distribution
142
+ dtype_counts = df.dtypes.value_counts()
143
+ fig = px.pie(values=dtype_counts.values, names=dtype_counts.index.astype(str),
144
+ title="Data Type Distribution")
145
+ st.plotly_chart(fig, use_container_width=True)
146
+
147
+ # Outlier detection with Isolation Forest
148
+ st.subheader("🕵️ Anomaly Detection")
149
+
150
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
151
+
152
+ if len(numeric_cols) > 0:
153
+ contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01)
154
+
155
+ iso_forest = IsolationForest(contamination=contamination, random_state=42)
156
+ outliers = iso_forest.fit_predict(df[numeric_cols].fillna(0))
157
+
158
+ n_outliers = (outliers == -1).sum()
159
+ st.write(f"**Detected Anomalies:** {n_outliers} rows ({n_outliers/len(df)*100:.2f}%)")
160
+
161
+ # Visualize outliers (if 2 or 3 numeric columns)
162
+ if len(numeric_cols) >= 2:
163
+ df_with_outliers = df[numeric_cols[:3]].copy()
164
+ df_with_outliers['is_outlier'] = outliers
165
+
166
+ if len(numeric_cols) == 2:
167
+ fig = px.scatter(df_with_outliers, x=numeric_cols[0], y=numeric_cols[1],
168
+ color='is_outlier', title="Anomaly Detection Results",
169
+ color_continuous_scale=['blue', 'red'])
170
+ st.plotly_chart(fig, use_container_width=True)
171
+ elif len(numeric_cols) >= 3:
172
+ fig = px.scatter_3d(df_with_outliers, x=numeric_cols[0],
173
+ y=numeric_cols[1], z=numeric_cols[2],
174
+ color='is_outlier', title="Anomaly Detection Results (3D)",
175
+ color_continuous_scale=['blue', 'red'])
176
+ st.plotly_chart(fig, use_container_width=True)
177
+ else:
178
+ st.info("No numeric columns available for anomaly detection")
179
+
180
+ # Recommendations
181
+ st.subheader("💡 Quality Improvement Recommendations")
182
+
183
+ recommendations = []
184
+
185
+ # Missing value recommendations
186
+ missing_cols = df.columns[df.isnull().any()].tolist()
187
+ if missing_cols:
188
+ recommendations.append(f"• Handle missing values in {len(missing_cols)} columns: {', '.join(missing_cols[:5])}")
189
+
190
+ # Duplicate recommendations
191
+ if df.duplicated().sum() > 0:
192
+ recommendations.append(f"• Remove {df.duplicated().sum()} duplicate rows")
193
+
194
+ # Outlier recommendations
195
+ outlier_cols = []
196
+ for col in numeric_cols:
197
+ Q1 = df[col].quantile(0.25)
198
+ Q3 = df[col].quantile(0.75)
199
+ IQR = Q3 - Q1
200
+ outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
201
+ if len(outliers) > len(df) * 0.1: # More than 10% outliers
202
+ outlier_cols.append(col)
203
+
204
+ if outlier_cols:
205
+ recommendations.append(f"• Investigate outliers in: {', '.join(outlier_cols[:3])}")
206
+
207
+ # Data type recommendations
208
+ for col in df.columns:
209
+ if df[col].dtype == 'object':
210
+ # Check if column should be numeric
211
+ try:
212
+ pd.to_numeric(df[col].dropna().iloc[:100])
213
+ recommendations.append(f"• Convert '{col}' to numeric type")
214
+ except:
215
+ pass
216
+
217
+ if recommendations:
218
+ for rec in recommendations:
219
+ st.markdown(rec)
220
+ else:
221
+ st.success("✅ Dataset quality looks good! No major issues detected.")
222
+
223
+ # Download quality report
224
+ report_text = f"""
225
+ DATA QUALITY REPORT
226
+ ===================
227
+
228
+ Overall Quality Score: {quality_score:.1f}/100
229
+
230
+ Metrics:
231
+ • Completeness: {completeness:.1f}%
232
+ • Uniqueness: {uniqueness:.1f}%
233
+ • Type Consistency: {type_consistency:.1f}%
234
+ • Outlier Impact: {outlier_impact:.1f}%
235
+
236
+ Dataset Statistics:
237
+ • Rows: {df.shape[0]:,}
238
+ • Columns: {df.shape[1]}
239
+ • Missing Values: {df.isnull().sum().sum():,}
240
+ • Duplicate Rows: {df.duplicated().sum():,}
241
+
242
+ Recommendations:
243
+ {chr(10).join(recommendations)}
244
+ """
245
+
246
+ st.download_button(
247
+ label="📥 Download Quality Report",
248
+ data=report_text,
249
+ file_name="data_quality_report.txt",
250
+ mime="text/plain",
251
+ use_container_width=True
252
+ )
dataset_overview.py ADDED
@@ -0,0 +1,1159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from plotly.subplots import make_subplots
7
+
8
+ def eda_analysis(df):
9
+ """
10
+ Comprehensive Exploratory Data Analysis (EDA) with visual insights
11
+ """
12
+ st.markdown("""
13
+ <div style='text-align: center; margin-bottom: 2rem;'>
14
+ <h2>🔍 Exploratory Data Analysis (EDA)</h2>
15
+ <p style='color: gray;'>Discover patterns, relationships, and insights through visual exploration</p>
16
+ </div>
17
+ """, unsafe_allow_html=True)
18
+
19
+ # Error handling
20
+ if df.empty:
21
+ st.error("❌ The dataset is empty. Please upload a valid dataset.")
22
+ return
23
+
24
+ try:
25
+ # Create tabs for different EDA aspects
26
+ tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
27
+ "📋 Data Overview",
28
+ "🔍 Missing Data Analysis",
29
+ "📊 Univariate Analysis",
30
+ "🔄 Bivariate Analysis",
31
+ "📈 Multivariate Analysis",
32
+ "🎯 Pattern Discovery"
33
+ ])
34
+
35
+ with tab1:
36
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
37
+ st.subheader("📋 Dataset Overview")
38
+
39
+ try:
40
+ # Key metrics in cards
41
+ col1, col2, col3, col4 = st.columns(4)
42
+
43
+ with col1:
44
+ st.metric("Total Rows", f"{df.shape[0]:,}")
45
+ with col2:
46
+ st.metric("Total Columns", df.shape[1])
47
+ with col3:
48
+ memory_usage = df.memory_usage(deep=True).sum() / 1024**2
49
+ st.metric("Memory Usage", f"{memory_usage:.2f} MB")
50
+ with col4:
51
+ missing_total = df.isnull().sum().sum()
52
+ st.metric("Missing Values", f"{missing_total:,}")
53
+
54
+ # Data preview with interactive controls
55
+ st.subheader("🔍 Data Preview")
56
+ col1, col2 = st.columns(2)
57
+ with col1:
58
+ preview_rows = st.slider("Number of rows to display", 5, 50, 10, key="preview_rows")
59
+ with col2:
60
+ preview_type = st.radio("Preview type", ["Head", "Tail", "Random Sample"],
61
+ horizontal=True, key="preview_type")
62
+
63
+ if preview_type == "Head":
64
+ st.dataframe(df.head(preview_rows), use_container_width=True)
65
+ elif preview_type == "Tail":
66
+ st.dataframe(df.tail(preview_rows), use_container_width=True)
67
+ else:
68
+ if len(df) > preview_rows:
69
+ st.dataframe(df.sample(preview_rows), use_container_width=True)
70
+ else:
71
+ st.warning("⚠️ Sample size larger than dataset. Showing all rows.")
72
+ st.dataframe(df, use_container_width=True)
73
+
74
+ # Column information with visual indicators
75
+ st.subheader("📋 Column Information")
76
+
77
+ col_info = pd.DataFrame({
78
+ 'Column': df.columns,
79
+ 'Data Type': df.dtypes.astype(str),
80
+ 'Non-Null Count': df.count().values,
81
+ 'Null Count': df.isnull().sum().values,
82
+ 'Null %': (df.isnull().sum().values / len(df) * 100).round(2),
83
+ 'Unique Values': [df[col].nunique() for col in df.columns],
84
+ 'Sample Values': [str(df[col].dropna().iloc[:3].tolist()) if len(df[col].dropna()) > 0 else "All null" for col in df.columns]
85
+ })
86
+
87
+ # Add color coding for data types
88
+ def color_data_type(val):
89
+ if 'int' in val or 'float' in val:
90
+ return 'background-color: #e3f2fd'
91
+ elif 'object' in val:
92
+ return 'background-color: #f1f8e9'
93
+ elif 'datetime' in val:
94
+ return 'background-color: #fff3e0'
95
+ return ''
96
+
97
+ st.dataframe(col_info.style.applymap(color_data_type, subset=['Data Type']),
98
+ use_container_width=True)
99
+
100
+ # Data type distribution
101
+ st.subheader("📊 Data Type Distribution")
102
+
103
+ dtype_counts = df.dtypes.value_counts()
104
+ if len(dtype_counts) > 0:
105
+ fig = make_subplots(rows=1, cols=2,
106
+ specs=[[{"type": "pie"}, {"type": "bar"}]],
107
+ subplot_titles=("Pie Chart", "Bar Chart"))
108
+
109
+ fig.add_trace(go.Pie(labels=dtype_counts.index.astype(str),
110
+ values=dtype_counts.values,
111
+ hole=0.3), row=1, col=1)
112
+
113
+ fig.add_trace(go.Bar(x=dtype_counts.index.astype(str),
114
+ y=dtype_counts.values,
115
+ marker_color=['#42a5f5', '#66bb6a', '#ffa726'][:len(dtype_counts)]),
116
+ row=1, col=2)
117
+
118
+ fig.update_layout(height=400, title_text="Column Types Distribution")
119
+ st.plotly_chart(fig, use_container_width=True)
120
+ else:
121
+ st.warning("⚠️ No data type information available")
122
+
123
+ # Dataset statistics
124
+ st.subheader("📈 Dataset Statistics")
125
+
126
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
127
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
128
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
129
+ bool_cols = df.select_dtypes(include=['bool']).columns.tolist()
130
+
131
+ col1, col2, col3, col4 = st.columns(4)
132
+ with col1:
133
+ st.info(f"**Numeric:** {len(numeric_cols)} columns")
134
+ with col2:
135
+ st.info(f"**Categorical:** {len(categorical_cols)} columns")
136
+ with col3:
137
+ st.info(f"**Datetime:** {len(datetime_cols)} columns")
138
+ with col4:
139
+ st.info(f"**Boolean:** {len(bool_cols)} columns")
140
+
141
+ except Exception as e:
142
+ st.error(f"❌ Error in data overview: {str(e)}")
143
+ st.info("💡 Tip: Check if your dataset contains valid data types")
144
+
145
+ st.markdown('</div>', unsafe_allow_html=True)
146
+
147
+ with tab2:
148
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
149
+ st.subheader("🔍 Missing Data Analysis")
150
+
151
+ try:
152
+ if df.isnull().sum().sum() > 0:
153
+ # Missing data overview
154
+ missing_df = pd.DataFrame({
155
+ 'Column': df.columns,
156
+ 'Missing Count': df.isnull().sum().values,
157
+ 'Missing %': (df.isnull().sum().values / len(df) * 100).round(2)
158
+ }).sort_values('Missing %', ascending=False)
159
+
160
+ missing_df = missing_df[missing_df['Missing Count'] > 0]
161
+
162
+ if len(missing_df) > 0:
163
+ # Visualize missing data
164
+ fig = make_subplots(rows=2, cols=2,
165
+ subplot_titles=("Missing Values Heatmap",
166
+ "Missing Values by Column",
167
+ "Missing Data Patterns",
168
+ "Missing Data Matrix"),
169
+ specs=[[{"type": "heatmap"}, {"type": "bar"}],
170
+ [{"type": "scatter"}, {"type": "heatmap"}]])
171
+
172
+ # Heatmap of missing values
173
+ missing_matrix = df.isnull().astype(int).T
174
+ fig.add_trace(go.Heatmap(z=missing_matrix.values,
175
+ y=missing_matrix.index,
176
+ colorscale='Reds',
177
+ showscale=False), row=1, col=1)
178
+
179
+ # Bar chart of missing values
180
+ fig.add_trace(go.Bar(x=missing_df['Column'].head(20),
181
+ y=missing_df['Missing Count'].head(20),
182
+ marker_color='#ef5350',
183
+ name="Missing Count"), row=1, col=2)
184
+
185
+ # Missing data patterns (rows with missing data)
186
+ missing_rows = df[df.isnull().any(axis=1)]
187
+ if len(missing_rows) > 0:
188
+ pattern_df = missing_rows.isnull().sum(axis=1).value_counts().reset_index()
189
+ pattern_df.columns = ['Missing Count per Row', 'Number of Rows']
190
+ pattern_df = pattern_df.sort_values('Missing Count per Row')
191
+
192
+ fig.add_trace(go.Scatter(x=pattern_df['Missing Count per Row'],
193
+ y=pattern_df['Number of Rows'],
194
+ mode='lines+markers',
195
+ name="Patterns"), row=2, col=1)
196
+
197
+ # Missing data matrix for first 50 rows
198
+ sample_missing = df.head(min(50, len(df))).isnull().astype(int).T
199
+ fig.add_trace(go.Heatmap(z=sample_missing.values,
200
+ y=sample_missing.index,
201
+ colorscale='Reds',
202
+ showscale=False,
203
+ name="Matrix"), row=2, col=2)
204
+
205
+ fig.update_layout(height=800, title_text="Missing Data Analysis",
206
+ showlegend=False)
207
+ st.plotly_chart(fig, use_container_width=True)
208
+
209
+ # Detailed missing data table
210
+ st.subheader("📋 Missing Data Details")
211
+
212
+ # Add severity classification
213
+ def classify_severity(pct):
214
+ if pct == 0:
215
+ return "✅ None"
216
+ elif pct < 5:
217
+ return "🟢 Low"
218
+ elif pct < 20:
219
+ return "🟡 Medium"
220
+ else:
221
+ return "🔴 High"
222
+
223
+ missing_df['Severity'] = missing_df['Missing %'].apply(classify_severity)
224
+ missing_df['Recommendation'] = missing_df['Missing %'].apply(
225
+ lambda x: "No action needed" if x == 0 else
226
+ "Consider imputation" if x < 5 else
227
+ "Imputation recommended" if x < 20 else
228
+ "Consider dropping column"
229
+ )
230
+
231
+ st.dataframe(missing_df, use_container_width=True)
232
+
233
+ # Missing data patterns
234
+ if len(missing_df) > 1:
235
+ st.subheader("🔄 Missing Data Patterns")
236
+
237
+ # Find columns with similar missing patterns
238
+ missing_corr = df[missing_df['Column'].tolist()].isnull().corr()
239
+
240
+ if len(missing_corr) > 1:
241
+ fig = px.imshow(missing_corr,
242
+ text_auto=True,
243
+ aspect="auto",
244
+ color_continuous_scale='RdBu_r',
245
+ title="Missing Value Correlation Matrix")
246
+ st.plotly_chart(fig, use_container_width=True)
247
+
248
+ # Find highly correlated missing patterns
249
+ high_corr = []
250
+ for i in range(len(missing_corr.columns)):
251
+ for j in range(i+1, len(missing_corr.columns)):
252
+ if abs(missing_corr.iloc[i, j]) > 0.7:
253
+ high_corr.append({
254
+ 'Column 1': missing_corr.columns[i],
255
+ 'Column 2': missing_corr.columns[j],
256
+ 'Correlation': missing_corr.iloc[i, j]
257
+ })
258
+
259
+ if high_corr:
260
+ st.info("🔍 **Columns with similar missing patterns:**")
261
+ for item in high_corr[:5]: # Show top 5
262
+ st.write(f"• {item['Column 1']} & {item['Column 2']}: {item['Correlation']:.2f}")
263
+ else:
264
+ st.success("✅ No missing values found in the dataset!")
265
+ else:
266
+ st.success("✅ No missing values found in the dataset!")
267
+
268
+ # Show complete data visualization
269
+ fig = go.Figure()
270
+ fig.add_trace(go.Indicator(
271
+ mode="number+gauge",
272
+ value=100,
273
+ title={'text': "Data Completeness"},
274
+ gauge={'axis': {'range': [0, 100]},
275
+ 'bar': {'color': "green"},
276
+ 'steps': [{'range': [0, 100], 'color': "lightgreen"}]}
277
+ ))
278
+ st.plotly_chart(fig, use_container_width=True)
279
+
280
+ except Exception as e:
281
+ st.error(f"❌ Error in missing data analysis: {str(e)}")
282
+ st.info("💡 Tip: Ensure your dataset has valid data for missing value analysis")
283
+
284
+ st.markdown('</div>', unsafe_allow_html=True)
285
+
286
+ with tab3:
287
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
288
+ st.subheader("📊 Univariate Analysis")
289
+
290
+ try:
291
+ col_type = st.radio("Select column type", ["Numeric", "Categorical", "Datetime"],
292
+ horizontal=True, key="univariate_type")
293
+
294
+ if col_type == "Numeric":
295
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
296
+ if numeric_cols:
297
+ selected_col = st.selectbox("Select numeric column", numeric_cols, key="univariate_num")
298
+
299
+ data = df[selected_col].dropna()
300
+
301
+ if len(data) > 0:
302
+ # Create comprehensive visualization
303
+ fig = make_subplots(rows=2, cols=3,
304
+ subplot_titles=("Histogram", "Box Plot", "Violin Plot",
305
+ "ECDF", "QQ Plot", "Summary Stats"),
306
+ specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
307
+ [{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]])
308
+
309
+ # Histogram
310
+ fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Histogram",
311
+ marker_color='#42a5f5'), row=1, col=1)
312
+
313
+ # Box plot
314
+ fig.add_trace(go.Box(y=data, name="Box Plot", boxpoints='outliers',
315
+ marker_color='#66bb6a'), row=1, col=2)
316
+
317
+ # Violin plot
318
+ fig.add_trace(go.Violin(y=data, name="Violin Plot", box_visible=True,
319
+ line_color='black', fillcolor='#ffa726',
320
+ opacity=0.6), row=1, col=3)
321
+
322
+ # ECDF
323
+ sorted_data = np.sort(data)
324
+ ecdf = np.arange(1, len(sorted_data)+1) / len(sorted_data)
325
+ fig.add_trace(go.Scatter(x=sorted_data, y=ecdf, mode='lines',
326
+ name="ECDF", line=dict(color='#ab47bc')),
327
+ row=2, col=1)
328
+
329
+ # QQ plot
330
+ theoretical_q = np.random.normal(data.mean(), data.std(), len(data))
331
+ theoretical_q.sort()
332
+ fig.add_trace(go.Scatter(x=theoretical_q, y=sorted_data,
333
+ mode='markers', name="QQ Plot",
334
+ marker=dict(color='#7e57c2', size=3)),
335
+ row=2, col=2)
336
+
337
+ # Add reference line to QQ plot
338
+ min_val = min(theoretical_q.min(), sorted_data.min())
339
+ max_val = max(theoretical_q.max(), sorted_data.max())
340
+ fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
341
+ mode='lines', line=dict(color='red', dash='dash'),
342
+ showlegend=False), row=2, col=2)
343
+
344
+ # Summary statistics as table
345
+ stats_text = f"""
346
+ <b>Summary Statistics</b><br>
347
+ Count: {len(data):,}<br>
348
+ Mean: {data.mean():.4f}<br>
349
+ Std: {data.std():.4f}<br>
350
+ Min: {data.min():.4f}<br>
351
+ Q1: {data.quantile(0.25):.4f}<br>
352
+ Median: {data.median():.4f}<br>
353
+ Q3: {data.quantile(0.75):.4f}<br>
354
+ Max: {data.max():.4f}<br>
355
+ IQR: {data.quantile(0.75) - data.quantile(0.25):.4f}<br>
356
+ Skewness: {data.skew():.4f}<br>
357
+ Kurtosis: {data.kurtosis():.4f}
358
+ """
359
+
360
+ fig.add_annotation(x=0.5, y=0.5, text=stats_text,
361
+ showarrow=False, font=dict(size=10),
362
+ row=2, col=3, align='left')
363
+
364
+ fig.update_layout(height=800, title_text=f"Univariate Analysis: {selected_col}")
365
+ st.plotly_chart(fig, use_container_width=True)
366
+
367
+ # Outlier detection
368
+ Q1 = data.quantile(0.25)
369
+ Q3 = data.quantile(0.75)
370
+ IQR = Q3 - Q1
371
+ outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)]
372
+
373
+ col1, col2 = st.columns(2)
374
+ with col1:
375
+ st.metric("Outliers Count", len(outliers))
376
+ with col2:
377
+ st.metric("Outliers %", f"{len(outliers)/len(data)*100:.2f}%")
378
+
379
+ if len(outliers) > 0:
380
+ with st.expander("View outlier values"):
381
+ st.write(outliers.tolist()[:20]) # Show first 20 outliers
382
+ if len(outliers) > 20:
383
+ st.info(f"... and {len(outliers) - 20} more outliers")
384
+ else:
385
+ st.warning("⚠️ No numeric columns available for analysis")
386
+
387
+ elif col_type == "Categorical":
388
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
389
+ if categorical_cols:
390
+ selected_col = st.selectbox("Select categorical column", categorical_cols,
391
+ key="univariate_cat")
392
+
393
+ # Get value counts
394
+ value_counts = df[selected_col].value_counts().reset_index()
395
+ value_counts.columns = [selected_col, 'count']
396
+ value_counts['percentage'] = (value_counts['count'] / len(df) * 100).round(2)
397
+
398
+ if len(value_counts) > 0:
399
+ # Create visualizations
400
+ fig = make_subplots(rows=2, cols=2,
401
+ subplot_titles=("Bar Chart (Top 20)", "Pie Chart (Top 10)",
402
+ "Treemap (Top 10)", "Frequency Table"),
403
+ specs=[[{"type": "xy"}, {"type": "domain"}],
404
+ [{"type": "domain"}, {"type": "table"}]])
405
+
406
+ # Bar chart (top 20)
407
+ top20 = value_counts.head(20)
408
+ fig.add_trace(go.Bar(x=top20[selected_col],
409
+ y=top20['count'],
410
+ marker_color='#42a5f5',
411
+ name="Count"), row=1, col=1)
412
+
413
+ # Pie chart (top 10)
414
+ top10 = value_counts.head(10)
415
+ fig.add_trace(go.Pie(labels=top10[selected_col],
416
+ values=top10['count'],
417
+ hole=0.3,
418
+ textinfo='percent+label',
419
+ name="Proportion"), row=1, col=2)
420
+
421
+ # Treemap (top 10)
422
+ fig.add_trace(go.Treemap(labels=top10[selected_col],
423
+ parents=['']*len(top10),
424
+ values=top10['count'],
425
+ textinfo='label+value',
426
+ name="Treemap"), row=2, col=1)
427
+
428
+ # Frequency table (top 10)
429
+ fig.add_trace(go.Table(header=dict(values=[selected_col, 'Count', 'Percentage']),
430
+ cells=dict(values=[top10[selected_col].tolist(),
431
+ top10['count'].tolist(),
432
+ top10['percentage'].tolist()]),
433
+ name="Table"), row=2, col=2)
434
+
435
+ fig.update_layout(height=800, title_text=f"Categorical Analysis: {selected_col}")
436
+ st.plotly_chart(fig, use_container_width=True)
437
+
438
+ # Summary statistics for categorical
439
+ col1, col2, col3 = st.columns(3)
440
+ with col1:
441
+ st.metric("Unique Values", f"{value_counts.shape[0]:,}")
442
+ with col2:
443
+ st.metric("Most Frequent", f"{value_counts.iloc[0, 0]}")
444
+ with col3:
445
+ st.metric("Frequency", f"{value_counts.iloc[0, 1]:,} ({value_counts.iloc[0, 2]}%)")
446
+
447
+ # Cardinality warning
448
+ if value_counts.shape[0] > 50:
449
+ st.warning(f"⚠️ High cardinality detected: {value_counts.shape[0]} unique values. Consider grouping rare categories.")
450
+ else:
451
+ st.warning("⚠️ No categorical columns available for analysis")
452
+
453
+ elif col_type == "Datetime":
454
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
455
+ if datetime_cols:
456
+ selected_col = st.selectbox("Select datetime column", datetime_cols,
457
+ key="univariate_datetime")
458
+
459
+ # Extract temporal features
460
+ df_temp = df[selected_col].dropna()
461
+
462
+ if len(df_temp) > 0:
463
+ # Create temporal distributions
464
+ fig = make_subplots(rows=2, cols=2,
465
+ subplot_titles=("Year Distribution", "Month Distribution",
466
+ "Day of Week Distribution", "Hour Distribution"),
467
+ specs=[[{"type": "xy"}, {"type": "xy"}],
468
+ [{"type": "xy"}, {"type": "xy"}]])
469
+
470
+ # Year distribution
471
+ years = df_temp.dt.year.value_counts().sort_index()
472
+ if len(years) > 0:
473
+ fig.add_trace(go.Bar(x=years.index.astype(str), y=years.values,
474
+ marker_color='#42a5f5', name="Year"), row=1, col=1)
475
+
476
+ # Month distribution
477
+ months = df_temp.dt.month.value_counts().sort_index()
478
+ month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
479
+ 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
480
+ if len(months) > 0:
481
+ fig.add_trace(go.Bar(x=[month_names[i-1] for i in months.index],
482
+ y=months.values, marker_color='#66bb6a',
483
+ name="Month"), row=1, col=2)
484
+
485
+ # Day of week distribution
486
+ days = df_temp.dt.dayofweek.value_counts().sort_index()
487
+ day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
488
+ if len(days) > 0:
489
+ fig.add_trace(go.Bar(x=[day_names[i] for i in days.index],
490
+ y=days.values, marker_color='#ffa726',
491
+ name="Day of Week"), row=2, col=1)
492
+
493
+ # Hour distribution (if time component exists)
494
+ if df_temp.dt.hour.nunique() > 1:
495
+ hours = df_temp.dt.hour.value_counts().sort_index()
496
+ fig.add_trace(go.Bar(x=hours.index.astype(str), y=hours.values,
497
+ marker_color='#ab47bc', name="Hour"), row=2, col=2)
498
+
499
+ fig.update_layout(height=800, title_text=f"Temporal Analysis: {selected_col}")
500
+ st.plotly_chart(fig, use_container_width=True)
501
+
502
+ # Date range information
503
+ col1, col2, col3 = st.columns(3)
504
+ with col1:
505
+ st.metric("Start Date", df_temp.min().strftime('%Y-%m-%d'))
506
+ with col2:
507
+ st.metric("End Date", df_temp.max().strftime('%Y-%m-%d'))
508
+ with col3:
509
+ date_range = (df_temp.max() - df_temp.min()).days
510
+ st.metric("Date Range", f"{date_range} days")
511
+ else:
512
+ st.warning("⚠️ No datetime columns available for analysis")
513
+
514
+ except Exception as e:
515
+ st.error(f"❌ Error in univariate analysis: {str(e)}")
516
+ st.info("💡 Tip: Ensure the selected column contains valid data for analysis")
517
+
518
+ st.markdown('</div>', unsafe_allow_html=True)
519
+
520
+ with tab4:
521
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
522
+ st.subheader("🔄 Bivariate Analysis")
523
+
524
+ try:
525
+ analysis_type = st.radio("Select analysis type",
526
+ ["Numeric vs Numeric", "Numeric vs Categorical",
527
+ "Categorical vs Categorical"],
528
+ horizontal=True, key="bivariate_type")
529
+
530
+ if analysis_type == "Numeric vs Numeric":
531
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
532
+ if len(numeric_cols) >= 2:
533
+ col1, col2 = st.columns(2)
534
+ with col1:
535
+ x_col = st.selectbox("Select X axis", numeric_cols, key="bi_x")
536
+ with col2:
537
+ y_col = st.selectbox("Select Y axis", [c for c in numeric_cols if c != x_col],
538
+ key="bi_y")
539
+
540
+ # Clean data for analysis
541
+ plot_df = df[[x_col, y_col]].dropna()
542
+
543
+ if len(plot_df) > 0:
544
+ # Create comprehensive visualization
545
+ fig = make_subplots(rows=2, cols=3,
546
+ subplot_titles=("Scatter Plot", "Hexbin Plot", "Density Contour",
547
+ "Marginal Distributions", "Residuals", "Statistics"),
548
+ specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
549
+ [{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]])
550
+
551
+ # Scatter plot with trendline
552
+ fig.add_trace(go.Scatter(x=plot_df[x_col], y=plot_df[y_col],
553
+ mode='markers', name="Scatter",
554
+ marker=dict(size=5, opacity=0.6, color='#42a5f5')),
555
+ row=1, col=1)
556
+
557
+ # Add trendline
558
+ try:
559
+ z = np.polyfit(plot_df[x_col], plot_df[y_col], 1)
560
+ p = np.poly1d(z)
561
+ x_range = np.linspace(plot_df[x_col].min(), plot_df[x_col].max(), 100)
562
+ fig.add_trace(go.Scatter(x=x_range, y=p(x_range),
563
+ mode='lines', name="Trend",
564
+ line=dict(color='red', width=2)), row=1, col=1)
565
+ except:
566
+ pass
567
+
568
+ # Hexbin plot
569
+ fig.add_trace(go.Histogram2d(x=plot_df[x_col], y=plot_df[y_col],
570
+ colorscale='Viridis',
571
+ name="Hexbin"), row=1, col=2)
572
+
573
+ # Density contour
574
+ fig.add_trace(go.Histogram2dContour(x=plot_df[x_col], y=plot_df[y_col],
575
+ colorscale='Viridis',
576
+ name="Contour"), row=1, col=3)
577
+
578
+ # Marginal distributions
579
+ fig.add_trace(go.Histogram(x=plot_df[x_col], name=f"{x_col}",
580
+ marker_color='#66bb6a'), row=2, col=1)
581
+ fig.add_trace(go.Histogram(y=plot_df[y_col], name=f"{y_col}",
582
+ marker_color='#ffa726', orientation='h'),
583
+ row=2, col=1)
584
+
585
+ # Residuals
586
+ try:
587
+ residuals = plot_df[y_col] - p(plot_df[x_col])
588
+ fig.add_trace(go.Scatter(x=plot_df[x_col], y=residuals,
589
+ mode='markers', name="Residuals",
590
+ marker=dict(size=3, opacity=0.5, color='#ab47bc')),
591
+ row=2, col=2)
592
+ fig.add_hline(y=0, line_dash="dash", line_color="red", row=2, col=2)
593
+ except:
594
+ pass
595
+
596
+ # Statistics
597
+ corr = plot_df[x_col].corr(plot_df[y_col])
598
+ stats_text = f"""
599
+ <b>Statistics</b><br>
600
+ Correlation: {corr:.4f}<br>
601
+ R²: {corr**2:.4f}<br>
602
+ Covariance: {plot_df[x_col].cov(plot_df[y_col]):.4f}<br>
603
+ Sample Size: {len(plot_df)}<br>
604
+ """
605
+
606
+ fig.add_annotation(x=0.5, y=0.5, text=stats_text,
607
+ showarrow=False, font=dict(size=10),
608
+ row=2, col=3, align='left')
609
+
610
+ fig.update_layout(height=800, title_text=f"Bivariate Analysis: {x_col} vs {y_col}")
611
+ st.plotly_chart(fig, use_container_width=True)
612
+
613
+ # Correlation interpretation
614
+ if abs(corr) > 0.7:
615
+ st.success(f"✅ Strong {'positive' if corr > 0 else 'negative'} correlation detected")
616
+ elif abs(corr) > 0.3:
617
+ st.info(f"ℹ️ Moderate {'positive' if corr > 0 else 'negative'} correlation detected")
618
+ else:
619
+ st.warning(f"⚠️ Weak or no correlation detected")
620
+ else:
621
+ st.warning("⚠️ Need at least 2 numeric columns for this analysis")
622
+
623
+ elif analysis_type == "Numeric vs Categorical":
624
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
625
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
626
+
627
+ if numeric_cols and categorical_cols:
628
+ col1, col2 = st.columns(2)
629
+ with col1:
630
+ num_col = st.selectbox("Select numeric column", numeric_cols, key="bi_num")
631
+ with col2:
632
+ cat_col = st.selectbox("Select categorical column", categorical_cols, key="bi_cat")
633
+
634
+ # Clean data
635
+ plot_df = df[[num_col, cat_col]].dropna()
636
+
637
+ if len(plot_df) > 0 and plot_df[cat_col].nunique() <= 30:
638
+ # Create visualizations
639
+ fig = make_subplots(rows=2, cols=2,
640
+ subplot_titles=("Box Plot", "Violin Plot",
641
+ "Strip Plot", "Bar Chart (Means ± SD)"),
642
+ specs=[[{"type": "xy"}, {"type": "xy"}],
643
+ [{"type": "xy"}, {"type": "xy"}]])
644
+
645
+ # Box plot
646
+ fig.add_trace(go.Box(x=plot_df[cat_col], y=plot_df[num_col],
647
+ name="Box Plot", marker_color='#42a5f5'), row=1, col=1)
648
+
649
+ # Violin plot
650
+ fig.add_trace(go.Violin(x=plot_df[cat_col], y=plot_df[num_col],
651
+ box_visible=True, line_color='black',
652
+ fillcolor='#66bb6a', opacity=0.6,
653
+ name="Violin Plot"), row=1, col=2)
654
+
655
+ # Strip plot
656
+ fig.add_trace(go.Scatter(x=plot_df[cat_col], y=plot_df[num_col],
657
+ mode='markers', name="Strip Plot",
658
+ marker=dict(size=3, opacity=0.3, color='#ffa726')),
659
+ row=2, col=1)
660
+
661
+ # Bar chart with error bars
662
+ stats_by_cat = plot_df.groupby(cat_col)[num_col].agg(['mean', 'std', 'count']).reset_index()
663
+ stats_by_cat = stats_by_cat.sort_values('mean', ascending=False).head(15)
664
+
665
+ fig.add_trace(go.Bar(x=stats_by_cat[cat_col], y=stats_by_cat['mean'],
666
+ error_y=dict(type='data', array=stats_by_cat['std']),
667
+ name="Mean ± SD", marker_color='#ab47bc'),
668
+ row=2, col=2)
669
+
670
+ fig.update_layout(height=800, title_text=f"{num_col} by {cat_col}")
671
+ st.plotly_chart(fig, use_container_width=True)
672
+
673
+ # ANOVA test for groups with >2 categories
674
+ if plot_df[cat_col].nunique() >= 2:
675
+ groups = [group[num_col].values for name, group in plot_df.groupby(cat_col)]
676
+ if all(len(g) > 0 for g in groups):
677
+ f_stat, p_val = stats.f_oneway(*groups)
678
+ st.write(f"**One-way ANOVA Results:** F-statistic = {f_stat:.4f}, p-value = {p_val:.4f}")
679
+ if p_val < 0.05:
680
+ st.success("✅ Significant differences exist between groups")
681
+ else:
682
+ st.info("ℹ️ No significant differences found between groups")
683
+ elif plot_df[cat_col].nunique() > 30:
684
+ st.warning(f"⚠️ Categorical column has {plot_df[cat_col].nunique()} unique values. Consider grouping or selecting another column.")
685
+ else:
686
+ st.warning("⚠️ Need both numeric and categorical columns for this analysis")
687
+
688
+ elif analysis_type == "Categorical vs Categorical":
689
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
690
+
691
+ if len(categorical_cols) >= 2:
692
+ col1, col2 = st.columns(2)
693
+ with col1:
694
+ cat1 = st.selectbox("Select first categorical column", categorical_cols, key="bi_cat1")
695
+ with col2:
696
+ cat2 = st.selectbox("Select second categorical column",
697
+ [c for c in categorical_cols if c != cat1], key="bi_cat2")
698
+
699
+ # Create contingency table
700
+ contingency = pd.crosstab(df[cat1], df[cat2])
701
+
702
+ if contingency.size > 0:
703
+ fig = make_subplots(rows=1, cols=2,
704
+ subplot_titles=("Stacked Bar Chart", "Heatmap"),
705
+ specs=[[{"type": "xy"}, {"type": "heatmap"}]])
706
+
707
+ # Stacked bar chart
708
+ for col in contingency.columns[:10]: # Limit to 10 categories
709
+ fig.add_trace(go.Bar(x=contingency.index[:10], y=contingency[col][:10],
710
+ name=str(col)), row=1, col=1)
711
+
712
+ # Heatmap
713
+ fig.add_trace(go.Heatmap(z=contingency.values[:10, :10],
714
+ x=contingency.columns[:10].astype(str),
715
+ y=contingency.index[:10].astype(str),
716
+ colorscale='Viridis',
717
+ text=contingency.values[:10, :10],
718
+ texttemplate="%{text}"), row=1, col=2)
719
+
720
+ fig.update_layout(height=600, title_text=f"Relationship: {cat1} vs {cat2}",
721
+ barmode='stack')
722
+ st.plotly_chart(fig, use_container_width=True)
723
+
724
+ # Chi-square test
725
+ from scipy.stats import chi2_contingency
726
+ chi2, p_val, dof, expected = chi2_contingency(contingency)
727
+
728
+ st.write(f"**Chi-square Test Results:**")
729
+ st.write(f"χ² = {chi2:.4f}, df = {dof}, p-value = {p_val:.4f}")
730
+
731
+ if p_val < 0.05:
732
+ st.success("✅ Significant association found between variables")
733
+
734
+ # Cramer's V for effect size
735
+ n = contingency.sum().sum()
736
+ cramer_v = np.sqrt(chi2 / (n * (min(contingency.shape) - 1)))
737
+ st.write(f"**Cramer's V (effect size):** {cramer_v:.4f}")
738
+ else:
739
+ st.info("ℹ️ No significant association found")
740
+ else:
741
+ st.warning("⚠️ Need at least 2 categorical columns for this analysis")
742
+
743
+ except Exception as e:
744
+ st.error(f"❌ Error in bivariate analysis: {str(e)}")
745
+ st.info("💡 Tip: Check if selected columns have sufficient data for analysis")
746
+
747
+ st.markdown('</div>', unsafe_allow_html=True)
748
+
749
+ with tab5:
750
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
751
+ st.subheader("📈 Multivariate Analysis")
752
+
753
+ try:
754
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
755
+
756
+ if len(numeric_cols) >= 3:
757
+ analysis_type = st.radio("Select analysis type",
758
+ ["Correlation Matrix", "Parallel Coordinates",
759
+ "3D Scatter", "Radar Chart"],
760
+ horizontal=True, key="multivariate_type")
761
+
762
+ if analysis_type == "Correlation Matrix":
763
+ corr_matrix = df[numeric_cols].corr()
764
+
765
+ fig = px.imshow(corr_matrix,
766
+ text_auto=True,
767
+ aspect="auto",
768
+ color_continuous_scale='RdBu_r',
769
+ title="Correlation Matrix Heatmap",
770
+ zmin=-1, zmax=1)
771
+
772
+ fig.update_layout(height=700)
773
+ st.plotly_chart(fig, use_container_width=True)
774
+
775
+ # Find highly correlated pairs
776
+ high_corr = []
777
+ for i in range(len(numeric_cols)):
778
+ for j in range(i+1, len(numeric_cols)):
779
+ if abs(corr_matrix.iloc[i, j]) > 0.7:
780
+ high_corr.append({
781
+ 'Feature 1': numeric_cols[i],
782
+ 'Feature 2': numeric_cols[j],
783
+ 'Correlation': corr_matrix.iloc[i, j]
784
+ })
785
+
786
+ if high_corr:
787
+ st.subheader("🔍 Highly Correlated Pairs (|r| > 0.7)")
788
+ for item in high_corr:
789
+ st.write(f"• **{item['Feature 1']}** & **{item['Feature 2']}**: {item['Correlation']:.4f}")
790
+
791
+ elif analysis_type == "Parallel Coordinates":
792
+ # Select dimensions
793
+ selected_dims = st.multiselect("Select dimensions (columns)",
794
+ numeric_cols,
795
+ default=numeric_cols[:min(4, len(numeric_cols))])
796
+
797
+ if len(selected_dims) >= 2:
798
+ # Optional color dimension
799
+ color_dim = st.selectbox("Color by", ["None"] + numeric_cols +
800
+ df.select_dtypes(include=['object', 'category']).columns.tolist())
801
+
802
+ plot_df = df[selected_dims].dropna()
803
+
804
+ if len(plot_df) > 0:
805
+ if color_dim == "None":
806
+ fig = px.parallel_coordinates(plot_df,
807
+ dimensions=selected_dims,
808
+ title="Parallel Coordinates Plot")
809
+ else:
810
+ if color_dim in numeric_cols:
811
+ fig = px.parallel_coordinates(plot_df,
812
+ dimensions=selected_dims,
813
+ color=color_dim,
814
+ color_continuous_scale=px.colors.diverging.RdBu,
815
+ title=f"Parallel Coordinates colored by {color_dim}")
816
+ else:
817
+ # Categorical color
818
+ temp_df = df[selected_dims + [color_dim]].dropna()
819
+ fig = px.parallel_coordinates(temp_df,
820
+ dimensions=selected_dims,
821
+ color=color_dim,
822
+ title=f"Parallel Coordinates colored by {color_dim}")
823
+
824
+ fig.update_layout(height=600)
825
+ st.plotly_chart(fig, use_container_width=True)
826
+
827
+ elif analysis_type == "3D Scatter":
828
+ if len(numeric_cols) >= 3:
829
+ col1, col2, col3 = st.columns(3)
830
+ with col1:
831
+ x_3d = st.selectbox("X axis", numeric_cols, key="3d_x")
832
+ with col2:
833
+ y_3d = st.selectbox("Y axis", [c for c in numeric_cols if c != x_3d], key="3d_y")
834
+ with col3:
835
+ z_3d = st.selectbox("Z axis", [c for c in numeric_cols if c not in [x_3d, y_3d]],
836
+ key="3d_z")
837
+
838
+ color_3d = st.selectbox("Color by", ["None"] +
839
+ df.select_dtypes(include=['object', 'category']).columns.tolist())
840
+
841
+ plot_df = df[[x_3d, y_3d, z_3d]].dropna()
842
+
843
+ if len(plot_df) > 0:
844
+ if color_3d == "None":
845
+ fig = px.scatter_3d(plot_df, x=x_3d, y=y_3d, z=z_3d,
846
+ title=f"3D Scatter Plot",
847
+ opacity=0.7)
848
+ else:
849
+ temp_df = df[[x_3d, y_3d, z_3d, color_3d]].dropna()
850
+ fig = px.scatter_3d(temp_df, x=x_3d, y=y_3d, z=z_3d,
851
+ color=color_3d,
852
+ title=f"3D Scatter colored by {color_3d}",
853
+ opacity=0.7)
854
+
855
+ fig.update_layout(height=700)
856
+ st.plotly_chart(fig, use_container_width=True)
857
+
858
+ elif analysis_type == "Radar Chart":
859
+ # Select features for radar
860
+ radar_features = st.multiselect("Select features for radar chart",
861
+ numeric_cols,
862
+ default=numeric_cols[:min(5, len(numeric_cols))])
863
+
864
+ if len(radar_features) >= 3:
865
+ # Select how many samples to show
866
+ n_samples = st.slider("Number of samples to show", 1, min(10, len(df)), 3)
867
+
868
+ fig = go.Figure()
869
+
870
+ for i in range(n_samples):
871
+ sample = df.iloc[i][radar_features].values
872
+ fig.add_trace(go.Scatterpolar(
873
+ r=sample,
874
+ theta=radar_features,
875
+ fill='toself',
876
+ name=f'Sample {i}'
877
+ ))
878
+
879
+ fig.update_layout(
880
+ polar=dict(
881
+ radialaxis=dict(
882
+ visible=True,
883
+ range=[df[radar_features].min().min(), df[radar_features].max().max()]
884
+ )),
885
+ title=f"Radar Chart - First {n_samples} Samples",
886
+ height=600
887
+ )
888
+
889
+ st.plotly_chart(fig, use_container_width=True)
890
+ else:
891
+ st.warning("⚠️ Need at least 3 numeric columns for multivariate analysis")
892
+
893
+ except Exception as e:
894
+ st.error(f"❌ Error in multivariate analysis: {str(e)}")
895
+ st.info("💡 Tip: Ensure you have enough numeric columns for multivariate analysis")
896
+
897
+ st.markdown('</div>', unsafe_allow_html=True)
898
+
899
+ with tab6:
900
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
901
+ st.subheader("🎯 Pattern Discovery")
902
+
903
+ try:
904
+ analysis_type = st.radio("Select pattern discovery method",
905
+ ["Clustering Visualization", "Outlier Detection",
906
+ "Trend Detection", "Seasonal Patterns"],
907
+ horizontal=True, key="pattern_type")
908
+
909
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
910
+
911
+ if analysis_type == "Clustering Visualization":
912
+ if len(numeric_cols) >= 2:
913
+ from sklearn.cluster import KMeans
914
+ from sklearn.preprocessing import StandardScaler
915
+
916
+ # Select features for clustering
917
+ cluster_features = st.multiselect("Select features for clustering",
918
+ numeric_cols,
919
+ default=numeric_cols[:min(3, len(numeric_cols))])
920
+
921
+ if len(cluster_features) >= 2:
922
+ n_clusters = st.slider("Number of clusters", 2, 8, 3)
923
+
924
+ # Prepare data
925
+ X = df[cluster_features].dropna()
926
+
927
+ if len(X) > 0:
928
+ # Scale data
929
+ scaler = StandardScaler()
930
+ X_scaled = scaler.fit_transform(X)
931
+
932
+ # Perform clustering
933
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
934
+ clusters = kmeans.fit_predict(X_scaled)
935
+
936
+ # Create visualization
937
+ if len(cluster_features) == 2:
938
+ fig = px.scatter(x=X[cluster_features[0]], y=X[cluster_features[1]],
939
+ color=clusters.astype(str),
940
+ title=f"K-Means Clustering (k={n_clusters})",
941
+ labels={'x': cluster_features[0], 'y': cluster_features[1],
942
+ 'color': 'Cluster'})
943
+ elif len(cluster_features) >= 3:
944
+ fig = px.scatter_3d(x=X[cluster_features[0]], y=X[cluster_features[1]],
945
+ z=X[cluster_features[2]], color=clusters.astype(str),
946
+ title=f"K-Means Clustering (k={n_clusters})",
947
+ labels={cluster_features[0]: cluster_features[0],
948
+ cluster_features[1]: cluster_features[1],
949
+ cluster_features[2]: cluster_features[2],
950
+ 'color': 'Cluster'})
951
+
952
+ fig.update_layout(height=600)
953
+ st.plotly_chart(fig, use_container_width=True)
954
+
955
+ # Cluster statistics
956
+ st.subheader("📊 Cluster Statistics")
957
+ X['Cluster'] = clusters
958
+ cluster_stats = X.groupby('Cluster')[cluster_features].mean()
959
+ st.dataframe(cluster_stats.style.format("{:.4f}"))
960
+
961
+ elif analysis_type == "Outlier Detection":
962
+ if len(numeric_cols) >= 2:
963
+ from sklearn.ensemble import IsolationForest
964
+
965
+ # Select features for outlier detection
966
+ outlier_features = st.multiselect("Select features for outlier detection",
967
+ numeric_cols,
968
+ default=numeric_cols[:min(3, len(numeric_cols))])
969
+
970
+ if len(outlier_features) >= 2:
971
+ contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01)
972
+
973
+ # Prepare data
974
+ X = df[outlier_features].dropna()
975
+
976
+ if len(X) > 0:
977
+ # Detect outliers
978
+ iso_forest = IsolationForest(contamination=contamination, random_state=42)
979
+ outliers = iso_forest.fit_predict(X)
980
+
981
+ # Create visualization
982
+ if len(outlier_features) == 2:
983
+ fig = px.scatter(x=X[outlier_features[0]], y=X[outlier_features[1]],
984
+ color=outliers,
985
+ color_continuous_scale=['blue', 'red'],
986
+ title=f"Outlier Detection (contamination={contamination})",
987
+ labels={'x': outlier_features[0], 'y': outlier_features[1],
988
+ 'color': 'Outlier'})
989
+ elif len(outlier_features) >= 3:
990
+ fig = px.scatter_3d(x=X[outlier_features[0]], y=X[outlier_features[1]],
991
+ z=X[outlier_features[2]], color=outliers,
992
+ color_continuous_scale=['blue', 'red'],
993
+ title=f"Outlier Detection (contamination={contamination})",
994
+ labels={outlier_features[0]: outlier_features[0],
995
+ outlier_features[1]: outlier_features[1],
996
+ outlier_features[2]: outlier_features[2],
997
+ 'color': 'Outlier'})
998
+
999
+ fig.update_layout(height=600)
1000
+ st.plotly_chart(fig, use_container_width=True)
1001
+
1002
+ # Outlier statistics
1003
+ n_outliers = (outliers == -1).sum()
1004
+ st.write(f"**Outliers detected:** {n_outliers} ({n_outliers/len(X)*100:.2f}%)")
1005
+
1006
+ elif analysis_type == "Trend Detection":
1007
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
1008
+
1009
+ if datetime_cols and numeric_cols:
1010
+ date_col = st.selectbox("Select date column", datetime_cols)
1011
+ value_col = st.selectbox("Select value column", numeric_cols)
1012
+
1013
+ # Prepare time series data
1014
+ ts_df = df[[date_col, value_col]].dropna().sort_values(date_col)
1015
+
1016
+ if len(ts_df) > 10:
1017
+ # Calculate moving averages
1018
+ window = st.slider("Moving average window", 2, 30, 7)
1019
+ ts_df['MA'] = ts_df[value_col].rolling(window=window).mean()
1020
+
1021
+ # Detect trend using linear regression
1022
+ from sklearn.linear_model import LinearRegression
1023
+
1024
+ X = np.arange(len(ts_df)).reshape(-1, 1)
1025
+ y = ts_df[value_col].values
1026
+
1027
+ model = LinearRegression()
1028
+ model.fit(X, y)
1029
+ trend = model.predict(X)
1030
+
1031
+ # Create visualization
1032
+ fig = go.Figure()
1033
+ fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df[value_col],
1034
+ mode='lines', name='Original'))
1035
+ fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df['MA'],
1036
+ mode='lines', name=f'{window}-period MA',
1037
+ line=dict(color='orange')))
1038
+ fig.add_trace(go.Scatter(x=ts_df[date_col], y=trend,
1039
+ mode='lines', name='Linear Trend',
1040
+ line=dict(color='red', dash='dash')))
1041
+
1042
+ fig.update_layout(title="Trend Detection",
1043
+ xaxis_title="Date",
1044
+ yaxis_title=value_col,
1045
+ height=500)
1046
+ st.plotly_chart(fig, use_container_width=True)
1047
+
1048
+ # Trend statistics
1049
+ slope = model.coef_[0]
1050
+ st.write(f"**Trend slope:** {slope:.4f} units per time step")
1051
+ if slope > 0:
1052
+ st.success("✅ Upward trend detected")
1053
+ elif slope < 0:
1054
+ st.warning("⚠�� Downward trend detected")
1055
+ else:
1056
+ st.info("ℹ️ No clear trend detected")
1057
+
1058
+ elif analysis_type == "Seasonal Patterns":
1059
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
1060
+
1061
+ if datetime_cols and numeric_cols:
1062
+ date_col = st.selectbox("Select date column", datetime_cols, key="seasonal_date")
1063
+ value_col = st.selectbox("Select value column", numeric_cols, key="seasonal_value")
1064
+
1065
+ # Extract seasonal components
1066
+ df_temp = df[[date_col, value_col]].dropna()
1067
+ df_temp['year'] = pd.DatetimeIndex(df_temp[date_col]).year
1068
+ df_temp['month'] = pd.DatetimeIndex(df_temp[date_col]).month
1069
+ df_temp['quarter'] = pd.DatetimeIndex(df_temp[date_col]).quarter
1070
+ df_temp['dayofweek'] = pd.DatetimeIndex(df_temp[date_col]).dayofweek
1071
+
1072
+ # Create seasonal visualizations
1073
+ fig = make_subplots(rows=2, cols=2,
1074
+ subplot_titles=("Year-over-Year", "Monthly Pattern",
1075
+ "Quarterly Pattern", "Day of Week Pattern"),
1076
+ specs=[[{"type": "xy"}, {"type": "xy"}],
1077
+ [{"type": "xy"}, {"type": "xy"}]])
1078
+
1079
+ # Year-over-Year
1080
+ yearly_avg = df_temp.groupby('year')[value_col].mean().reset_index()
1081
+ fig.add_trace(go.Scatter(x=yearly_avg['year'], y=yearly_avg[value_col],
1082
+ mode='lines+markers', name="Yearly Avg"), row=1, col=1)
1083
+
1084
+ # Monthly pattern
1085
+ monthly_avg = df_temp.groupby('month')[value_col].mean().reset_index()
1086
+ month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
1087
+ 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
1088
+ fig.add_trace(go.Bar(x=[month_names[m-1] for m in monthly_avg['month']],
1089
+ y=monthly_avg[value_col], name="Monthly Avg"), row=1, col=2)
1090
+
1091
+ # Quarterly pattern
1092
+ quarterly_avg = df_temp.groupby('quarter')[value_col].mean().reset_index()
1093
+ quarter_names = ['Q1', 'Q2', 'Q3', 'Q4']
1094
+ fig.add_trace(go.Bar(x=[quarter_names[q-1] for q in quarterly_avg['quarter']],
1095
+ y=quarterly_avg[value_col], name="Quarterly Avg"), row=2, col=1)
1096
+
1097
+ # Day of week pattern
1098
+ dow_avg = df_temp.groupby('dayofweek')[value_col].mean().reset_index()
1099
+ day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
1100
+ fig.add_trace(go.Bar(x=[day_names[d] for d in dow_avg['dayofweek']],
1101
+ y=dow_avg[value_col], name="Day of Week Avg"), row=2, col=2)
1102
+
1103
+ fig.update_layout(height=800, title_text="Seasonal Pattern Analysis")
1104
+ st.plotly_chart(fig, use_container_width=True)
1105
+
1106
+ except Exception as e:
1107
+ st.error(f"❌ Error in pattern discovery: {str(e)}")
1108
+ st.info("💡 Tip: Ensure you have sufficient data for pattern detection")
1109
+
1110
+ st.markdown('</div>', unsafe_allow_html=True)
1111
+
1112
+ except Exception as e:
1113
+ st.error(f"❌ Critical error in EDA: {str(e)}")
1114
+ st.info("💡 Please check your dataset and try again")
1115
+
1116
+ # Export options
1117
+ st.markdown("---")
1118
+ st.markdown("### 📥 Export EDA Report")
1119
+
1120
+ try:
1121
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
1122
+
1123
+ report_text = f"""
1124
+ EXPLORATORY DATA ANALYSIS REPORT
1125
+ =================================
1126
+
1127
+ Dataset Information:
1128
+ • Total Rows: {df.shape[0]:,}
1129
+ • Total Columns: {df.shape[1]}
1130
+ • Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB
1131
+
1132
+ Column Types:
1133
+ • Numeric: {len(numeric_cols)}
1134
+ • Categorical: {len(df.select_dtypes(include=['object', 'category']).columns)}
1135
+ • Datetime: {len(df.select_dtypes(include=['datetime64']).columns)}
1136
+
1137
+ Data Quality:
1138
+ • Missing Values: {df.isnull().sum().sum():,}
1139
+ • Complete Cases: {df.dropna().shape[0]:,}
1140
+ • Duplicate Rows: {df.duplicated().sum():,}
1141
+
1142
+ Analysis Performed:
1143
+ • Data Overview
1144
+ • Missing Data Analysis
1145
+ • Univariate Analysis
1146
+ • Bivariate Analysis
1147
+ • Multivariate Analysis
1148
+ • Pattern Discovery
1149
+ """
1150
+
1151
+ st.download_button(
1152
+ label="📥 Download EDA Report",
1153
+ data=report_text,
1154
+ file_name="eda_report.txt",
1155
+ mime="text/plain",
1156
+ use_container_width=True
1157
+ )
1158
+ except Exception as e:
1159
+ st.error(f"❌ Error generating report: {str(e)}")
explainability.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from sklearn.inspection import permutation_importance
7
+ import matplotlib.pyplot as plt
8
+ import shap
9
+
10
def explain_model(model, X, y=None, feature_names=None):
    """
    Render a Streamlit UI that explains a fitted model's predictions.

    Three tabs are shown:
      1. Feature Importance - built-in ``feature_importances_`` or
         permutation importance (the latter requires ``y``).
      2. SHAP Values - summary plot plus a per-sample waterfall plot,
         computed on at most 100 rows for speed.
      3. Partial Dependence - PDP and optional ICE curves for one feature.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` (and optionally
        ``feature_importances_``).
    X : pandas DataFrame (or 2-D array) of the model's input features.
    y : array-like, optional
        Target values; needed only for permutation importance.
    feature_names : sequence of str, optional
        Defaults to ``X.columns`` when available, otherwise generic labels.
    """
    st.subheader("🔍 Model Explainability")

    if feature_names is None:
        feature_names = X.columns if hasattr(X, 'columns') else [f"Feature {i}" for i in range(X.shape[1])]

    # Create tabs for different explanation methods
    tab1, tab2, tab3 = st.tabs(["Feature Importance", "SHAP Values", "Partial Dependence"])

    with tab1:
        st.markdown("### 📊 Feature Importance")

        # Method selection
        method = st.radio(
            "Importance method",
            ["Built-in", "Permutation"],
            horizontal=True
        )

        if method == "Built-in":
            if hasattr(model, 'feature_importances_'):
                importance = model.feature_importances_
                importance_df = pd.DataFrame({
                    'feature': feature_names,
                    'importance': importance
                }).sort_values('importance', ascending=False)

                fig = px.bar(importance_df.head(20), x='importance', y='feature',
                             orientation='h', title="Feature Importance (Built-in)")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Model doesn't have built-in feature importance")

        else:  # Permutation importance
            if y is not None:
                with st.spinner("Calculating permutation importance..."):
                    # random_state makes the shuffles (and thus the chart)
                    # reproducible across Streamlit reruns.
                    perm_importance = permutation_importance(model, X, y, n_repeats=10,
                                                             random_state=42)

                importance_df = pd.DataFrame({
                    'feature': feature_names,
                    'importance': perm_importance.importances_mean,
                    'std': perm_importance.importances_std
                }).sort_values('importance', ascending=False)

                fig = go.Figure()
                fig.add_trace(go.Bar(
                    x=importance_df['importance'].head(20),
                    y=importance_df['feature'].head(20),
                    orientation='h',
                    error_x=dict(
                        type='data',
                        array=importance_df['std'].head(20),
                        visible=True
                    )
                ))
                fig.update_layout(title="Permutation Importance (with error bars)",
                                  xaxis_title="Importance")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Need target values for permutation importance")

    with tab2:
        st.markdown("### 📈 SHAP Values")

        if hasattr(model, 'predict'):
            with st.spinner("Calculating SHAP values (this may take a moment)..."):
                try:
                    # Work on a subset of rows throughout for speed.
                    X_sample = X[:100]

                    # Create explainer based on model type
                    if str(type(model)).find('sklearn') != -1:
                        explainer = shap.Explainer(model, X_sample)
                    else:
                        explainer = shap.TreeExplainer(model) if hasattr(model, 'feature_importances_') else shap.Explainer(model, X_sample)

                    # Calculate SHAP values
                    shap_values = explainer(X_sample)

                    # Summary plot
                    st.markdown("#### SHAP Summary Plot")
                    fig, ax = plt.subplots()
                    shap.summary_plot(shap_values, X_sample, feature_names=feature_names, show=False)
                    st.pyplot(fig)
                    # Close this figure specifically; plt.close() only closes
                    # the "current" figure, which SHAP may have replaced.
                    plt.close(fig)

                    # Waterfall plot for a single prediction
                    st.markdown("#### Single Prediction Explanation")
                    sample_idx = st.slider("Select sample index", 0, min(99, len(X)-1), 0)

                    fig, ax = plt.subplots()
                    shap.waterfall_plot(shap_values[sample_idx], show=False)
                    st.pyplot(fig)
                    plt.close(fig)

                except Exception as e:
                    st.error(f"Error calculating SHAP values: {str(e)}")
                    st.info("Try using a smaller sample or a different model type")
        else:
            st.warning("Model doesn't support prediction")

    with tab3:
        st.markdown("### 📉 Partial Dependence Plots")

        if hasattr(model, 'predict') and len(feature_names) > 0:
            from sklearn.inspection import partial_dependence

            selected_feature = st.selectbox("Select feature for PDP", feature_names)

            if selected_feature:
                feature_idx = list(feature_names).index(selected_feature)

                # Calculate partial dependence
                pdp = partial_dependence(model, X, [feature_idx], grid_resolution=50)
                # sklearn 1.1 renamed the grid key from 'values' to
                # 'grid_values' (and removed 'values' in 1.3) - support both.
                grid = pdp['grid_values'][0] if 'grid_values' in pdp else pdp['values'][0]

                # Create plot
                fig = go.Figure()
                fig.add_trace(go.Scatter(
                    x=grid,
                    y=pdp['average'][0],
                    mode='lines+markers',
                    name='Partial Dependence'
                ))

                fig.update_layout(
                    title=f"Partial Dependence Plot for {selected_feature}",
                    xaxis_title=selected_feature,
                    yaxis_title="Prediction"
                )

                st.plotly_chart(fig, use_container_width=True)

                # Individual conditional expectation (ICE) plots
                if st.checkbox("Show ICE plots"):
                    # kind='individual' evaluates every sample on one shared
                    # grid. (Calling partial_dependence per single row, as the
                    # original did, builds a 1-point grid from that row alone,
                    # so the curves could not line up with the PDP x-axis.)
                    n_ice = min(10, X.shape[0])  # show up to 10 lines
                    X_ice = X.iloc[:n_ice] if hasattr(X, 'iloc') else X[:n_ice]
                    ice = partial_dependence(model, X_ice, [feature_idx],
                                             grid_resolution=50, kind='individual')
                    ice_grid = ice['grid_values'][0] if 'grid_values' in ice else ice['values'][0]

                    fig = go.Figure()
                    for i, curve in enumerate(ice['individual'][0]):
                        fig.add_trace(go.Scatter(
                            x=ice_grid,
                            y=curve,
                            mode='lines',
                            name=f'Sample {i}',
                            line=dict(width=1, color='lightgray')
                        ))

                    # Add average line
                    fig.add_trace(go.Scatter(
                        x=grid,
                        y=pdp['average'][0],
                        mode='lines',
                        name='Average',
                        line=dict(width=3, color='red')
                    ))

                    fig.update_layout(
                        title=f"ICE Plots for {selected_feature}",
                        xaxis_title=selected_feature,
                        yaxis_title="Prediction"
                    )

                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("Need more features for partial dependence plots")
insights.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from scipy import stats
7
+
8
def generate_business_insights(df):
    """Render the "Automated Business Insights" Streamlit page for *df*.

    Builds five tabs - Overview, Trends & Patterns, Key Drivers, Anomalies
    and Recommendations - each computing lightweight statistics on the
    DataFrame and displaying Plotly charts and Streamlit widgets. The last
    tab also offers the collected recommendations as a downloadable report.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to analyze.

    NOTE(review): assumes ``df`` is non-empty; several expressions divide by
    ``df.shape[0]`` and would raise ZeroDivisionError on an empty frame.
    """

    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>💡 Automated Business Insights</h2>
        <p style='color: gray;'>AI-powered analysis to uncover hidden patterns and opportunities</p>
    </div>
    """, unsafe_allow_html=True)

    # Get column types (these lists drive every tab below)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    # Create tabs for different insight categories
    tab1, tab2, tab3, tab4, tab5 = st.tabs([
        "📊 Overview", "📈 Trends & Patterns", "🎯 Key Drivers",
        "⚠️ Anomalies", "💡 Recommendations"
    ])

    with tab1:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Dataset Overview")

        # Key metrics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Total Records", f"{df.shape[0]:,}")
        with col2:
            st.metric("Total Features", df.shape[1])
        with col3:
            # `completeness` is reused by the export report in tab5;
            # Streamlit `with` blocks always execute, so it is defined there.
            completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
            st.metric("Data Completeness", f"{completeness:.1f}%")
        with col4:
            if numeric_cols:
                total_value = df[numeric_cols].sum().sum()
                st.metric("Total Value", f"{total_value:,.0f}" if total_value < 1e6 else f"{total_value/1e6:,.1f}M")

        # Column composition
        st.markdown("### 📋 Column Composition")

        comp_data = {
            'Type': ['Numeric', 'Categorical', 'Datetime'],
            'Count': [len(numeric_cols), len(categorical_cols), len(datetime_cols)]
        }

        fig = px.pie(comp_data, values='Count', names='Type',
                     title="Column Type Distribution",
                     color_discrete_sequence=px.colors.qualitative.Set3)
        st.plotly_chart(fig, use_container_width=True)

        # Data quality score
        st.markdown("### 📊 Data Quality Score")

        # NOTE(review): quality_score is never read afterwards (dead variable);
        # the gauge uses avg_quality computed from quality_metrics instead.
        quality_score = 0
        quality_metrics = []

        # Completeness score
        completeness_score = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
        quality_metrics.append(completeness_score)

        # Uniqueness score (avoid duplicates)
        duplicate_pct = (df.duplicated().sum() / df.shape[0]) * 100
        uniqueness_score = 100 - duplicate_pct
        quality_metrics.append(uniqueness_score)

        # Consistency score (data type consistency)
        # NOTE(review): this loop never changes type_consistency - every
        # branch is a no-op, so the consistency score is always 100.
        type_consistency = 100  # Default high
        for col in df.columns:
            if df[col].dtype == 'object':
                # Check if column has mixed types
                try:
                    pd.to_numeric(df[col], errors='raise')
                    # If convertible to numeric, it's consistent
                except:
                    pass  # Object type is fine
            else:
                # Numeric columns are consistent
                pass
        quality_metrics.append(type_consistency)

        # Average quality score
        avg_quality = np.mean(quality_metrics)

        # Display gauge chart
        fig = go.Figure(go.Indicator(
            mode = "gauge+number",
            value = avg_quality,
            domain = {'x': [0, 1], 'y': [0, 1]},
            title = {'text': "Overall Data Quality"},
            gauge = {
                'axis': {'range': [None, 100]},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [0, 50], 'color': "lightgray"},
                    {'range': [50, 80], 'color': "gray"},
                    {'range': [80, 100], 'color': "lightgreen"}],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 90}}))

        st.plotly_chart(fig, use_container_width=True)

        st.markdown('</div>', unsafe_allow_html=True)

    with tab2:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Trends & Patterns")

        if len(numeric_cols) > 0:
            # Correlation analysis
            if len(numeric_cols) >= 2:
                st.markdown("### 🔗 Key Relationships")

                corr_matrix = df[numeric_cols].corr()

                # Find strongest correlations (upper triangle only, no self-pairs)
                corr_pairs = []
                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        corr_pairs.append({
                            'feature1': numeric_cols[i],
                            'feature2': numeric_cols[j],
                            'correlation': corr_matrix.iloc[i, j]
                        })

                corr_pairs.sort(key=lambda x: abs(x['correlation']), reverse=True)

                # Display top 5 correlations
                for pair in corr_pairs[:5]:
                    strength = abs(pair['correlation'])
                    if strength > 0.7:
                        emoji = "🟢"
                        desc = "Strong"
                    elif strength > 0.3:
                        emoji = "🟡"
                        desc = "Moderate"
                    else:
                        emoji = "🔴"
                        desc = "Weak"

                    direction = "positive" if pair['correlation'] > 0 else "negative"

                    st.markdown(
                        f"{emoji} **{pair['feature1']}** & **{pair['feature2']}**: "
                        f"{pair['correlation']:.3f} ({desc} {direction} correlation)"
                    )

                # Insight
                if corr_pairs:
                    st.info(f"💡 **Insight**: {corr_pairs[0]['feature1']} and {corr_pairs[0]['feature2']} "
                            f"have the strongest {'positive' if corr_pairs[0]['correlation'] > 0 else 'negative'} "
                            f"relationship in the dataset.")

            # Distribution insights
            st.markdown("### 📊 Distribution Analysis")

            # |skew| > 1 is the conventional threshold for "highly skewed"
            skewness = df[numeric_cols].skew()
            skewed_cols = skewness[abs(skewness) > 1].index.tolist()

            if skewed_cols:
                st.warning(f"⚠️ **Skewed Features**: {', '.join(skewed_cols[:3])}" +
                           (" and more" if len(skewed_cols) > 3 else ""))
                st.markdown("💡 These features might benefit from transformation for better model performance.")

                # Show distribution of most skewed feature
                if skewed_cols:
                    col_to_show = skewed_cols[0]
                    fig = px.histogram(df, x=col_to_show, nbins=30,
                                       title=f"Distribution of {col_to_show} (Most Skewed)",
                                       marginal="box")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.info("No numeric columns available for trend analysis")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab3:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🎯 Key Business Drivers")

        if len(numeric_cols) > 0:
            # Find features with highest variance (potential impact)
            variances = df[numeric_cols].var().sort_values(ascending=False)

            st.markdown("### 📊 High Variance Features")
            st.markdown("Features with high variance often indicate key business drivers")

            fig = px.bar(x=variances.index[:10], y=variances.values[:10],
                         title="Top 10 Features by Variance",
                         labels={'x': 'Feature', 'y': 'Variance'})
            st.plotly_chart(fig, use_container_width=True)

            # Feature importance based on mutual information
            if len(numeric_cols) >= 2:
                st.markdown("### 🔍 Predictive Power")

                # Use last numeric column as potential target
                # NOTE(review): heuristic only - the last column is not
                # necessarily the business target.
                target = numeric_cols[-1]
                features = numeric_cols[:-1]

                if len(features) > 0:
                    from sklearn.feature_selection import mutual_info_regression

                    # fillna(0) keeps mutual_info_regression from failing on NaNs
                    mi_scores = mutual_info_regression(df[features].fillna(0), df[target].fillna(0))
                    mi_df = pd.DataFrame({
                        'feature': features,
                        'importance': mi_scores
                    }).sort_values('importance', ascending=False)

                    fig = px.bar(mi_df.head(10), x='importance', y='feature',
                                 orientation='h',
                                 title=f"Feature Importance for Predicting {target}")
                    st.plotly_chart(fig, use_container_width=True)

                    st.info(f"💡 **Key Driver**: {mi_df.iloc[0]['feature']} appears to be the most "
                            f"important factor for predicting {target}")
        else:
            st.info("No numeric columns available for driver analysis")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab4:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("⚠️ Anomaly Detection")

        if len(numeric_cols) > 0:
            # Outlier detection using IQR (Tukey's 1.5*IQR fences)
            outlier_report = []

            for col in numeric_cols:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1

                outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
                outlier_pct = (len(outliers) / len(df)) * 100

                # Only flag columns where more than 5% of rows are outliers
                if outlier_pct > 5:
                    outlier_report.append({
                        'column': col,
                        'outlier_pct': outlier_pct,
                        'lower_bound': Q1 - 1.5 * IQR,
                        'upper_bound': Q3 + 1.5 * IQR
                    })

            if outlier_report:
                st.warning(f"⚠️ Found {len(outlier_report)} columns with significant outliers")

                for item in outlier_report[:5]:
                    st.markdown(f"**{item['column']}**: {item['outlier_pct']:.1f}% outliers "
                                f"(outside [{item['lower_bound']:.2f}, {item['upper_bound']:.2f}])")

                # Visualize outliers for first column
                col_to_show = outlier_report[0]['column']
                fig = px.box(df, y=col_to_show, title=f"Outliers in {col_to_show}")
                st.plotly_chart(fig, use_container_width=True)

                st.markdown("💡 **Recommendation**: Investigate these outliers - they may represent "
                            "unusual but important business events or data quality issues.")
            else:
                st.success("✅ No significant outliers detected in numeric columns")
        else:
            st.info("No numeric columns available for outlier detection")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab5:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("💡 Strategic Recommendations")

        # Generate business recommendations based on data insights
        recommendations = []

        if len(numeric_cols) > 0:
            # Check for growth opportunities (non-negative columns whose max
            # exceeds 10x their min are treated as "high variability")
            growth_cols = []
            for col in numeric_cols:
                if df[col].min() >= 0 and df[col].max() > df[col].min() * 10:
                    growth_cols.append(col)

            if growth_cols:
                recommendations.append({
                    'area': 'Growth Opportunity',
                    'recommendation': f"Focus on {growth_cols[0]} which shows high variability "
                                      f"(range: {df[growth_cols[0]].min():.2f} to {df[growth_cols[0]].max():.2f})",
                    'priority': 'High'
                })

            # Check for efficiency opportunities
            if len(numeric_cols) >= 2:
                # Find features with high correlation - potential redundancy
                corr_matrix = df[numeric_cols].corr()
                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        if abs(corr_matrix.iloc[i, j]) > 0.9:
                            recommendations.append({
                                'area': 'Efficiency',
                                'recommendation': f"Consider consolidating {numeric_cols[i]} and {numeric_cols[j]} "
                                                  f"as they are highly correlated ({corr_matrix.iloc[i, j]:.2f})",
                                'priority': 'Medium'
                            })
                            # at most one pairing per row of the matrix
                            break
                    # cap the number of efficiency recommendations
                    if len(recommendations) > 3:
                        break

        if categorical_cols:
            # Check for customer/market segments (only low-cardinality columns)
            for col in categorical_cols[:2]:
                if df[col].nunique() > 1 and df[col].nunique() <= 10:
                    top_segment = df[col].value_counts().index[0]
                    recommendations.append({
                        'area': 'Segmentation',
                        'recommendation': f"Target the dominant segment in {col}: '{top_segment}' "
                                          f"({df[col].value_counts().iloc[0]:,} records)",
                        'priority': 'Medium'
                    })

        # Display recommendations
        if recommendations:
            for rec in recommendations:
                priority_color = "🔴" if rec['priority'] == 'High' else "🟡" if rec['priority'] == 'Medium' else "🟢"
                st.markdown(f"{priority_color} **{rec['area']}**: {rec['recommendation']}")
        else:
            st.info("No specific recommendations generated. Try uploading a dataset with more variety.")

        # Add download insights option
        st.markdown("---")
        st.markdown("### 📥 Export Insights")

        # Plain-text report; `completeness` comes from tab1 above.
        insight_text = f"""
BUSINESS INSIGHTS REPORT
=======================

Dataset: {df.shape[0]} rows × {df.shape[1]} columns

KEY METRICS:
• Total Records: {df.shape[0]:,}
• Total Features: {df.shape[1]}
• Data Completeness: {completeness:.1f}%

COLUMN COMPOSITION:
• Numeric: {len(numeric_cols)}
• Categorical: {len(categorical_cols)}
• Datetime: {len(datetime_cols)}

RECOMMENDATIONS:
"""

        for rec in recommendations:
            insight_text += f"\n• {rec['area']}: {rec['recommendation']} (Priority: {rec['priority']})"

        st.download_button(
            label="📥 Download Insights Report",
            data=insight_text,
            file_name="business_insights.txt",
            mime="text/plain",
            use_container_width=True
        )

        st.markdown('</div>', unsafe_allow_html=True)
ml_pipeline.py ADDED
@@ -0,0 +1,940 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
5
+ from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
6
+ from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
7
+ confusion_matrix, classification_report, roc_curve, auc,
8
+ mean_squared_error, r2_score, mean_absolute_error)
9
+ from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
10
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
11
+ from sklearn.svm import SVC, SVR
12
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
13
+ from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
14
+ GradientBoostingClassifier, GradientBoostingRegressor,
15
+ AdaBoostClassifier, AdaBoostRegressor,
16
+ VotingClassifier, VotingRegressor)
17
+ from xgboost import XGBClassifier, XGBRegressor
18
+ from lightgbm import LGBMClassifier, LGBMRegressor
19
+ import plotly.express as px
20
+ import plotly.graph_objects as go
21
+ from plotly.subplots import make_subplots
22
+ import time
23
+ import warnings
24
+ warnings.filterwarnings('ignore')
25
+
26
class MLPipelineError(Exception):
    """Custom exception for ML pipeline errors.

    NOTE(review): declared for use by this module's callers; it is not
    raised anywhere in the code visible here.
    """
    pass
29
+
30
def validate_ml_data(df, target, features):
    """Validate a dataset for machine learning.

    Checks for an empty frame, a missing target column, missing feature
    columns, too few rows, constant features, and a degenerate or
    high-cardinality classification target.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to validate.
    target : str
        Name of the intended target column.
    features : list of str
        Names of the intended feature columns.

    Returns
    -------
    list of str
        Human-readable issue descriptions; empty when the data looks usable.
    """
    issues = []

    if df.empty:
        issues.append("Dataset is empty")
        return issues

    if target not in df.columns:
        issues.append(f"Target column '{target}' not found in dataset")

    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        issues.append(f"Features not found: {missing_features}")

    # Check for sufficient data
    if df.shape[0] < 10:
        issues.append("Dataset too small (minimum 10 rows required)")

    # Check for constant columns.
    # Bug fix: only inspect features that actually exist - the original
    # indexed df[col] for every requested feature and raised KeyError on a
    # missing feature instead of returning the issue list it had just built.
    for col in features:
        if col in df.columns and df[col].nunique() == 1:
            issues.append(f"Feature '{col}' is constant")

    # Check target for classification (object/category dtype, or numeric
    # with <= 20 distinct values, is treated as a classification target)
    if target in df.columns:
        if df[target].dtype in ['object', 'category'] or df[target].nunique() <= 20:
            if df[target].nunique() == 1:
                issues.append("Target has only one class")
            elif df[target].nunique() > 50:
                issues.append(f"Target has {df[target].nunique()} classes, which may cause issues")

    return issues
63
+
64
def safe_ml_operation(func, *args, **kwargs):
    """Run *func* with the given arguments, converting exceptions to messages.

    Returns a ``(result, error)`` tuple: on success ``error`` is ``None``;
    on failure ``result`` is ``None`` and ``error`` is a human-readable
    string describing what went wrong.
    """
    try:
        return func(*args, **kwargs), None
    except ValueError as exc:
        # Bad data types / values are the most common ML failure mode.
        return None, f"Value Error: {exc}. Check your data types and values."
    except MemoryError:
        return None, "Memory Error: Dataset too large. Try reducing the number of features or using a sample."
    except Exception as exc:
        # Catch-all boundary: surface anything else as a generic ML error.
        return None, f"ML Error: {exc}"
78
+
79
+ def run_ml_pipeline(df):
80
+ """
81
+ Enhanced machine learning pipeline with comprehensive error handling
82
+ """
83
+ st.markdown("""
84
+ <div style='text-align: center; margin-bottom: 2rem;'>
85
+ <h2>🤖 Advanced Machine Learning Pipeline</h2>
86
+ <p style='color: gray;'>Train, evaluate, and compare multiple ML models with automatic error handling</p>
87
+ </div>
88
+ """, unsafe_allow_html=True)
89
+
90
+ try:
91
+ # Check if dataset is suitable for ML
92
+ if df.shape[0] < 10:
93
+ st.error("❌ Dataset too small for machine learning (need at least 10 rows)")
94
+ return
95
+
96
+ # Create tabs for different ML stages
97
+ tab1, tab2, tab3, tab4, tab5 = st.tabs([
98
+ "⚙️ Configuration",
99
+ "📊 Model Training",
100
+ "📈 Model Evaluation",
101
+ "🔮 Predictions",
102
+ "📋 ML Report"
103
+ ])
104
+
105
+ with tab1:
106
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
107
+ st.subheader("⚙️ Model Configuration")
108
+
109
+ try:
110
+ # Target selection with validation
111
+ st.markdown("### 🎯 Target Variable")
112
+
113
+ # Auto-detect potential target columns
114
+ potential_targets = []
115
+ target_types = {}
116
+
117
+ for col in df.columns:
118
+ try:
119
+ if df[col].dtype in ['int64', 'float64']:
120
+ if df[col].nunique() <= 20:
121
+ potential_targets.append(col)
122
+ target_types[col] = "Classification (low cardinality)"
123
+ else:
124
+ potential_targets.append(col)
125
+ target_types[col] = "Regression"
126
+ elif df[col].dtype in ['object', 'category']:
127
+ if df[col].nunique() <= 50:
128
+ potential_targets.append(col)
129
+ target_types[col] = f"Classification ({df[col].nunique()} classes)"
130
+ except Exception as e:
131
+ st.warning(f"⚠️ Couldn't analyze column {col}: {str(e)}")
132
+
133
+ if not potential_targets:
134
+ st.error("❌ No suitable target columns found. Need numeric or categorical columns with reasonable cardinality.")
135
+ return
136
+
137
+ target = st.selectbox(
138
+ "Select target column",
139
+ potential_targets,
140
+ help=f"Column types: {target_types}"
141
+ )
142
+
143
+ # Task type detection
144
+ if df[target].dtype in ['object', 'category'] or df[target].nunique() <= 20:
145
+ task_type = "Classification"
146
+ unique_values = df[target].nunique()
147
+
148
+ if unique_values == 2:
149
+ st.success("✅ **Binary Classification** problem detected")
150
+ elif unique_values <= 10:
151
+ st.info(f"📊 **Multi-class Classification** with {unique_values} classes")
152
+ else:
153
+ st.warning(f"⚠️ **Multi-class Classification** with {unique_values} classes - may be challenging")
154
+
155
+ # Check class balance
156
+ class_dist = df[target].value_counts(normalize=True)
157
+ if class_dist.min() < 0.1:
158
+ st.warning("⚠️ Class imbalance detected. Consider using class weights or resampling.")
159
+ else:
160
+ task_type = "Regression"
161
+ st.info("📈 **Regression** task detected")
162
+
163
+ # Check target distribution
164
+ target_skew = df[target].skew()
165
+ if abs(target_skew) > 1:
166
+ st.warning(f"⚠️ Target variable is highly skewed (skewness: {target_skew:.2f}). Consider log transformation.")
167
+
168
+ # Feature selection
169
+ st.markdown("### 🔍 Feature Selection")
170
+
171
+ # Auto-select features (exclude target)
172
+ all_features = [col for col in df.columns if col != target]
173
+
174
+ # Remove problematic columns
175
+ problematic_cols = []
176
+ for col in all_features:
177
+ try:
178
+ if df[col].nunique() == 1:
179
+ problematic_cols.append(col)
180
+ elif df[col].isnull().sum() > len(df) * 0.5:
181
+ problematic_cols.append(col)
182
+ except:
183
+ problematic_cols.append(col)
184
+
185
+ if problematic_cols:
186
+ st.warning(f"⚠️ Problematic columns detected (will be excluded): {problematic_cols}")
187
+ all_features = [f for f in all_features if f not in problematic_cols]
188
+
189
+ if not all_features:
190
+ st.error("❌ No valid features remaining after filtering.")
191
+ return
192
+
193
+ # Select features
194
+ selected_features = st.multiselect(
195
+ "Choose features for modeling",
196
+ all_features,
197
+ default=all_features[:min(10, len(all_features))],
198
+ help="Select the columns to use as features. Using too many features may cause overfitting."
199
+ )
200
+
201
+ if not selected_features:
202
+ st.warning("⚠️ Please select at least one feature")
203
+ return
204
+
205
+ # Validate selected features
206
+ validation_issues = validate_ml_data(df, target, selected_features)
207
+ if validation_issues:
208
+ for issue in validation_issues:
209
+ st.warning(f"⚠️ {issue}")
210
+
211
+ # Data preprocessing options
212
+ st.markdown("### 🛠️ Preprocessing Options")
213
+
214
+ col1, col2 = st.columns(2)
215
+ with col1:
216
+ test_size = st.slider("Test set size (%)", 10, 40, 20, 5) / 100
217
+ scaler_option = st.selectbox("Feature scaling", ["None", "StandardScaler", "MinMaxScaler"])
218
+
219
+ with col2:
220
+ cv_folds = st.slider("Cross-validation folds", 2, 10, 5)
221
+ if task_type == "Classification":
222
+ handle_imbalance = st.checkbox("Handle class imbalance", value=False,
223
+ help="Use class weights or sampling techniques")
224
+ else:
225
+ handle_imbalance = False
226
+
227
+ # Model selection based on task type
228
+ st.markdown("### 🤖 Model Selection")
229
+
230
+ if task_type == "Classification":
231
+ models = {
232
+ "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
233
+ "K-Nearest Neighbors": KNeighborsClassifier(),
234
+ "Decision Tree": DecisionTreeClassifier(random_state=42),
235
+ "Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
236
+ "Gradient Boosting": GradientBoostingClassifier(random_state=42),
237
+ "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
238
+ "LightGBM": LGBMClassifier(verbose=-1, random_state=42),
239
+ "AdaBoost": AdaBoostClassifier(random_state=42),
240
+ "SVM": SVC(probability=True, random_state=42)
241
+ }
242
+
243
+ # Default models for quick selection
244
+ default_models = ["Logistic Regression", "Random Forest", "XGBoost"]
245
+ else: # Regression
246
+ models = {
247
+ "Linear Regression": LinearRegression(),
248
+ "Ridge Regression": Ridge(random_state=42),
249
+ "Lasso Regression": Lasso(random_state=42),
250
+ "Decision Tree": DecisionTreeRegressor(random_state=42),
251
+ "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
252
+ "Gradient Boosting": GradientBoostingRegressor(random_state=42),
253
+ "XGBoost": XGBRegressor(random_state=42),
254
+ "LightGBM": LGBMRegressor(verbose=-1, random_state=42),
255
+ "AdaBoost": AdaBoostRegressor(random_state=42),
256
+ "SVR": SVR()
257
+ }
258
+
259
+ default_models = ["Linear Regression", "Random Forest", "XGBoost"]
260
+
261
+ selected_models = st.multiselect(
262
+ "Choose models to train",
263
+ list(models.keys()),
264
+ default=default_models,
265
+ help="Select multiple models to compare performance"
266
+ )
267
+
268
+ if not selected_models:
269
+ st.warning("⚠️ Please select at least one model")
270
+ return
271
+
272
+ # Advanced options
273
+ with st.expander("⚡ Advanced Options"):
274
+ do_tuning = st.checkbox("Perform hyperparameter tuning", value=False,
275
+ help="Grid search for best parameters (may be slow)")
276
+
277
+ if do_tuning:
278
+ tuning_folds = st.slider("Tuning CV folds", 2, 5, 3)
279
+ max_tuning_iter = st.slider("Max tuning iterations per model", 5, 50, 20)
280
+
281
+ use_sampling = st.checkbox("Use data sampling (for large datasets)", value=False,
282
+ help="Use a sample for faster experimentation")
283
+
284
+ if use_sampling:
285
+ sample_size = st.slider("Sample size (%)", 10, 100, 100, 10) / 100
286
+
287
+ random_state = st.number_input("Random seed", value=42, min_value=0, max_value=999)
288
+
289
+ st.markdown('</div>', unsafe_allow_html=True)
290
+
291
+ # Store configuration in session state
292
+ st.session_state['ml_config'] = {
293
+ 'target': target,
294
+ 'features': selected_features,
295
+ 'task_type': task_type,
296
+ 'test_size': test_size,
297
+ 'scaler': scaler_option,
298
+ 'cv_folds': cv_folds,
299
+ 'handle_imbalance': handle_imbalance,
300
+ 'models': {name: models[name] for name in selected_models},
301
+ 'do_tuning': do_tuning,
302
+ 'random_state': random_state
303
+ }
304
+
305
+ except Exception as e:
306
+ st.error(f"❌ Error in configuration: {str(e)}")
307
+ st.info("💡 Tip: Check your data types and ensure all columns are valid")
308
+ return
309
+
310
+ with tab2:
311
+ if 'ml_config' not in st.session_state:
312
+ st.info("ℹ️ Please configure your model in the 'Configuration' tab first")
313
+ return
314
+
315
+ if st.button("🚀 Start Training", use_container_width=True, type="primary"):
316
+ try:
317
+ config = st.session_state['ml_config']
318
+
319
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
320
+
321
+ # Prepare data with error handling
322
+ with st.spinner("📊 Preparing data..."):
323
+ try:
324
+ X = df[config['features']].copy()
325
+ y = df[config['target']].copy()
326
+
327
+ # Handle missing values
328
+ if X.isnull().sum().sum() > 0:
329
+ st.info(f"⚠️ Handling {X.isnull().sum().sum()} missing values in features...")
330
+ X = X.fillna(X.mean(numeric_only=True)).fillna(X.mode().iloc[0])
331
+
332
+ # Handle categorical features
333
+ cat_features = X.select_dtypes(include=['object', 'category']).columns
334
+ if len(cat_features) > 0:
335
+ st.info(f"🔄 Encoding categorical features: {list(cat_features)}")
336
+ X = pd.get_dummies(X, columns=cat_features)
337
+
338
+ # Handle target encoding for classification
339
+ le = None
340
+ if config['task_type'] == "Classification" and y.dtype == 'object':
341
+ le = LabelEncoder()
342
+ y = le.fit_transform(y)
343
+ st.info(f"📊 Target classes: {dict(zip(le.classes_, le.transform(le.classes_)))}")
344
+
345
+ # Handle class imbalance
346
+ if config['task_type'] == "Classification" and config['handle_imbalance']:
347
+ from sklearn.utils.class_weight import compute_class_weight
348
+ class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
349
+ st.info(f"⚖️ Using class weights: {dict(zip(np.unique(y), class_weights))}")
350
+
351
+ # Scale features
352
+ scaler = None
353
+ if config['scaler'] != "None":
354
+ if config['scaler'] == "StandardScaler":
355
+ scaler = StandardScaler()
356
+ else:
357
+ scaler = MinMaxScaler()
358
+ X_scaled = scaler.fit_transform(X)
359
+ X = pd.DataFrame(X_scaled, columns=X.columns)
360
+
361
+ # Split data
362
+ stratify = y if config['task_type'] == "Classification" else None
363
+ X_train, X_test, y_train, y_test = train_test_split(
364
+ X, y, test_size=config['test_size'],
365
+ random_state=config['random_state'],
366
+ stratify=stratify
367
+ )
368
+
369
+ st.success(f"✅ Data prepared: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples")
370
+
371
+ except Exception as e:
372
+ st.error(f"❌ Error in data preparation: {str(e)}")
373
+ return
374
+
375
+ # Train models
376
+ results = []
377
+ trained_models = {}
378
+ progress_bar = st.progress(0)
379
+ status_text = st.empty()
380
+
381
+ for i, (model_name, model) in enumerate(config['models'].items()):
382
+ status_text.text(f"🔄 Training {model_name}...")
383
+
384
+ try:
385
+ # Apply class weights if needed
386
+ if config['task_type'] == "Classification" and config['handle_imbalance']:
387
+ if hasattr(model, 'class_weight'):
388
+ model.set_params(class_weight='balanced')
389
+
390
+ # Train
391
+ start_time = time.time()
392
+ model.fit(X_train, y_train)
393
+ training_time = time.time() - start_time
394
+
395
+ # Store trained model
396
+ trained_models[model_name] = {
397
+ 'model': model,
398
+ 'scaler': scaler,
399
+ 'label_encoder': le,
400
+ 'features': X.columns.tolist()
401
+ }
402
+
403
+ # Predict
404
+ y_pred = model.predict(X_test)
405
+
406
+ # Calculate metrics
407
+ if config['task_type'] == "Classification":
408
+ try:
409
+ accuracy = accuracy_score(y_test, y_pred)
410
+ precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
411
+ recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
412
+ f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
413
+
414
+ # Cross-validation
415
+ cv_scores = cross_val_score(model, X_train, y_train, cv=config['cv_folds'])
416
+
417
+ results.append({
418
+ "Model": model_name,
419
+ "Accuracy": f"{accuracy:.4f}",
420
+ "Precision": f"{precision:.4f}",
421
+ "Recall": f"{recall:.4f}",
422
+ "F1 Score": f"{f1:.4f}",
423
+ "CV Score": f"{cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})",
424
+ "Time (s)": f"{training_time:.2f}"
425
+ })
426
+ except Exception as e:
427
+ st.warning(f"⚠️ Could not calculate all metrics for {model_name}: {str(e)}")
428
+
429
+ else: # Regression
430
+ try:
431
+ mse = mean_squared_error(y_test, y_pred)
432
+ rmse = np.sqrt(mse)
433
+ mae = mean_absolute_error(y_test, y_pred)
434
+ r2 = r2_score(y_test, y_pred)
435
+
436
+ # Cross-validation
437
+ cv_scores = cross_val_score(model, X_train, y_train, cv=config['cv_folds'], scoring='r2')
438
+
439
+ results.append({
440
+ "Model": model_name,
441
+ "R² Score": f"{r2:.4f}",
442
+ "RMSE": f"{rmse:.4f}",
443
+ "MAE": f"{mae:.4f}",
444
+ "CV R²": f"{cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})",
445
+ "Time (s)": f"{training_time:.2f}"
446
+ })
447
+ except Exception as e:
448
+ st.warning(f"⚠️ Could not calculate all metrics for {model_name}: {str(e)}")
449
+
450
+ except MemoryError:
451
+ st.error(f"❌ Out of memory training {model_name}. Try using fewer features or a sample.")
452
+ except Exception as e:
453
+ st.warning(f"⚠️ Error training {model_name}: {str(e)}")
454
+
455
+ progress_bar.progress((i + 1) / len(config['models']))
456
+
457
+ status_text.text("✅ Training complete!")
458
+
459
+ if not results:
460
+ st.error("❌ No models were successfully trained")
461
+ return
462
+
463
+ # Display results
464
+ st.subheader("📊 Model Performance Comparison")
465
+ results_df = pd.DataFrame(results)
466
+
467
+ # Highlight best model
468
+ if config['task_type'] == "Classification":
469
+ best_idx = results_df['F1 Score'].astype(float).idxmax()
470
+ else:
471
+ best_idx = results_df['R² Score'].astype(float).idxmax()
472
+
473
+ # Style dataframe
474
+ def highlight_best(s):
475
+ is_best = s.index == best_idx
476
+ return ['background-color: #90EE90' if v else '' for v in is_best]
477
+
478
+ st.dataframe(results_df.style.apply(highlight_best), use_container_width=True)
479
+
480
+ # Store results
481
+ st.session_state['trained_models'] = trained_models
482
+ st.session_state['X_train'] = X_train
483
+ st.session_state['X_test'] = X_test
484
+ st.session_state['y_train'] = y_train
485
+ st.session_state['y_test'] = y_test
486
+ st.session_state['task_type'] = config['task_type']
487
+ st.session_state['results_df'] = results_df
488
+
489
+ # Best model info
490
+ best_model_name = results_df.iloc[best_idx]['Model']
491
+ st.success(f"🏆 **Best Model:** {best_model_name}")
492
+
493
+ st.markdown('</div>', unsafe_allow_html=True)
494
+
495
+ except Exception as e:
496
+ st.error(f"❌ Critical error in training: {str(e)}")
497
+ st.info("💡 Try reducing the number of features or models")
498
+
499
+ with tab3:
500
+ if 'trained_models' not in st.session_state:
501
+ st.info("ℹ️ Train some models first in the 'Model Training' tab")
502
+ return
503
+
504
+ try:
505
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
506
+ st.subheader("📈 Detailed Model Evaluation")
507
+
508
+ # Model selection for detailed evaluation
509
+ selected_eval_model = st.selectbox(
510
+ "Select model for detailed evaluation",
511
+ list(st.session_state['trained_models'].keys())
512
+ )
513
+
514
+ model_info = st.session_state['trained_models'][selected_eval_model]
515
+ model = model_info['model']
516
+ X_test = st.session_state['X_test']
517
+ y_test = st.session_state['y_test']
518
+ task_type = st.session_state['task_type']
519
+
520
+ try:
521
+ y_pred = model.predict(X_test)
522
+
523
+ if task_type == "Classification":
524
+ # Confusion Matrix
525
+ st.markdown("### Confusion Matrix")
526
+ cm = confusion_matrix(y_test, y_pred)
527
+
528
+ fig = px.imshow(cm,
529
+ text_auto=True,
530
+ aspect="auto",
531
+ color_continuous_scale='Blues',
532
+ title=f"Confusion Matrix - {selected_eval_model}")
533
+
534
+ fig.update_layout(xaxis_title="Predicted", yaxis_title="Actual")
535
+ st.plotly_chart(fig, use_container_width=True)
536
+
537
+ # Classification Report
538
+ st.markdown("### Classification Report")
539
+ report = classification_report(y_test, y_pred, output_dict=True)
540
+ report_df = pd.DataFrame(report).transpose()
541
+ st.dataframe(report_df.style.format("{:.4f}"), use_container_width=True)
542
+
543
+ # ROC Curve (for binary classification)
544
+ if len(np.unique(y_test)) == 2 and hasattr(model, "predict_proba"):
545
+ st.markdown("### ROC Curve")
546
+ y_pred_proba = model.predict_proba(X_test)[:, 1]
547
+ fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
548
+ roc_auc = auc(fpr, tpr)
549
+
550
+ fig = go.Figure()
551
+ fig.add_trace(go.Scatter(x=fpr, y=tpr,
552
+ mode='lines',
553
+ name=f'ROC (AUC = {roc_auc:.3f})',
554
+ line=dict(color='blue', width=2)))
555
+ fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
556
+ mode='lines',
557
+ name='Random',
558
+ line=dict(color='gray', dash='dash')))
559
+
560
+ fig.update_layout(xaxis_title="False Positive Rate",
561
+ yaxis_title="True Positive Rate",
562
+ title=f"ROC Curve - {selected_eval_model}")
563
+
564
+ st.plotly_chart(fig, use_container_width=True)
565
+
566
+ else: # Regression
567
+ # Actual vs Predicted plot
568
+ st.markdown("### Actual vs Predicted")
569
+
570
+ fig = px.scatter(x=y_test, y=y_pred,
571
+ labels={'x': 'Actual', 'y': 'Predicted'},
572
+ title=f"Actual vs Predicted - {selected_eval_model}",
573
+ trendline="ols")
574
+
575
+ # Add perfect prediction line
576
+ min_val = min(y_test.min(), y_pred.min())
577
+ max_val = max(y_test.max(), y_pred.max())
578
+ fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
579
+ mode='lines', name='Perfect Prediction',
580
+ line=dict(color='red', dash='dash')))
581
+
582
+ st.plotly_chart(fig, use_container_width=True)
583
+
584
+ # Residuals plot
585
+ st.markdown("### Residuals Analysis")
586
+ residuals = y_test - y_pred
587
+
588
+ fig = make_subplots(rows=1, cols=2,
589
+ subplot_titles=("Residuals vs Predicted", "Residuals Distribution"))
590
+
591
+ fig.add_trace(go.Scatter(x=y_pred, y=residuals,
592
+ mode='markers',
593
+ name='Residuals',
594
+ marker=dict(color='blue', opacity=0.5)), row=1, col=1)
595
+
596
+ fig.add_hline(y=0, line_dash="dash", line_color="red", row=1, col=1)
597
+
598
+ fig.add_trace(go.Histogram(x=residuals, nbinsx=30,
599
+ name='Distribution',
600
+ marker_color='green'), row=1, col=2)
601
+
602
+ fig.update_layout(title=f"Residual Analysis - {selected_eval_model}")
603
+ st.plotly_chart(fig, use_container_width=True)
604
+
605
+ # Residual statistics
606
+ col1, col2, col3 = st.columns(3)
607
+ with col1:
608
+ st.metric("Mean Residual", f"{residuals.mean():.4f}")
609
+ with col2:
610
+ st.metric("Std Residual", f"{residuals.std():.4f}")
611
+ with col3:
612
+ st.metric("Residual Range", f"{residuals.max() - residuals.min():.4f}")
613
+
614
+ # Feature Importance (if available)
615
+ if hasattr(model, 'feature_importances_'):
616
+ st.markdown("### Feature Importance")
617
+ feature_importance = pd.DataFrame({
618
+ 'feature': X_test.columns,
619
+ 'importance': model.feature_importances_
620
+ }).sort_values('importance', ascending=True)
621
+
622
+ fig = px.bar(feature_importance.tail(10),
623
+ x='importance', y='feature',
624
+ orientation='h',
625
+ title="Top 10 Feature Importances",
626
+ color='importance',
627
+ color_continuous_scale='Viridis')
628
+ st.plotly_chart(fig, use_container_width=True)
629
+
630
+ except Exception as e:
631
+ st.error(f"❌ Error in evaluation: {str(e)}")
632
+
633
+ st.markdown('</div>', unsafe_allow_html=True)
634
+
635
+ except Exception as e:
636
+ st.error(f"❌ Error loading evaluation: {str(e)}")
637
+
638
+ with tab4:
639
+ if 'trained_models' not in st.session_state:
640
+ st.info("ℹ️ Train some models first in the 'Model Training' tab")
641
+ return
642
+
643
+ try:
644
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
645
+ st.subheader("🔮 Make Predictions")
646
+
647
+ # Model selection for predictions
648
+ selected_pred_model = st.selectbox(
649
+ "Select model for predictions",
650
+ list(st.session_state['trained_models'].keys()),
651
+ key="pred_model"
652
+ )
653
+
654
+ model_info = st.session_state['trained_models'][selected_pred_model]
655
+ model = model_info['model']
656
+ scaler = model_info['scaler']
657
+ le = model_info.get('label_encoder')
658
+ feature_names = model_info['features']
659
+
660
+ # Input method
661
+ input_method = st.radio(
662
+ "Input method",
663
+ ["Manual input", "Upload new data", "Batch prediction"],
664
+ horizontal=True
665
+ )
666
+
667
+ if input_method == "Manual input":
668
+ st.markdown("### Enter feature values")
669
+
670
+ input_data = {}
671
+ cols = st.columns(3)
672
+
673
+ for i, feature in enumerate(feature_names):
674
+ with cols[i % 3]:
675
+ try:
676
+ # Get feature range from training data
677
+ if feature in st.session_state['X_train'].columns:
678
+ min_val = float(st.session_state['X_train'][feature].min())
679
+ max_val = float(st.session_state['X_train'][feature].max())
680
+ mean_val = float(st.session_state['X_train'][feature].mean())
681
+
682
+ input_data[feature] = st.slider(
683
+ f"{feature}",
684
+ min_val, max_val, mean_val,
685
+ format="%.4f",
686
+ key=f"manual_{feature}"
687
+ )
688
+ else:
689
+ input_data[feature] = st.number_input(
690
+ f"{feature}",
691
+ value=0.0,
692
+ key=f"manual_{feature}"
693
+ )
694
+ except Exception as e:
695
+ st.warning(f"⚠️ Error with {feature}: {str(e)}")
696
+ input_data[feature] = 0.0
697
+
698
+ if st.button("🔮 Predict", use_container_width=True):
699
+ try:
700
+ # Convert input to DataFrame
701
+ input_df = pd.DataFrame([input_data])
702
+
703
+ # Ensure all features are present
704
+ for col in feature_names:
705
+ if col not in input_df.columns:
706
+ input_df[col] = 0
707
+
708
+ input_df = input_df[feature_names]
709
+
710
+ # Scale if needed
711
+ if scaler is not None:
712
+ input_scaled = scaler.transform(input_df)
713
+ input_df = pd.DataFrame(input_scaled, columns=feature_names)
714
+
715
+ # Make prediction
716
+ prediction = model.predict(input_df)[0]
717
+
718
+ # Decode if needed
719
+ if le is not None:
720
+ prediction = le.inverse_transform([int(prediction)])[0]
721
+
722
+ # Display prediction with styling
723
+ st.markdown("""
724
+ <div class="success-container" style="text-align: center; padding: 2rem;">
725
+ <h3>🎯 Prediction Result</h3>
726
+ <h1 style="font-size: 3rem;">{}</h1>
727
+ </div>
728
+ """.format(prediction), unsafe_allow_html=True)
729
+
730
+ except Exception as e:
731
+ st.error(f"❌ Prediction error: {str(e)}")
732
+
733
+ elif input_method == "Upload new data":
734
+ pred_file = st.file_uploader("Upload data for predictions",
735
+ type=["csv", "xlsx"],
736
+ key="pred_file")
737
+
738
+ if pred_file:
739
+ try:
740
+ if pred_file.name.endswith("csv"):
741
+ pred_df = pd.read_csv(pred_file)
742
+ else:
743
+ pred_df = pd.read_excel(pred_file)
744
+
745
+ st.subheader("📋 Uploaded Data Preview")
746
+ st.dataframe(pred_df.head())
747
+
748
+ if st.button("🔮 Predict for all rows", use_container_width=True):
749
+ with st.spinner("Making predictions..."):
750
+ try:
751
+ # Prepare data
752
+ pred_processed = pred_df.copy()
753
+
754
+ # Handle categorical features if needed
755
+ for col in pred_processed.columns:
756
+ if pred_processed[col].dtype == 'object':
757
+ pred_processed = pd.get_dummies(pred_processed, columns=[col])
758
+
759
+ # Align columns with training data
760
+ for col in feature_names:
761
+ if col not in pred_processed.columns:
762
+ pred_processed[col] = 0
763
+
764
+ pred_processed = pred_processed[feature_names]
765
+
766
+ # Scale if needed
767
+ if scaler is not None:
768
+ pred_scaled = scaler.transform(pred_processed)
769
+ pred_processed = pd.DataFrame(pred_scaled, columns=feature_names)
770
+
771
+ # Make predictions
772
+ predictions = model.predict(pred_processed)
773
+
774
+ # Decode if needed
775
+ if le is not None:
776
+ predictions = le.inverse_transform(predictions.astype(int))
777
+
778
+ # Add predictions to dataframe
779
+ pred_df['Prediction'] = predictions
780
+
781
+ st.subheader("📊 Predictions Result")
782
+ st.dataframe(pred_df)
783
+
784
+ # Download predictions
785
+ csv = pred_df.to_csv(index=False)
786
+ st.download_button(
787
+ label="📥 Download Predictions",
788
+ data=csv,
789
+ file_name="predictions.csv",
790
+ mime="text/csv",
791
+ use_container_width=True
792
+ )
793
+
794
+ except Exception as e:
795
+ st.error(f"❌ Prediction error: {str(e)}")
796
+
797
+ except Exception as e:
798
+ st.error(f"❌ Error reading file: {str(e)}")
799
+
800
+ elif input_method == "Batch prediction":
801
+ st.markdown("### Batch Prediction Settings")
802
+
803
+ n_samples = st.number_input("Number of samples to generate",
804
+ min_value=1, max_value=1000, value=10)
805
+
806
+ if st.button("🎲 Generate Random Samples & Predict", use_container_width=True):
807
+ try:
808
+ # Generate random samples based on training data distribution
809
+ random_samples = {}
810
+ for feature in feature_names:
811
+ if feature in st.session_state['X_train'].columns:
812
+ mean = st.session_state['X_train'][feature].mean()
813
+ std = st.session_state['X_train'][feature].std()
814
+ random_samples[feature] = np.random.normal(mean, std, n_samples)
815
+ else:
816
+ random_samples[feature] = np.zeros(n_samples)
817
+
818
+ batch_df = pd.DataFrame(random_samples)
819
+
820
+ # Scale if needed
821
+ if scaler is not None:
822
+ batch_scaled = scaler.transform(batch_df)
823
+ batch_df = pd.DataFrame(batch_scaled, columns=feature_names)
824
+
825
+ # Make predictions
826
+ predictions = model.predict(batch_df)
827
+
828
+ # Decode if needed
829
+ if le is not None:
830
+ predictions = le.inverse_transform(predictions.astype(int))
831
+
832
+ # Add predictions to dataframe
833
+ batch_df['Prediction'] = predictions
834
+
835
+ st.subheader("📊 Batch Predictions")
836
+ st.dataframe(batch_df)
837
+
838
+ # Statistics
839
+ if le is None: # Numerical predictions
840
+ st.subheader("📈 Prediction Statistics")
841
+ col1, col2, col3 = st.columns(3)
842
+ with col1:
843
+ st.metric("Mean", f"{predictions.mean():.4f}")
844
+ with col2:
845
+ st.metric("Std", f"{predictions.std():.4f}")
846
+ with col3:
847
+ st.metric("Range", f"{predictions.max() - predictions.min():.4f}")
848
+
849
+ # Download predictions
850
+ csv = batch_df.to_csv(index=False)
851
+ st.download_button(
852
+ label="📥 Download Batch Predictions",
853
+ data=csv,
854
+ file_name="batch_predictions.csv",
855
+ mime="text/csv",
856
+ use_container_width=True
857
+ )
858
+
859
+ except Exception as e:
860
+ st.error(f"❌ Batch prediction error: {str(e)}")
861
+
862
+ st.markdown('</div>', unsafe_allow_html=True)
863
+
864
+ except Exception as e:
865
+ st.error(f"❌ Error in prediction: {str(e)}")
866
+
867
+ with tab5:
868
+ if 'results_df' not in st.session_state:
869
+ st.info("ℹ️ Train some models first in the 'Model Training' tab")
870
+ return
871
+
872
+ try:
873
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
874
+ st.subheader("📋 Machine Learning Report")
875
+
876
+ results_df = st.session_state['results_df']
877
+ config = st.session_state.get('ml_config', {})
878
+
879
+ # Generate report
880
+ report = f"""
881
+ # Machine Learning Pipeline Report
882
+
883
+ ## Configuration Summary
884
+ - **Task Type:** {config.get('task_type', 'N/A')}
885
+ - **Target Variable:** {config.get('target', 'N/A')}
886
+ - **Number of Features:** {len(config.get('features', []))}
887
+ - **Test Size:** {config.get('test_size', 0.2)*100:.0f}%
888
+ - **Cross-Validation Folds:** {config.get('cv_folds', 5)}
889
+ - **Feature Scaling:** {config.get('scaler', 'None')}
890
+
891
+ ## Dataset Information
892
+ - **Total Samples:** {st.session_state.get('X_train', pd.DataFrame()).shape[0] + st.session_state.get('X_test', pd.DataFrame()).shape[0]}
893
+ - **Training Samples:** {st.session_state.get('X_train', pd.DataFrame()).shape[0]}
894
+ - **Test Samples:** {st.session_state.get('X_test', pd.DataFrame()).shape[0]}
895
+
896
+ ## Model Performance Summary
897
+
898
+ {results_df.to_string()}
899
+
900
+ ## Best Model
901
+ **{results_df.iloc[0]['Model']}** performed best based on {'F1 Score' if config.get('task_type') == 'Classification' else 'R² Score'}.
902
+
903
+ ## Recommendations
904
+ """
905
+
906
+ # Add recommendations based on results
907
+ if config.get('task_type') == 'Classification':
908
+ if float(results_df['Accuracy'].iloc[0]) > 0.9:
909
+ report += "\n- ✓ Excellent model performance achieved"
910
+ elif float(results_df['Accuracy'].iloc[0]) > 0.7:
911
+ report += "\n- ✓ Good model performance"
912
+ else:
913
+ report += "\n- ⚠️ Model performance could be improved. Consider feature engineering or trying different algorithms"
914
+ else:
915
+ if float(results_df['R² Score'].iloc[0]) > 0.8:
916
+ report += "\n- ✓ Excellent model performance achieved"
917
+ elif float(results_df['R² Score'].iloc[0]) > 0.6:
918
+ report += "\n- ✓ Good model performance"
919
+ else:
920
+ report += "\n- ⚠️ Model performance could be improved. Consider feature engineering or trying different algorithms"
921
+
922
+ st.markdown(report)
923
+
924
+ # Download report
925
+ st.download_button(
926
+ label="📥 Download ML Report",
927
+ data=report,
928
+ file_name="ml_report.txt",
929
+ mime="text/plain",
930
+ use_container_width=True
931
+ )
932
+
933
+ st.markdown('</div>', unsafe_allow_html=True)
934
+
935
+ except Exception as e:
936
+ st.error(f"❌ Error generating report: {str(e)}")
937
+
938
+ except Exception as e:
939
+ st.error(f"❌ Critical error in ML pipeline: {str(e)}")
940
+ st.info("💡 Please check your data and try again. If the problem persists, try with a smaller dataset.")
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.28.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ scikit-learn>=1.3.0
5
+ plotly>=5.17.0
6
+ matplotlib>=3.7.0
7
+ xgboost>=1.7.0
8
+ lightgbm>=4.0.0
9
+ openpyxl>=3.1.0
10
+ scipy>=1.10.0
11
+ shap>=0.42.0
12
+ imbalanced-learn>=0.11.0
13
+ category-encoders>=2.6.0
14
+ statsmodels>=0.14.0
15
+ seaborn>=0.12.0
16
+ joblib>=1.3.0
statistical_analysis.py ADDED
@@ -0,0 +1,928 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import scipy.stats as stats
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ from plotly.subplots import make_subplots
8
+ import statsmodels.api as sm
9
+ from statsmodels.formula.api import ols
10
+ from statsmodels.stats.multicomp import pairwise_tukeyhsd
11
+ from statsmodels.tsa.stattools import adfuller, kpss
12
+ from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
13
+ import matplotlib.pyplot as plt
14
+ import warnings
15
+ warnings.filterwarnings('ignore')
16
+
17
def statistical_analysis(df):
    """Render the "Advanced Statistical Analysis" Streamlit page.

    Builds six tabs (descriptive statistics, correlation, hypothesis
    testing, distribution analysis, time series, probability/sampling)
    plus a downloadable plain-text report for the supplied DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to analyse. If it is empty, or contains no numeric
        columns, an error/warning is shown and the function returns early.

    Returns
    -------
    None
        All output is produced through Streamlit side effects.
    """
    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>📐 Advanced Statistical Analysis</h2>
        <p style='color: gray;'>Comprehensive statistical tests, hypothesis testing, and probability analysis</p>
    </div>
    """, unsafe_allow_html=True)

    # Guard clauses: nothing to analyse without data / numeric columns.
    if df.empty:
        st.error("❌ The dataset is empty. Please upload a valid dataset.")
        return

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    if not numeric_cols:
        st.warning("⚠️ No numeric columns found. Statistical analysis requires numeric data.")
        return

    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
        "📊 Descriptive Stats",
        "📈 Correlation Analysis",
        "🔬 Hypothesis Testing",
        "📊 Distribution Analysis",
        "📉 Time Series Analysis",
        "🎲 Probability & Sampling"
    ])

    with tab1:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Descriptive Statistics")

        try:
            # Build one summary record per numeric column, then construct the
            # DataFrame in a single call. (Assigning dicts column-by-column
            # into an empty DataFrame is fragile across pandas versions.)
            summary = {}
            for col in numeric_cols:
                data = df[col].dropna()
                if len(data) > 0:
                    # 95% t-based confidence interval for the mean.
                    ci = stats.t.interval(0.95, len(data) - 1, loc=data.mean(), scale=stats.sem(data))
                    summary[col] = {
                        'Count': len(data),
                        'Mean': data.mean(),
                        'Std Dev': data.std(),
                        'Variance': data.var(),
                        'Min': data.min(),
                        'Q1 (25%)': data.quantile(0.25),
                        'Median (50%)': data.median(),
                        'Q3 (75%)': data.quantile(0.75),
                        'Max': data.max(),
                        'Range': data.max() - data.min(),
                        'IQR': data.quantile(0.75) - data.quantile(0.25),
                        'Skewness': data.skew(),
                        'Kurtosis': data.kurtosis(),
                        'Coefficient of Variation (%)': (data.std() / data.mean() * 100) if data.mean() != 0 else np.nan,
                        '95% CI Lower': ci[0],
                        '95% CI Upper': ci[1]
                    }

            # Transpose so each numeric column becomes one row of stats.
            stats_df = pd.DataFrame(summary).T
            st.dataframe(stats_df.style.format("{:.4f}"), use_container_width=True)

            # Summary cards
            st.subheader("📊 Summary Cards")
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Total Numeric Columns", len(numeric_cols))
            with col2:
                st.metric("Total Observations", f"{df.shape[0]:,}")
            with col3:
                st.metric("Complete Cases", f"{df.dropna().shape[0]:,}")
            with col4:
                completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
                st.metric("Data Completeness", f"{completeness:.1f}%")

            # Four-panel distribution view for one selected column.
            st.subheader("Distribution Analysis")
            selected_col = st.selectbox("Select column for detailed distribution analysis", numeric_cols)

            data = df[selected_col].dropna()

            fig = make_subplots(rows=2, cols=2,
                                subplot_titles=("Histogram with KDE", "Box Plot",
                                                "Violin Plot", "Q-Q Plot"),
                                specs=[[{"type": "xy"}, {"type": "xy"}],
                                       [{"type": "xy"}, {"type": "xy"}]])

            fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Histogram", opacity=0.7), row=1, col=1)
            fig.add_trace(go.Box(y=data, name="Box Plot", boxpoints='outliers'), row=1, col=2)
            fig.add_trace(go.Violin(y=data, name="Violin Plot", box_visible=True, meanline_visible=True),
                          row=2, col=1)

            # Q-Q plot: use deterministic theoretical normal quantiles
            # (norm.ppf of plotting positions) rather than random draws,
            # which made the plot nondeterministic and statistically wrong.
            n_obs = len(data)
            probs = (np.arange(1, n_obs + 1) - 0.5) / n_obs
            theoretical_q = stats.norm.ppf(probs, loc=data.mean(), scale=data.std())
            data_sorted = np.sort(data)
            fig.add_trace(go.Scatter(x=theoretical_q, y=data_sorted, mode='markers', name='Q-Q'),
                          row=2, col=2)

            # 45-degree reference line for the Q-Q panel.
            min_val = min(theoretical_q.min(), data_sorted.min())
            max_val = max(theoretical_q.max(), data_sorted.max())
            fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
                                     mode='lines', name='Reference',
                                     line=dict(color='red', dash='dash')),
                          row=2, col=2)

            fig.update_layout(height=800, title_text=f"Distribution Analysis of {selected_col}")
            st.plotly_chart(fig, use_container_width=True)

            # Tukey-fence outlier detection (1.5 * IQR).
            Q1 = data.quantile(0.25)
            Q3 = data.quantile(0.75)
            IQR = Q3 - Q1
            outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)]

            if len(outliers) > 0:
                st.warning(f"⚠️ **Outliers detected**: {len(outliers)} outliers found ({len(outliers)/len(data)*100:.2f}%)")
                with st.expander("View outlier values"):
                    st.write(outliers.tolist())
            else:
                st.success("✅ No outliers detected in this column")

        except Exception as e:
            st.error(f"❌ Error in descriptive statistics: {str(e)}")
            st.info("💡 Tip: Check if your data contains non-numeric values or extreme outliers")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab2:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Advanced Correlation Analysis")

        try:
            if len(numeric_cols) >= 2:
                corr_method = st.radio(
                    "Select correlation method",
                    ["Pearson (linear)", "Spearman (rank)", "Kendall (ordinal)"],
                    horizontal=True
                )

                method_map = {
                    "Pearson (linear)": "pearson",
                    "Spearman (rank)": "spearman",
                    "Kendall (ordinal)": "kendall"
                }

                corr_matrix = df[numeric_cols].corr(method=method_map[corr_method])

                fig = px.imshow(corr_matrix,
                                text_auto=True,
                                aspect="auto",
                                color_continuous_scale='RdBu_r',
                                title=f"{corr_method} Correlation Matrix",
                                zmin=-1, zmax=1)
                fig.update_layout(height=600)
                st.plotly_chart(fig, use_container_width=True)

                # Pairwise significance test for a user-chosen feature pair.
                st.subheader("📊 Correlation Significance Testing")

                col1, col2 = st.columns(2)
                with col1:
                    feat1 = st.selectbox("Select first feature", numeric_cols, key="corr_feat1")
                with col2:
                    feat2 = st.selectbox("Select second feature", [c for c in numeric_cols if c != feat1], key="corr_feat2")

                data1 = df[feat1].dropna()
                data2 = df[feat2].dropna()

                # Concatenation aligns on the index; dropna keeps only rows
                # where both features are present.
                combined = pd.concat([data1, data2], axis=1).dropna()
                if len(combined) > 0:
                    corr_coef, p_value = stats.pearsonr(combined.iloc[:, 0], combined.iloc[:, 1])

                    st.write(f"**Pearson correlation coefficient:** {corr_coef:.4f}")
                    st.write(f"**P-value:** {p_value:.4f}")

                    if p_value < 0.05:
                        st.success(f"✅ Statistically significant correlation (p < 0.05)")
                    else:
                        st.info(f"ℹ️ No statistically significant correlation (p >= 0.05)")

                    # Fisher z-transform CI for the correlation coefficient.
                    n = len(combined)
                    r = corr_coef
                    z = np.arctanh(r)
                    se = 1 / np.sqrt(n - 3)
                    ci_z = stats.norm.interval(0.95, loc=z, scale=se)
                    ci_r = np.tanh(ci_z)

                    st.write(f"**95% Confidence Interval:** [{ci_r[0]:.4f}, {ci_r[1]:.4f}]")

                    fig = px.scatter(combined, x=combined.columns[0], y=combined.columns[1],
                                     trendline="ols", title=f"Relationship: {feat1} vs {feat2}")
                    st.plotly_chart(fig, use_container_width=True)

                # Partial correlation: correlation of the two features after
                # removing the linear effect of a chosen control variable.
                st.subheader("🔍 Partial Correlation Analysis")
                if len(numeric_cols) >= 3:
                    from sklearn.linear_model import LinearRegression

                    control_var = st.selectbox("Select control variable",
                                               [c for c in numeric_cols if c not in [feat1, feat2]])

                    X_control = df[[control_var]].dropna()
                    y1 = df[feat1].dropna()
                    y2 = df[feat2].dropna()

                    aligned_data = pd.concat([X_control, y1, y2], axis=1).dropna()

                    if len(aligned_data) > 0:
                        # Residualize both features against the control, then
                        # correlate the residuals.
                        model1 = LinearRegression().fit(aligned_data[[control_var]], aligned_data[feat1])
                        res1 = aligned_data[feat1] - model1.predict(aligned_data[[control_var]])

                        model2 = LinearRegression().fit(aligned_data[[control_var]], aligned_data[feat2])
                        res2 = aligned_data[feat2] - model2.predict(aligned_data[[control_var]])

                        partial_corr, partial_p = stats.pearsonr(res1, res2)

                        st.write(f"**Partial correlation (controlling for {control_var}):** {partial_corr:.4f}")
                        st.write(f"**P-value:** {partial_p:.4f}")

                        if abs(partial_corr) < abs(corr_coef):
                            st.info(f"ℹ️ The correlation decreases when controlling for {control_var}, suggesting it may be a confounding variable")
            else:
                st.warning("⚠️ Need at least 2 numeric columns for correlation analysis")

        except Exception as e:
            st.error(f"❌ Error in correlation analysis: {str(e)}")
            st.info("💡 Tip: Ensure your data has sufficient non-null values for correlation calculation")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab3:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🔬 Statistical Hypothesis Testing")

        try:
            test_category = st.selectbox(
                "Select test category",
                ["Parametric Tests", "Non-parametric Tests", "ANOVA & Post-hoc", "Goodness of Fit"]
            )

            if test_category == "Parametric Tests":
                param_test = st.selectbox(
                    "Select parametric test",
                    ["One-Sample t-test", "Independent t-test", "Paired t-test", "Z-test"]
                )

                if param_test == "One-Sample t-test":
                    if numeric_cols:
                        col = st.selectbox("Select variable", numeric_cols)
                        test_value = st.number_input("Test value (population mean)", value=0.0)

                        data = df[col].dropna()
                        if len(data) > 0:
                            t_stat, p_value = stats.ttest_1samp(data, test_value)

                            st.write(f"**t-statistic:** {t_stat:.4f}")
                            st.write(f"**p-value:** {p_value:.4f}")
                            st.write(f"**Degrees of freedom:** {len(data)-1}")

                            # Effect size (Cohen's d)
                            cohens_d = (data.mean() - test_value) / data.std()
                            st.write(f"**Cohen's d (effect size):** {cohens_d:.4f}")

                            if p_value < 0.05:
                                st.success(f"✅ Reject null hypothesis: Mean is significantly different from {test_value}")
                            else:
                                st.info(f"ℹ️ Fail to reject null hypothesis: Mean is not significantly different from {test_value}")

                            fig = go.Figure()
                            fig.add_trace(go.Histogram(x=data, name="Sample", opacity=0.7))
                            fig.add_vline(x=test_value, line_dash="dash", line_color="red",
                                          annotation_text=f"Test value: {test_value}")
                            fig.add_vline(x=data.mean(), line_color="green",
                                          annotation_text=f"Sample mean: {data.mean():.2f}")
                            fig.update_layout(title=f"One-Sample t-test: {col}")
                            st.plotly_chart(fig, use_container_width=True)

                elif param_test == "Independent t-test":
                    if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
                        num_col = st.selectbox("Select numeric variable", numeric_cols, key="ind_num")
                        cat_col = st.selectbox("Select grouping variable", categorical_cols, key="ind_cat")

                        groups = df[cat_col].dropna().unique()
                        if len(groups) == 2:
                            group1 = df[df[cat_col] == groups[0]][num_col].dropna()
                            group2 = df[df[cat_col] == groups[1]][num_col].dropna()

                            # Levene's test decides the equal-variance flag for the t-test.
                            levene_stat, levene_p = stats.levene(group1, group2)
                            equal_var = levene_p > 0.05

                            t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=equal_var)

                            st.write(f"**Groups:** {groups[0]} (n={len(group1)}) vs {groups[1]} (n={len(group2)})")
                            st.write(f"**Levene's test for equal variances:** p={levene_p:.4f}")
                            # Bug fix: the bold marker was previously unterminated.
                            st.write(f"**Assuming {'equal' if equal_var else 'unequal'} variances**")
                            st.write(f"**t-statistic:** {t_stat:.4f}")
                            st.write(f"**p-value:** {p_value:.4f}")

                            # Effect size (Cohen's d) using the pooled standard deviation.
                            pooled_std = np.sqrt(((len(group1)-1)*group1.std()**2 + (len(group2)-1)*group2.std()**2) /
                                                 (len(group1)+len(group2)-2))
                            cohens_d = (group1.mean() - group2.mean()) / pooled_std
                            st.write(f"**Cohen's d (effect size):** {cohens_d:.4f}")

                            if p_value < 0.05:
                                st.success(f"✅ Significant difference found between groups")
                            else:
                                st.info(f"ℹ️ No significant difference found between groups")

                            fig = px.box(df, x=cat_col, y=num_col, title=f"Comparison: {num_col} by {cat_col}")
                            st.plotly_chart(fig, use_container_width=True)
                        else:
                            st.warning(f"⚠️ Independent t-test requires exactly 2 groups. Found {len(groups)} groups.")

                elif param_test == "Paired t-test":
                    if len(numeric_cols) >= 2:
                        col1 = st.selectbox("Select first measurement", numeric_cols, key="paired1")
                        col2 = st.selectbox("Select second measurement", numeric_cols, key="paired2")

                        paired_data = df[[col1, col2]].dropna()
                        if len(paired_data) > 0:
                            t_stat, p_value = stats.ttest_rel(paired_data[col1], paired_data[col2])

                            st.write(f"**Sample size:** {len(paired_data)}")
                            st.write(f"**Mean difference:** {(paired_data[col1] - paired_data[col2]).mean():.4f}")
                            st.write(f"**t-statistic:** {t_stat:.4f}")
                            st.write(f"**p-value:** {p_value:.4f}")

                            if p_value < 0.05:
                                st.success(f"✅ Significant difference found between measurements")
                            else:
                                st.info(f"ℹ️ No significant difference found between measurements")

                            fig = go.Figure()
                            fig.add_trace(go.Scatter(x=paired_data[col1], y=paired_data[col2],
                                                     mode='markers', text=paired_data.index))
                            fig.add_trace(go.Scatter(x=[paired_data[col1].min(), paired_data[col1].max()],
                                                     y=[paired_data[col1].min(), paired_data[col1].max()],
                                                     mode='lines', name='y=x', line=dict(dash='dash')))
                            fig.update_layout(title=f"Paired Comparison: {col1} vs {col2}")
                            st.plotly_chart(fig, use_container_width=True)

            elif test_category == "Non-parametric Tests":
                nonparam_test = st.selectbox(
                    "Select non-parametric test",
                    ["Mann-Whitney U", "Wilcoxon Signed-Rank", "Kruskal-Wallis H", "Friedman Test"]
                )

                if nonparam_test == "Mann-Whitney U":
                    if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
                        num_col = st.selectbox("Select numeric variable", numeric_cols, key="mw_num")
                        cat_col = st.selectbox("Select grouping variable", categorical_cols, key="mw_cat")

                        groups = df[cat_col].dropna().unique()
                        if len(groups) == 2:
                            group1 = df[df[cat_col] == groups[0]][num_col].dropna()
                            group2 = df[df[cat_col] == groups[1]][num_col].dropna()

                            u_stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')

                            st.write(f"**U-statistic:** {u_stat:.4f}")
                            st.write(f"**p-value:** {p_value:.4f}")

                            # Approximate effect size r = |Z| / sqrt(N), with Z
                            # recovered from the two-sided p-value.
                            z_score = stats.norm.ppf(p_value / 2) if p_value < 1 else 0
                            effect_size = abs(z_score) / np.sqrt(len(group1) + len(group2))
                            st.write(f"**Effect size (r):** {effect_size:.4f}")

                            if p_value < 0.05:
                                st.success(f"✅ Significant difference found between groups")
                            else:
                                st.info(f"ℹ️ No significant difference found between groups")

                            fig = px.violin(df, x=cat_col, y=num_col, box=True, points="all",
                                            title=f"Mann-Whitney U Test: {num_col} by {cat_col}")
                            st.plotly_chart(fig, use_container_width=True)

            elif test_category == "ANOVA & Post-hoc":
                if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
                    num_col = st.selectbox("Select numeric variable", numeric_cols, key="anova_num")
                    cat_col = st.selectbox("Select grouping variable", categorical_cols, key="anova_cat")

                    groups = [df[df[cat_col] == group][num_col].dropna()
                              for group in df[cat_col].unique() if len(df[df[cat_col] == group]) > 0]

                    if len(groups) >= 2:
                        f_stat, p_value = stats.f_oneway(*groups)

                        st.write("**One-way ANOVA Results:**")
                        st.write(f"**F-statistic:** {f_stat:.4f}")
                        st.write(f"**p-value:** {p_value:.4f}")

                        if p_value < 0.05:
                            st.success("✅ Significant differences found between groups")

                            if st.button("Run Tukey HSD Post-hoc Test"):
                                # Bug fix: drop NaNs on aligned (value, group)
                                # pairs. Dropping each column separately could
                                # yield arrays of different lengths.
                                valid = df[[num_col, cat_col]].dropna()
                                tukey = pairwise_tukeyhsd(valid[num_col], valid[cat_col])
                                tukey_df = pd.DataFrame(data=tukey.summary().data[1:],
                                                        columns=tukey.summary().data[0])
                                st.dataframe(tukey_df)

                                # Confidence-interval plot for each pairwise
                                # comparison. Bug fix: the summary column is
                                # named 'p-adj' (not a valid identifier), so
                                # itertuples attribute access failed; use
                                # label-based access instead.
                                fig = go.Figure()
                                for i, (_, row) in enumerate(tukey_df.iterrows()):
                                    color = 'green' if row['p-adj'] < 0.05 else 'red'
                                    fig.add_trace(go.Scatter(x=[row['lower'], row['upper']], y=[i, i],
                                                             mode='lines', line=dict(color=color, width=3),
                                                             name=f"{row['group1']} vs {row['group2']}"))
                                fig.update_layout(title="Tukey HSD Confidence Intervals",
                                                  xaxis_title="Mean Difference",
                                                  yaxis_title="Comparison")
                                st.plotly_chart(fig, use_container_width=True)
                        else:
                            st.info("ℹ️ No significant differences found between groups")

                        fig = px.box(df, x=cat_col, y=num_col, title=f"ANOVA: {num_col} by {cat_col}")
                        st.plotly_chart(fig, use_container_width=True)

        except Exception as e:
            st.error(f"❌ Error in hypothesis testing: {str(e)}")
            st.info("💡 Tip: Ensure you have sufficient data and appropriate variable types for the selected test")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab4:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Distribution Analysis & Normality Tests")

        try:
            if numeric_cols:
                col = st.selectbox("Select column for distribution analysis", numeric_cols, key="dist_col")
                data = df[col].dropna()

                if len(data) > 0:
                    st.markdown("### 🔍 Normality Tests")

                    col1, col2 = st.columns(2)

                    with col1:
                        # Shapiro-Wilk is only reliable (and fast) for n <= 5000.
                        if len(data) <= 5000:
                            shapiro_stat, shapiro_p = stats.shapiro(data)
                            st.write("**Shapiro-Wilk Test**")
                            st.write(f"Statistic: {shapiro_stat:.4f}")
                            st.write(f"P-value: {shapiro_p:.4f}")
                            if shapiro_p < 0.05:
                                st.error("❌ Not normally distributed")
                            else:
                                st.success("✅ Normally distributed")

                    with col2:
                        ks_stat, ks_p = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
                        st.write("**Kolmogorov-Smirnov Test**")
                        st.write(f"Statistic: {ks_stat:.4f}")
                        st.write(f"P-value: {ks_p:.4f}")
                        if ks_p < 0.05:
                            st.error("❌ Not normally distributed")
                        else:
                            st.success("✅ Normally distributed")

                    # Anderson-Darling reports critical values, not a p-value.
                    anderson_stat, anderson_crit, anderson_sig = stats.anderson(data, dist='norm')
                    st.write("**Anderson-Darling Test**")
                    st.write(f"Statistic: {anderson_stat:.4f}")
                    for i in range(len(anderson_crit)):
                        st.write(f"Critical value at {anderson_sig[i]}%: {anderson_crit[i]:.4f}")

                    skew_stat, skew_p = stats.skewtest(data)
                    kurt_stat, kurt_p = stats.kurtosistest(data)

                    st.write("**D'Agostino's Tests**")
                    st.write(f"Skewness test p-value: {skew_p:.4f}")
                    st.write(f"Kurtosis test p-value: {kurt_p:.4f}")

                    st.markdown("### 📈 Distribution Fitting")

                    distributions = ['norm', 'expon', 'gamma', 'beta', 'lognorm', 'uniform']
                    selected_dist = st.selectbox("Select distribution to fit", distributions)

                    # Fit the chosen distribution via a name -> scipy dist map.
                    pdf = None
                    if selected_dist == 'beta':
                        # The beta distribution lives on [0, 1]; rescale and
                        # drop the endpoints before fitting.
                        scaled_data = (data - data.min()) / (data.max() - data.min())
                        scaled_data = scaled_data[(scaled_data > 0) & (scaled_data < 1)]
                        if len(scaled_data) > 0:
                            params = stats.beta.fit(scaled_data)
                            pdf = stats.beta.pdf(np.sort(scaled_data), *params)
                    else:
                        dist_obj = getattr(stats, selected_dist)
                        params = dist_obj.fit(data)
                        pdf = dist_obj.pdf(np.sort(data), *params)

                    fig = go.Figure()
                    fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Data", opacity=0.7))

                    # Overlay the fitted pdf, scaled to histogram counts.
                    # (The beta fit is on rescaled data, so it is not overlaid.)
                    if selected_dist != 'beta' and pdf is not None:
                        fig.add_trace(go.Scatter(x=np.sort(data), y=pdf * len(data) * (data.max() - data.min()) / 30,
                                                 mode='lines', name=f"Fitted {selected_dist}",
                                                 line=dict(color='red', width=2)))

                    fig.update_layout(title=f"Histogram with Fitted {selected_dist} Distribution")
                    st.plotly_chart(fig, use_container_width=True)

                    st.markdown("### 📊 Enhanced Q-Q Plot")

                    # Deterministic theoretical normal quantiles (bug fix:
                    # previously random draws were used, which vary per rerun).
                    n_obs = len(data)
                    probs = (np.arange(1, n_obs + 1) - 0.5) / n_obs
                    theoretical_q = stats.norm.ppf(probs, loc=data.mean(), scale=data.std())
                    data_sorted = np.sort(data)

                    # Bootstrap 95% confidence band around the sample quantiles.
                    n_bootstrap = 100
                    bootstrap_lines = []
                    for i in range(n_bootstrap):
                        bootstrap_sample = np.random.choice(data, len(data), replace=True)
                        bootstrap_sample.sort()
                        bootstrap_lines.append(bootstrap_sample)

                    bootstrap_lines = np.array(bootstrap_lines)
                    lower_band = np.percentile(bootstrap_lines, 2.5, axis=0)
                    upper_band = np.percentile(bootstrap_lines, 97.5, axis=0)

                    fig = go.Figure()

                    fig.add_trace(go.Scatter(x=np.concatenate([theoretical_q, theoretical_q[::-1]]),
                                             y=np.concatenate([lower_band, upper_band[::-1]]),
                                             fill='toself', fillcolor='rgba(0,100,80,0.2)',
                                             line=dict(color='rgba(255,255,255,0)'),
                                             name='95% CI'))

                    fig.add_trace(go.Scatter(x=theoretical_q, y=data_sorted,
                                             mode='markers', name='Data'))

                    fig.add_trace(go.Scatter(x=[data_sorted.min(), data_sorted.max()],
                                             y=[data_sorted.min(), data_sorted.max()],
                                             mode='lines', name='Reference',
                                             line=dict(color='red', dash='dash')))

                    fig.update_layout(title=f"Enhanced Q-Q Plot with 95% Confidence Band")
                    st.plotly_chart(fig, use_container_width=True)

        except Exception as e:
            st.error(f"❌ Error in distribution analysis: {str(e)}")
            st.info("💡 Tip: Ensure you have sufficient data points for distribution fitting")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab5:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📉 Advanced Time Series Analysis")

        try:
            if datetime_cols and numeric_cols:
                date_col = st.selectbox("Select date column", datetime_cols)
                value_col = st.selectbox("Select value column", numeric_cols, key="ts_value_adv")

                # Index the series by the date column, sorted chronologically.
                ts_df = df[[date_col, value_col]].dropna().sort_values(date_col)
                ts_df.set_index(date_col, inplace=True)

                if len(ts_df) >= 10:
                    st.markdown("### 🔄 Time Series Decomposition")

                    from statsmodels.tsa.seasonal import seasonal_decompose

                    freq_options = {
                        'Auto-detect': None,
                        'Daily (7)': 7,
                        'Weekly (52)': 52,
                        'Monthly (12)': 12,
                        'Quarterly (4)': 4
                    }

                    selected_freq = st.selectbox("Select seasonal period", list(freq_options.keys()))
                    period = freq_options[selected_freq]

                    if period is None:
                        # Infer the pandas frequency and map it to a seasonal
                        # period; fall back to weekly seasonality.
                        try:
                            freq = pd.infer_freq(ts_df.index)
                            if freq:
                                period_map = {'D': 7, 'W': 52, 'M': 12, 'Q': 4}
                                period = period_map.get(freq[0], 7)
                        except Exception:
                            period = 7
                        if period is None:
                            period = 7

                    # Decomposition needs at least two full seasonal cycles.
                    if len(ts_df) >= 2 * period:
                        decomposition = seasonal_decompose(ts_df[value_col], model='additive', period=period)

                        fig = make_subplots(rows=4, cols=1,
                                            subplot_titles=('Original', 'Trend', 'Seasonal', 'Residual'))

                        fig.add_trace(go.Scatter(x=ts_df.index, y=ts_df[value_col],
                                                 mode='lines', name='Original'), row=1, col=1)
                        fig.add_trace(go.Scatter(x=ts_df.index, y=decomposition.trend,
                                                 mode='lines', name='Trend'), row=2, col=1)
                        fig.add_trace(go.Scatter(x=ts_df.index, y=decomposition.seasonal,
                                                 mode='lines', name='Seasonal'), row=3, col=1)
                        fig.add_trace(go.Scatter(x=ts_df.index, y=decomposition.resid,
                                                 mode='lines', name='Residual'), row=4, col=1)

                        fig.update_layout(height=800, title="Time Series Decomposition")
                        st.plotly_chart(fig, use_container_width=True)

                    # ADF (null: unit root) and KPSS (null: stationary) together
                    # give a cross-check on stationarity.
                    st.markdown("### 📊 Stationarity Tests")

                    col1, col2 = st.columns(2)

                    with col1:
                        adf_result = adfuller(ts_df[value_col].dropna())
                        st.write("**Augmented Dickey-Fuller Test**")
                        st.write(f"ADF Statistic: {adf_result[0]:.4f}")
                        st.write(f"p-value: {adf_result[1]:.4f}")
                        st.write(f"Critical values:")
                        for key, value in adf_result[4].items():
                            st.write(f"  {key}: {value:.4f}")

                        if adf_result[1] < 0.05:
                            st.success("✅ Series is stationary")
                        else:
                            st.warning("⚠️ Series is non-stationary")

                    with col2:
                        kpss_result = kpss(ts_df[value_col].dropna(), regression='c')
                        st.write("**KPSS Test**")
                        st.write(f"KPSS Statistic: {kpss_result[0]:.4f}")
                        st.write(f"p-value: {kpss_result[1]:.4f}")
                        st.write(f"Critical values:")
                        for key, value in kpss_result[3].items():
                            st.write(f"  {key}: {value:.4f}")

                        if kpss_result[1] < 0.05:
                            st.warning("⚠️ Series is non-stationary")
                        else:
                            st.success("✅ Series is stationary")

                    st.markdown("### 📈 ACF and PACF Plots")

                    lags = st.slider("Number of lags", 10, 50, 20)

                    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
                    plot_acf(ts_df[value_col].dropna(), lags=lags, ax=ax1)
                    plot_pacf(ts_df[value_col].dropna(), lags=lags, ax=ax2)
                    plt.tight_layout()
                    st.pyplot(fig)

                    st.markdown("### 🔮 Simple Forecasting")

                    forecast_periods = st.slider("Forecast periods", 1, 30, 10)

                    from statsmodels.tsa.holtwinters import ExponentialSmoothing

                    # Holt-Winters additive trend + seasonality forecast.
                    model = ExponentialSmoothing(ts_df[value_col],
                                                 seasonal_periods=period,
                                                 trend='add', seasonal='add')
                    fitted_model = model.fit()
                    forecast = fitted_model.forecast(forecast_periods)

                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=ts_df.index, y=ts_df[value_col],
                                             mode='lines', name='Historical'))
                    fig.add_trace(go.Scatter(x=forecast.index, y=forecast,
                                             mode='lines+markers', name='Forecast',
                                             line=dict(color='red')))
                    fig.update_layout(title=f"Exponential Smoothing Forecast ({forecast_periods} periods)")
                    st.plotly_chart(fig, use_container_width=True)

            else:
                st.info("ℹ️ Need both datetime and numeric columns for time series analysis")

        except Exception as e:
            st.error(f"❌ Error in time series analysis: {str(e)}")
            st.info("💡 Tip: Ensure your date column is properly formatted as datetime")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab6:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🎲 Probability & Sampling Analysis")

        try:
            if numeric_cols:
                col = st.selectbox("Select column for probability analysis", numeric_cols, key="prob_col")
                data = df[col].dropna()

                if len(data) > 0:
                    st.markdown("### 📊 Probability Distribution Fitting")

                    # Empirical CDF: step function over the sorted sample.
                    sorted_data = np.sort(data)
                    ecdf = np.arange(1, len(sorted_data)+1) / len(sorted_data)

                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=sorted_data, y=ecdf,
                                             mode='lines', name='Empirical CDF'))

                    # Overlay fitted theoretical CDFs for comparison.
                    dist_options = ['Normal', 'Exponential', 'Gamma', 'Log-normal']
                    selected_dist = st.multiselect("Select distributions to compare", dist_options, default=['Normal'])

                    # Display name -> scipy distribution object.
                    dist_map = {
                        'Normal': stats.norm,
                        'Exponential': stats.expon,
                        'Gamma': stats.gamma,
                        'Log-normal': stats.lognorm
                    }

                    colors = ['red', 'green', 'blue', 'orange']
                    for i, dist_name in enumerate(selected_dist):
                        dist_obj = dist_map[dist_name]
                        params = dist_obj.fit(data)
                        theoretical_cdf = dist_obj.cdf(sorted_data, *params)

                        fig.add_trace(go.Scatter(x=sorted_data, y=theoretical_cdf,
                                                 mode='lines', name=f'{dist_name} CDF',
                                                 line=dict(color=colors[i], dash='dash')))

                    fig.update_layout(title="CDF Comparison: Empirical vs Theoretical",
                                      xaxis_title=col, yaxis_title="Cumulative Probability")
                    st.plotly_chart(fig, use_container_width=True)

                    st.markdown("### 📈 Goodness of Fit Tests")

                    # Scipy name of each distribution for kstest.
                    ks_names = {'Normal': 'norm', 'Exponential': 'expon',
                                'Gamma': 'gamma', 'Log-normal': 'lognorm'}
                    for dist_name in selected_dist:
                        dist_obj = dist_map[dist_name]
                        ks_stat, ks_p = stats.kstest(data, ks_names[dist_name], args=dist_obj.fit(data))

                        st.write(f"**{dist_name} Distribution**")
                        st.write(f"KS Statistic: {ks_stat:.4f}")
                        st.write(f"P-value: {ks_p:.4f}")

                        if ks_p < 0.05:
                            st.error(f"❌ Data does NOT follow {dist_name} distribution")
                        else:
                            st.success(f"✅ Data may follow {dist_name} distribution")

                    st.markdown("### 🎯 Sampling Analysis")

                    sample_size = st.slider("Sample size", 10, min(500, len(data)), 100)
                    n_samples = st.slider("Number of samples", 10, 1000, 100)

                    # Bootstrap the sampling distribution of the mean.
                    bootstrap_means = []
                    for i in range(n_samples):
                        sample = np.random.choice(data, sample_size, replace=True)
                        bootstrap_means.append(sample.mean())

                    bootstrap_means = np.array(bootstrap_means)

                    fig = make_subplots(rows=1, cols=2,
                                        subplot_titles=("Sampling Distribution of Mean",
                                                        "Confidence Intervals"))

                    fig.add_trace(go.Histogram(x=bootstrap_means, nbinsx=30,
                                               name="Sample Means"), row=1, col=1)

                    # Percentile-based 95% bootstrap confidence interval.
                    ci_lower = np.percentile(bootstrap_means, 2.5)
                    ci_upper = np.percentile(bootstrap_means, 97.5)

                    fig.add_trace(go.Scatter(x=[ci_lower, ci_lower], y=[0, 10],
                                             mode='lines', name='95% CI Lower',
                                             line=dict(color='red', dash='dash')), row=1, col=1)
                    fig.add_trace(go.Scatter(x=[ci_upper, ci_upper], y=[0, 10],
                                             mode='lines', name='95% CI Upper',
                                             line=dict(color='red', dash='dash')), row=1, col=1)

                    # Show up to 20 per-sample standard-error bars.
                    for i in range(min(20, n_samples)):
                        sample_mean = bootstrap_means[i]
                        fig.add_trace(go.Scatter(x=[i, i], y=[sample_mean - data.std()/np.sqrt(sample_size),
                                                              sample_mean + data.std()/np.sqrt(sample_size)],
                                                 mode='lines', line=dict(color='blue', width=1),
                                                 showlegend=False), row=1, col=2)
                        fig.add_trace(go.Scatter(x=[i], y=[sample_mean],
                                                 mode='markers', marker=dict(color='red', size=5),
                                                 showlegend=False), row=1, col=2)

                    fig.update_layout(height=500, title="Bootstrap Sampling Analysis")
                    st.plotly_chart(fig, use_container_width=True)

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Population Mean", f"{data.mean():.4f}")
                    with col2:
                        st.metric("Mean of Sample Means", f"{bootstrap_means.mean():.4f}")
                    with col3:
                        st.metric("Standard Error", f"{bootstrap_means.std():.4f}")

                    st.write(f"**95% Confidence Interval:** [{ci_lower:.4f}, {ci_upper:.4f}]")

        except Exception as e:
            st.error(f"❌ Error in probability analysis: {str(e)}")
            st.info("💡 Tip: Ensure you have sufficient data for probability analysis")

        st.markdown('</div>', unsafe_allow_html=True)

    # Export options
    st.markdown("---")
    st.markdown("### 📥 Export Statistical Report")

    try:
        report_text = f"""
STATISTICAL ANALYSIS REPORT
===========================

Dataset Information:
• Total Rows: {df.shape[0]:,}
• Total Columns: {df.shape[1]}
• Numeric Columns: {len(numeric_cols)}
• Categorical Columns: {len(categorical_cols)}
• Datetime Columns: {len(datetime_cols)}

Summary Statistics:
{df[numeric_cols].describe().to_string()}

Analysis Performed:
• Descriptive Statistics
• Correlation Analysis
• Hypothesis Testing
• Distribution Analysis
• Time Series Analysis (if applicable)
• Probability & Sampling Analysis
"""

        st.download_button(
            label="📥 Download Complete Statistical Report",
            data=report_text,
            file_name="statistical_analysis_report.txt",
            mime="text/plain",
            use_container_width=True
        )
    except Exception as e:
        st.error(f"❌ Error generating report: {str(e)}")
utils.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+
5
def detect_column_types(df):
    """
    Detect and return the dataset's column names grouped by type.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.

    Returns
    -------
    tuple[list, list, list, list]
        (numeric, categorical, datetime, boolean) lists of column names,
        each in the DataFrame's column order.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # 'datetime64' alone only matches timezone-naive columns; include
    # 'datetimetz' as well so tz-aware columns (datetime64[ns, tz]) are
    # detected too. Backward-compatible for naive-only datasets.
    datetime = df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
    boolean = df.select_dtypes(include=['bool']).columns.tolist()

    return numeric, categorical, datetime, boolean
15
+
16
def get_basic_stats(df):
    """
    Return basic statistics about the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to summarize.

    Returns
    -------
    dict
        Keys: 'rows', 'columns', 'missing_values', 'missing_percentage',
        'duplicates', 'memory_usage' (in MB).
    """
    total_cells = df.shape[0] * df.shape[1]
    missing = int(df.isnull().sum().sum())
    stats = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'missing_values': missing,
        # Guard against ZeroDivisionError for an empty DataFrame
        # (0 rows and/or 0 columns -> 0 total cells).
        'missing_percentage': (missing / total_cells) * 100 if total_cells else 0.0,
        'duplicates': int(df.duplicated().sum()),
        'memory_usage': df.memory_usage(deep=True).sum() / 1024**2  # MB
    }
    return stats
29
+
30
def suggest_visualizations(df):
    """
    Suggest appropriate visualizations based on the dataset's column types.

    Returns a list of dicts, each with a 'type', a human-readable
    'description', and (for most suggestions) the 'columns' to plot.
    """
    # Classify columns by dtype (boolean columns are not used here).
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime = df.select_dtypes(include=['datetime64']).columns.tolist()

    suggestions = []

    if numeric:
        suggestions.append({
            'type': 'histogram',
            'description': 'Distribution of numeric columns',
            'columns': numeric[:3],
        })

    if categorical:
        suggestions.append({
            'type': 'bar_chart',
            'description': 'Category distributions',
            'columns': categorical[:3],
        })

    if len(numeric) >= 2:
        suggestions.append({
            'type': 'scatter_plot',
            'description': 'Relationship between numeric variables',
            'columns': numeric[:2],
        })

    if datetime and numeric:
        suggestions.append({
            'type': 'line_chart',
            'description': 'Time series trends',
            'columns': [datetime[0], numeric[0]],
        })

    if len(numeric) > 1:
        # Heatmap has no fixed column subset: it uses all numeric columns.
        suggestions.append({
            'type': 'correlation_heatmap',
            'description': 'Correlations between numeric variables',
        })

    return suggestions
73
+
74
def format_number(num):
    """Format a number with thousands separators; NaN/None renders as "N/A"."""
    return "N/A" if pd.isna(num) else f"{num:,.0f}"
81
+
82
def format_percentage(num):
    """Format a number as a percentage with one decimal; NaN/None renders as "N/A"."""
    if pd.isna(num):
        return "N/A"
    return "{:.1f}%".format(num)
89
+
90
def get_data_quality_issues(df):
    """
    Identify data quality issues in the dataset.

    Returns a list of issue dicts; each has at least 'type', 'severity'
    and 'description'. Extra keys ('columns', 'count', 'column',
    'outlier_count') depend on the issue type.
    """
    issues = []
    n_rows = len(df)

    # Missing values: one summary issue covering all affected columns.
    cols_with_nulls = df.columns[df.isnull().any()].tolist()
    if cols_with_nulls:
        total_missing = df.isnull().sum().sum()
        issues.append({
            'type': 'missing_values',
            'severity': 'high' if total_missing > n_rows * 0.1 else 'medium',
            'description': f'Missing values in {len(cols_with_nulls)} columns',
            'columns': cols_with_nulls,
        })

    # Exact duplicate rows.
    dup_count = df.duplicated().sum()
    if dup_count > 0:
        issues.append({
            'type': 'duplicates',
            'severity': 'medium' if dup_count > n_rows * 0.05 else 'low',
            'description': f'{dup_count} duplicate rows found',
            'count': dup_count,
        })

    # Columns holding a single distinct value carry no information.
    single_valued = [c for c in df.columns if df[c].nunique() == 1]
    if single_valued:
        issues.append({
            'type': 'constant_columns',
            'severity': 'low',
            'description': f'{len(single_valued)} constant columns found',
            'columns': single_valued,
        })

    # IQR-based outlier scan over numeric columns; only the first column
    # with >10% outliers is reported (matches the original behavior).
    for col in df.select_dtypes(include=[np.number]).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        spread = q3 - q1
        flagged = df[(df[col] < q1 - 1.5 * spread) | (df[col] > q3 + 1.5 * spread)]
        if len(flagged) > n_rows * 0.1:
            issues.append({
                'type': 'outliers',
                'severity': 'medium',
                'description': f'Significant outliers in {col}',
                'column': col,
                'outlier_count': len(flagged),
            })
            break  # Just report first outlier issue

    return issues
144
+
145
def get_recommendations(df):
    """
    Generate data analysis recommendations for the dataset.

    Returns a list of recommendation strings driven by the dataset's
    column-type mix, missing-data presence, and size.
    """
    # Classify columns by dtype (boolean columns are not used here).
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime = df.select_dtypes(include=['datetime64']).columns.tolist()

    # (condition, message) pairs evaluated eagerly; order defines output order.
    checks = [
        (df.isnull().sum().sum() > 0,
         "Consider handling missing values using imputation or removal"),
        (len(numeric) >= 2,
         "Create interaction features between highly correlated variables"),
        (bool(datetime),
         "Extract time-based features (hour, day, month, year) from datetime columns"),
        (len(numeric) > 5,
         "Consider dimensionality reduction techniques (PCA, t-SNE)"),
        (df.shape[0] > 10000,
         "Dataset is large - consider sampling for faster exploration"),
        (len(numeric) > 2,
         "Use pair plots to visualize relationships between multiple variables"),
        (len(categorical) > 1,
         "Create contingency tables to analyze categorical relationships"),
    ]

    return [message for hit, message in checks if hit]
179
+
180
def create_sample_dataset():
    """
    Create a reproducible synthetic dataset for testing the app.

    The result has 1000 base rows plus 10 appended duplicates, with ~5%
    of cells blanked out so missing-value handling can be exercised.
    The RNG is seeded, so every call returns the identical dataset.
    """
    np.random.seed(42)  # fixed seed -> deterministic output
    n_rows = 1000

    # NOTE: the dict values below draw from np.random in this exact order,
    # which must be preserved to keep the seeded output stable.
    frame = pd.DataFrame({
        'id': range(n_rows),
        'age': np.random.normal(40, 15, n_rows).clip(18, 90).astype(int),
        'income': np.random.normal(50000, 20000, n_rows).clip(20000, 150000).astype(int),
        'score': np.random.uniform(0, 100, n_rows).round(2),
        'category': np.random.choice(['A', 'B', 'C', 'D'], n_rows),
        'region': np.random.choice(['North', 'South', 'East', 'West'], n_rows),
        'purchased': np.random.choice([0, 1], n_rows, p=[0.7, 0.3]),
        'signup_date': pd.date_range('2023-01-01', periods=n_rows, freq='D'),
        'satisfaction': np.random.choice([1, 2, 3, 4, 5], n_rows, p=[0.1, 0.15, 0.3, 0.25, 0.2]),
    })

    # Blank out roughly 5% of all cells (any column may be affected).
    frame = frame.mask(np.random.random(frame.shape) < 0.05)

    # Append 10 duplicated rows so duplicate detection has work to do.
    dup_idx = np.random.choice(n_rows, 10, replace=False)
    frame = pd.concat([frame, frame.iloc[dup_idx]]).reset_index(drop=True)

    return frame
visualization.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import plotly.express as px
3
+ import plotly.graph_objects as go
4
+ from plotly.subplots import make_subplots
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
def auto_visualizations(df):
    """Render the interactive visualization page for the given DataFrame.

    Streamlit UI: the user picks a visualization family from a selectbox,
    then the matching branch builds Plotly charts from the dataset's
    numeric / categorical / datetime columns. Renders widgets and charts
    as a side effect; returns None.

    NOTE(review): indentation reconstructed from a diff view that stripped
    leading whitespace — branch nesting inferred from message semantics;
    confirm against the original file.
    """
    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>📊 Interactive Data Visualization</h2>
        <p style='color: gray;'>Create beautiful, interactive visualizations with just a few clicks</p>
    </div>
    """, unsafe_allow_html=True)

    # Get column types (tz-aware datetime columns are not matched by 'datetime64')
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    date_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    # Visualization type selector — drives which branch below renders
    viz_type = st.selectbox(
        "🎨 Select Visualization Type",
        ["Distribution Plots", "Categorical Plots", "Relationship Plots",
         "Time Series Plots", "Statistical Plots", "Advanced Plots"]
    )

    if viz_type == "Distribution Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Distribution Plots")

        if num_cols:
            # Create tabs for different distribution plots
            dist_tab1, dist_tab2, dist_tab3 = st.tabs(["Histogram", "Box Plot", "Violin Plot"])

            with dist_tab1:
                col1, col2 = st.columns(2)
                with col1:
                    hist_col = st.selectbox("Select column", num_cols, key="hist")
                with col2:
                    bins = st.slider("Number of bins", 5, 100, 30)

                # Histogram with a marginal box plot on top
                fig = px.histogram(df, x=hist_col, nbins=bins,
                                   title=f"Distribution of {hist_col}",
                                   marginal="box", opacity=0.7)
                fig.update_layout(showlegend=False)
                st.plotly_chart(fig, use_container_width=True)

            with dist_tab2:
                if cat_cols:
                    box_col = st.selectbox("Numeric column", num_cols, key="box_num")
                    box_cat = st.selectbox("Category column (optional)", ["None"] + cat_cols, key="box_cat")

                    # Ungrouped box plot unless a category was chosen
                    if box_cat == "None":
                        fig = px.box(df, y=box_col, title=f"Box Plot of {box_col}")
                    else:
                        fig = px.box(df, x=box_cat, y=box_col, title=f"{box_col} by {box_cat}")

                    st.plotly_chart(fig, use_container_width=True)
                else:
                    st.info("Add categorical columns to create grouped box plots")

            with dist_tab3:
                # Violin plot needs both a numeric and a categorical column
                if cat_cols:
                    violin_col = st.selectbox("Numeric column", num_cols, key="violin_num")
                    violin_cat = st.selectbox("Category column", cat_cols, key="violin_cat")

                    fig = px.violin(df, x=violin_cat, y=violin_col,
                                    box=True, points="all",
                                    title=f"Violin Plot of {violin_col} by {violin_cat}")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No numeric columns available for distribution plots")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Categorical Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Categorical Plots")

        if cat_cols:
            # Create tabs for categorical plots
            cat_tab1, cat_tab2, cat_tab3 = st.tabs(["Bar Chart", "Pie Chart", "Sunburst Chart"])

            with cat_tab1:
                bar_col = st.selectbox("Select categorical column", cat_cols, key="bar")

                # Get value counts
                value_counts = df[bar_col].value_counts().reset_index()
                value_counts.columns = [bar_col, 'count']

                # Color option (only offered when numeric columns exist)
                if num_cols:
                    color_by = st.selectbox("Color by (optional)", ["None"] + num_cols, key="bar_color")
                else:
                    color_by = "None"

                if color_by == "None":
                    fig = px.bar(value_counts, x=bar_col, y='count',
                                 title=f"Distribution of {bar_col}",
                                 color_discrete_sequence=['#636EFA'])
                else:
                    # Aggregate numeric column by category (mean per group)
                    agg_data = df.groupby(bar_col)[color_by].mean().reset_index()
                    fig = px.bar(agg_data, x=bar_col, y=color_by,
                                 title=f"Average {color_by} by {bar_col}",
                                 color=bar_col)

                fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(fig, use_container_width=True)

            with cat_tab2:
                pie_col = st.selectbox("Select column for pie chart", cat_cols, key="pie")

                # Limit to top N categories for readability
                top_n = st.slider("Show top N categories", 3, 20, 10)
                value_counts = df[pie_col].value_counts().head(top_n)

                fig = px.pie(values=value_counts.values, names=value_counts.index,
                             title=f"Proportion of {pie_col} (Top {top_n})",
                             hole=0.3)
                fig.update_traces(textposition='inside', textinfo='percent+label')
                st.plotly_chart(fig, use_container_width=True)

            with cat_tab3:
                if len(cat_cols) >= 2:
                    st.markdown("**Hierarchical View**")
                    path = st.multiselect("Select hierarchy (order matters)",
                                          cat_cols, default=cat_cols[:2])

                    if len(path) >= 2:
                        fig = px.sunburst(df, path=path,
                                          title="Hierarchical Distribution")
                        st.plotly_chart(fig, use_container_width=True)
                else:
                    st.info("Need at least 2 categorical columns for sunburst chart")
        else:
            st.warning("No categorical columns available")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Relationship Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🔄 Relationship Plots")

        if len(num_cols) >= 2:
            rel_tab1, rel_tab2, rel_tab3 = st.tabs(["Scatter Plot", "Line Plot", "Heatmap"])

            with rel_tab1:
                col1, col2, col3 = st.columns(3)
                with col1:
                    x_col = st.selectbox("X axis", num_cols, key="scatter_x")
                with col2:
                    y_col = st.selectbox("Y axis", [c for c in num_cols if c != x_col], key="scatter_y")
                with col3:
                    color_col = st.selectbox("Color by", ["None"] + cat_cols + num_cols, key="scatter_color")

                size_col = st.selectbox("Size by (optional)", ["None"] + num_cols, key="scatter_size")

                # Create scatter plot — one branch per color/size combination.
                # NOTE(review): trendline="ols" requires statsmodels at runtime — confirm it is in requirements.
                if color_col == "None" and size_col == "None":
                    fig = px.scatter(df, x=x_col, y=y_col,
                                     title=f"{y_col} vs {x_col}",
                                     trendline="ols")
                elif color_col != "None" and size_col == "None":
                    fig = px.scatter(df, x=x_col, y=y_col, color=color_col,
                                     title=f"{y_col} vs {x_col} colored by {color_col}",
                                     trendline="ols")
                elif color_col == "None" and size_col != "None":
                    fig = px.scatter(df, x=x_col, y=y_col, size=size_col,
                                     title=f"{y_col} vs {x_col} sized by {size_col}",
                                     trendline="ols")
                else:
                    fig = px.scatter(df, x=x_col, y=y_col, color=color_col, size=size_col,
                                     title=f"{y_col} vs {x_col}",
                                     trendline="ols")

                st.plotly_chart(fig, use_container_width=True)

            with rel_tab2:
                col1, col2 = st.columns(2)
                with col1:
                    line_x = st.selectbox("X axis (usually time)", num_cols + date_cols, key="line_x")
                with col2:
                    line_y = st.selectbox("Y axis", num_cols, key="line_y")

                line_color = st.selectbox("Color by", ["None"] + cat_cols, key="line_color")

                if line_color == "None":
                    fig = px.line(df, x=line_x, y=line_y,
                                  title=f"{line_y} over {line_x}")
                else:
                    fig = px.line(df, x=line_x, y=line_y, color=line_color,
                                  title=f"{line_y} over {line_x} by {line_color}")

                st.plotly_chart(fig, use_container_width=True)

            with rel_tab3:
                # Correlation heatmap
                corr_matrix = df[num_cols].corr()

                # Mask for upper triangle: zeroes out cells above the
                # diagonal so only the lower triangle is displayed
                mask = np.triu(np.ones_like(corr_matrix), k=1)
                masked_corr = corr_matrix * (1 - mask)

                fig = px.imshow(masked_corr,
                                text_auto=True,
                                aspect="auto",
                                color_continuous_scale='RdBu_r',
                                title="Correlation Heatmap",
                                zmin=-1, zmax=1)

                st.plotly_chart(fig, use_container_width=True)

                # Show strongest correlations (top 5 by absolute value)
                st.markdown("**Strongest Correlations:**")
                corr_pairs = []
                for i in range(len(num_cols)):
                    for j in range(i+1, len(num_cols)):
                        corr_pairs.append((num_cols[i], num_cols[j],
                                           corr_matrix.iloc[i, j]))

                corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)

                for col1, col2, corr in corr_pairs[:5]:
                    strength = "🟢" if abs(corr) > 0.7 else "🟡" if abs(corr) > 0.3 else "🔴"
                    st.write(f"{strength} **{col1}** & **{col2}**: {corr:.3f}")
        else:
            st.warning("Need at least 2 numeric columns for relationship plots")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Time Series Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📅 Time Series Plots")

        if date_cols:
            ts_tab1, ts_tab2 = st.tabs(["Time Series", "Resampling"])

            with ts_tab1:
                date_col = st.selectbox("Date column", date_cols, key="ts_date")
                value_col = st.selectbox("Value column", num_cols if num_cols else [], key="ts_value")

                if num_cols and date_col:
                    # Sort by date so the line renders chronologically
                    df_sorted = df.sort_values(date_col)

                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted[value_col],
                                             mode='lines+markers', name=value_col))

                    fig.update_layout(title=f"{value_col} over Time",
                                      xaxis_title="Date",
                                      yaxis_title=value_col)

                    st.plotly_chart(fig, use_container_width=True)

            with ts_tab2:
                if num_cols and date_cols:
                    date_col = st.selectbox("Select date column", date_cols, key="resample_date")
                    resample_col = st.selectbox("Select column to resample", num_cols, key="resample_col")

                    freq = st.selectbox("Resampling frequency",
                                        ["Daily", "Weekly", "Monthly", "Quarterly", "Yearly"])

                    # Map UI labels to pandas resample offset aliases
                    freq_map = {
                        "Daily": "D",
                        "Weekly": "W",
                        "Monthly": "M",
                        "Quarterly": "Q",
                        "Yearly": "Y"
                    }

                    # Set date as index (resample requires a datetime index)
                    df_date = df.set_index(date_col)

                    # Resample: mean per period
                    resampled = df_date[resample_col].resample(freq_map[freq]).mean().reset_index()

                    fig = px.line(resampled, x=date_col, y=resample_col,
                                  title=f"{resample_col} ({freq} Aggregated)")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No datetime columns found. Convert a column to datetime first.")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Statistical Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📐 Statistical Plots")

        if num_cols:
            stat_tab1, stat_tab2, stat_tab3 = st.tabs(["QQ Plot", "ECDF", "Density Heatmap"])

            with stat_tab1:
                qq_col = st.selectbox("Select column for QQ plot", num_cols, key="qq")

                # Calculate quantiles — theoretical side is drawn from a
                # freshly sampled standard normal (unseeded, so the QQ plot
                # varies slightly between reruns)
                data = df[qq_col].dropna()
                theoretical_quantiles = np.percentile(np.random.normal(0, 1, len(data)),
                                                      np.linspace(0, 100, len(data)))
                sample_quantiles = np.percentile(data, np.linspace(0, 100, len(data)))

                fig = go.Figure()
                fig.add_trace(go.Scatter(x=theoretical_quantiles, y=sample_quantiles,
                                         mode='markers', name='Data'))

                # Add diagonal line (perfect-normality reference)
                min_val = min(theoretical_quantiles.min(), sample_quantiles.min())
                max_val = max(theoretical_quantiles.max(), sample_quantiles.max())
                fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
                                         mode='lines', name='Normal',
                                         line=dict(color='red', dash='dash')))

                fig.update_layout(title=f"QQ Plot - {qq_col}",
                                  xaxis_title="Theoretical Quantiles",
                                  yaxis_title="Sample Quantiles")

                st.plotly_chart(fig, use_container_width=True)

            with stat_tab2:
                ecdf_col = st.selectbox("Select column for ECDF", num_cols, key="ecdf")

                fig = px.ecdf(df, x=ecdf_col,
                              title=f"Empirical Cumulative Distribution - {ecdf_col}")
                st.plotly_chart(fig, use_container_width=True)

            with stat_tab3:
                if len(num_cols) >= 2:
                    x_col = st.selectbox("X axis", num_cols, key="density_x")
                    y_col = st.selectbox("Y axis", [c for c in num_cols if c != x_col], key="density_y")

                    fig = px.density_heatmap(df, x=x_col, y=y_col,
                                             title=f"Density Heatmap: {y_col} vs {x_col}",
                                             marginal_x="histogram",
                                             marginal_y="histogram")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No numeric columns available for statistical plots")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Advanced Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🚀 Advanced Visualizations")

        adv_tab1, adv_tab2, adv_tab3 = st.tabs(["3D Scatter", "Parallel Coordinates", "Radar Chart"])

        with adv_tab1:
            if len(num_cols) >= 3:
                col1, col2, col3 = st.columns(3)
                with col1:
                    x_3d = st.selectbox("X axis", num_cols, key="3d_x")
                with col2:
                    y_3d = st.selectbox("Y axis", [c for c in num_cols if c != x_3d], key="3d_y")
                with col3:
                    z_3d = st.selectbox("Z axis", [c for c in num_cols if c not in [x_3d, y_3d]], key="3d_z")

                color_3d = st.selectbox("Color by", ["None"] + cat_cols + num_cols, key="3d_color")

                if color_3d == "None":
                    fig = px.scatter_3d(df, x=x_3d, y=y_3d, z=z_3d,
                                        title=f"3D Scatter: {x_3d}, {y_3d}, {z_3d}")
                else:
                    fig = px.scatter_3d(df, x=x_3d, y=y_3d, z=z_3d, color=color_3d,
                                        title=f"3D Scatter colored by {color_3d}")

                st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("Need at least 3 numeric columns for 3D scatter plot")

        with adv_tab2:
            if num_cols:
                selected_dims = st.multiselect("Select dimensions", num_cols, default=num_cols[:4])

                if selected_dims and len(selected_dims) >= 2:
                    color_dim = st.selectbox("Color dimension", ["None"] + cat_cols + num_cols)

                    if color_dim == "None":
                        fig = px.parallel_coordinates(df, dimensions=selected_dims,
                                                      title="Parallel Coordinates Plot")
                    else:
                        fig = px.parallel_coordinates(df, dimensions=selected_dims,
                                                      color=color_dim,
                                                      title=f"Parallel Coordinates colored by {color_dim}")

                    st.plotly_chart(fig, use_container_width=True)

        with adv_tab3:
            if num_cols:
                st.markdown("**Radar Chart** (requires at least 3 numeric columns)")
                selected_radar = st.multiselect("Select metrics for radar chart",
                                                num_cols, default=num_cols[:3])

                if len(selected_radar) >= 3:
                    # Get first row as sample — the chart shows row 0 only
                    sample = df[selected_radar].iloc[0]

                    fig = go.Figure(data=go.Scatterpolar(
                        r=sample.values,
                        theta=selected_radar,
                        fill='toself'
                    ))

                    fig.update_layout(
                        polar=dict(
                            radialaxis=dict(
                                visible=True,
                                range=[sample.min(), sample.max()]
                            )),
                        showlegend=False,
                        title="Radar Chart (First Row)"
                    )

                    st.plotly_chart(fig, use_container_width=True)

        st.markdown('</div>', unsafe_allow_html=True)

    # Download plot data option — rendered for every visualization type
    st.markdown("---")
    st.markdown("### 💾 Export Options")

    col1, col2 = st.columns(2)
    with col1:
        st.info("To save any plot, hover over it and click the camera icon 📷")
    with col2:
        csv = df.to_csv(index=False)
        st.download_button(
            label="📥 Download Data as CSV",
            data=csv,
            file_name="visualization_data.csv",
            mime="text/csv",
            use_container_width=True
        )