TimeFlowPro

Running

File size: 103,174 Bytes

# ============================================
# TimeFlow Pro - Data Analysis and Preprocessing
# ============================================
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import streamlit as st
import pandas as pd
import numpy as np
import os
import sys
import glob
import re
from datetime import datetime, timedelta
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from PIL import Image
import matplotlib.pyplot as plt
import warnings

from pipeline.main_pipeline import EnhancedDataPreprocessingPipeline

warnings.filterwarnings('ignore')

# Add project path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from config.config import Config
from data_loader.data_loader import DataLoader
from visualization.visualization_manager import VisualisationManager

# ============================================
# PAGE CONFIGURATION
# ============================================
st.set_page_config(
    page_title="TimeFlow Pro - Data Analysis and Preprocessing",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# ============================================
# STATE MANAGEMENT CLASS
# ============================================
class StreamlitApp:
    """Main Streamlit application class"""
    
    def __init__(self):
        self.init_session_state()
        self.config = None
        self.pipeline = None
        self.data = None
        
    def init_session_state(self):
        """Initialise session state"""
        if 'pipeline_completed' not in st.session_state:
            st.session_state.pipeline_completed = False
        if 'processed_data' not in st.session_state:
            st.session_state.processed_data = None
        if 'modeling_data' not in st.session_state:
            st.session_state.modeling_data = None
        if 'current_step' not in st.session_state:
            st.session_state.current_step = 1
        if 'uploaded_file' not in st.session_state:
            st.session_state.uploaded_file = None
        if 'config_params' not in st.session_state:
            st.session_state.config_params = self.get_default_config()
        if 'plots_path' not in st.session_state:
            st.session_state.plots_path = None
        if 'available_plots' not in st.session_state:
            st.session_state.available_plots = {}
        if 'synthetic_data_generated' not in st.session_state:
            st.session_state.synthetic_data_generated = False
        if 'auto_pipeline_ready' not in st.session_state:
            st.session_state.auto_pipeline_ready = False
        if 'quick_test_mode' not in st.session_state:
            st.session_state.quick_test_mode = False
    
    def get_default_config(self):
        """Get default configuration"""
        return {
            'data_path': '',
            'results_dir': 'streamlit_results',
            'target_column': '',
            'start_year': 1970,
            'end_year': 1990,
            'max_lags': 5,
            'seasonal_period': 365,
            'rolling_windows': [7, 30, 90],
            'expanding_windows': [30, 90],
            'test_size': 0.2,
            'validation_size': 0.1,
            'scaling_method': 'robust',
            'feature_selection_method': 'correlation',
            'max_features': 20,
            'missing_threshold': 0.3,
            'outlier_method': 'iqr',
            'enable_validation': True,
            'split_method': 'time_based'
        }
    
    def create_sidebar(self):
        """Create sidebar"""
        with st.sidebar:
            st.title("🎯 TimeFlowPro")
            st.markdown("---")
            
            # Navigation
            st.subheader("Navigation")
            steps = {
                1: "📁 Data Loading",
                2: "⚙️ Configuration",
                3: "🔍 Data Analysis",
                4: "⚡ Pipeline Execution",
                5: "📊 Results",
                6: "📈 Visualisations",
                7: "🤖 Modelling"
            }
            
            for step_num, step_name in steps.items():
                if st.button(
                    f"{step_name}",
                    key=f"nav_{step_num}",
                    type="primary" if st.session_state.current_step == step_num else "secondary",
                    width='stretch'
                ):
                    st.session_state.current_step = step_num
                    st.rerun()
            
            st.markdown("---")
            
            # Quick start with synthetic data
            st.subheader("⚡ Quick Test")
            
            if st.button("🚀 Quick Start with Synthetic Data", 
                        type="primary", 
                        width='stretch',
                        help="Generate synthetic data and run pipeline immediately"):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
            
            st.markdown("---")
            
            # Project information
            st.subheader("📈 About the Project")
            st.info("""
            TimeFlow Pro - Data Analysis and Preprocessing.
            
            **New Features:**
            - Synthetic data generation for testing
            - Automatic pipeline execution
            - Quick testing without file upload
            
            **Standard Features:**
            - Missing data analysis and processing
            - Outlier detection
            - Feature engineering
            - Stationarity analysis
            - Data scaling
            - Feature selection
            """)
            
            # Progress indicator
            if st.session_state.pipeline_completed:
                st.success("✅ Pipeline completed")
            else:
                st.warning("⚠️ Pipeline not started")
            
            # Quick test indicator
            if st.session_state.quick_test_mode:
                st.info("⚡ Quick test mode active")
        
    def generate_synthetic_data(self, n_days=1095, include_seasonality=True, include_trend=True, 
                            include_noise=True, include_exogenous=True, data_type="complex"):
        """
        Generate synthetic data for testing
        
        Args:
            n_days (int): Number of days of data
            include_seasonality (bool): Include seasonality
            include_trend (bool): Include trend
            include_noise (bool): Include noise
            include_exogenous (bool): Include exogenous variables
            data_type (str): Data type (simple, medium, complex)
            
        Returns:
            pd.DataFrame: Generated synthetic data
        """
        try:
            # Base parameters depending on data type
            if data_type == "simple":
                n_days = min(n_days, 365)  # Limit for simple type
                trend_strength = 0.005
                noise_std = 2
                include_exogenous = False
            elif data_type == "medium":
                n_days = min(n_days, 730)  # Limit for medium type
                trend_strength = 0.01
                noise_std = 5
                include_exogenous = True
            else:  # complex
                n_days = min(n_days, 1095)  # Limit for complex type
                trend_strength = 0.02
                noise_std = 10
                include_exogenous = True
            
            # Create dates
            start_date = datetime.now() - timedelta(days=n_days)
            dates = pd.date_range(start=start_date, periods=n_days, freq='D')
                        
            # Base trend
            if include_trend:
                trend = np.linspace(0, trend_strength * n_days, n_days)
            else:
                trend = np.zeros(n_days)
            
            # Seasonality
            if include_seasonality:
                # Annual seasonality
                seasonal = 10 * np.sin(2 * np.pi * np.arange(n_days) / 365)
                # Quarterly seasonality
                seasonal += 5 * np.sin(2 * np.pi * np.arange(n_days) / 90)
                # Monthly seasonality
                seasonal += 3 * np.sin(2 * np.pi * np.arange(n_days) / 30)
                # Weekly seasonality
                seasonal += 2 * np.sin(2 * np.pi * np.arange(n_days) / 7)
            else:
                seasonal = np.zeros(n_days)
            
            # Main target variable (water consumption)
            base_value = 100
            raskhodvoda = base_value + trend + seasonal
            
            # Add noise
            if include_noise:
                noise = np.random.normal(0, noise_std, n_days)
                raskhodvoda += noise
            
            # Create DataFrame
            data = pd.DataFrame({
                'date': dates,
                'raskhodvoda': raskhodvoda
            })
            
            # Add exogenous variables
            if include_exogenous:
                # Temperature (seasonal)
                data['temperature'] = 15 + 10 * np.sin(2 * np.pi * np.arange(n_days) / 365) + np.random.normal(0, 3, n_days)
                
                # Precipitation (random spikes)
                precipitation = np.random.exponential(2, n_days)
                # Add seasonality to precipitation
                precipitation_seasonality = 5 * np.sin(2 * np.pi * np.arange(n_days) / 365 + np.pi/2)
                data['precipitation'] = np.maximum(0, precipitation + precipitation_seasonality)
                
                # Pressure
                data['pressure'] = 760 + np.random.normal(0, 5, n_days)
                
                # Humidity
                data['humidity'] = 60 + 20 * np.sin(2 * np.pi * np.arange(n_days) / 180) + np.random.normal(0, 10, n_days)
                
                # Electricity consumption (correlated with target variable)
                data['electricity_consumption'] = raskhodvoda * 0.8 + np.random.normal(0, 5, n_days)
                
                # Day of week (categorical variable)
                data['day_of_week'] = dates.dayofweek
                data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)
                
                # Holidays (random)
                holidays = np.random.choice([0, 1], size=n_days, p=[0.95, 0.05])
                data['is_holiday'] = holidays
                
                # Lag variables
                for lag in [1, 7, 30]:
                    data[f'raskhodvoda_lag_{lag}'] = data['raskhodvoda'].shift(lag)
                
                # Moving averages
                for window in [7, 30]:
                    data[f'raskhodvoda_ma_{window}'] = data['raskhodvoda'].rolling(window=window).mean()
            
            # Add missing values for realism (5% random missing values)
            # CORRECTION: proper creation of missing value mask
            for col in data.columns:
                if col != 'date':  # Don't add missing values to dates
                    mask = np.random.random(len(data)) < 0.05
                    data.loc[mask, col] = np.nan
            
            # Add outliers (1% of data)
            # CORRECTION: proper creation of outlier mask
            numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
            for col in numeric_cols:
                outlier_mask = np.random.random(len(data)) < 0.01
                if outlier_mask.any():
                    # Find outlier indices
                    outlier_indices = data.index[outlier_mask]
                    for idx in outlier_indices:
                        if col in data.columns:
                            mean_val = data[col].mean(skipna=True)
                            std_val = data[col].std(skipna=True)
                            if not np.isnan(mean_val) and not np.isnan(std_val) and std_val > 0:
                                outlier_value = mean_val + 5 * std_val * np.random.choice([-1, 1])
                                data.at[idx, col] = outlier_value
            
            # Reset index
            data.reset_index(drop=True, inplace=True)
            
            st.session_state.synthetic_data_generated = True
            return data
            
        except Exception as e:
            st.error(f"Error generating synthetic data: {str(e)}")
            import traceback
            st.error(f"Error traceback: {traceback.format_exc()}")
            return None
        
    def quick_test_pipeline(self):
        """Quick pipeline execution with synthetic data"""
        with st.spinner("🚀 Running quick test with synthetic data..."):
            try:
                # Step 1: Generate synthetic data
                st.info("Step 1: Generating synthetic data...")
                synthetic_data = self.generate_synthetic_data(
                    n_days=365,  # Reduced for speed
                    include_seasonality=True,
                    include_trend=True,
                    include_noise=True,
                    include_exogenous=True,
                    data_type="medium"  # Changed to medium for balance between speed and quality
                )
                
                if synthetic_data is None:
                    st.error("Failed to generate synthetic data")
                    return
                
                # Save data to temporary file
                temp_file = "temp_synthetic_data.csv"
                synthetic_data.to_csv(temp_file, index=False)
                
                # Step 2: Configure settings
                st.info("Step 2: Configuring settings...")
                config_params = st.session_state.config_params.copy()
                config_params.update({
                    'data_path': temp_file,
                    'target_column': 'raskhodvoda',
                    'start_year': 2020,
                    'end_year': 2023,
                    'max_lags': 7,
                    'seasonal_period': 365,
                    'rolling_windows': [7, 30],
                    'expanding_windows': [30],
                    'test_size': 0.2,
                    'validation_size': 0.1,
                    'scaling_method': 'robust',
                    'feature_selection_method': 'correlation',
                    'max_features': 10,  # Reduced for speed
                    'missing_threshold': 0.3,
                    'outlier_method': 'iqr',
                    'enable_validation': True,
                    'split_method': 'time_based'
                })
                
                # Step 3: Create and run pipeline
                st.info("Step 3: Creating and running pipeline...")
                
                # Create progress bar
                progress_bar = st.progress(0)
                status_text = st.empty()
                
                # Update configuration
                st.session_state.config_params = config_params
                st.session_state.uploaded_file = temp_file
                st.session_state.data_preview = synthetic_data
                
                # Create configuration
                status_text.text("Creating configuration...")
                progress_bar.progress(20)
                
                config = Config(**config_params)
                
                # Create pipeline
                status_text.text("Initialising pipeline...")
                progress_bar.progress(40)
                
                self.pipeline = EnhancedDataPreprocessingPipeline(config)
                
                # Run pipeline
                status_text.text("Running preprocessing pipeline...")
                progress_bar.progress(60)
                
                processed_data = self.pipeline.run_full_pipeline(
                    use_synthetic=False,  # Synthetic data already loaded
                    save_intermediate=True,
                    create_reports=True
                )
                
                # Update progress
                if processed_data is not None:
                    status_text.text("Getting data for modelling...")
                    progress_bar.progress(80)
                    
                    modeling_data = self.pipeline.get_final_data_for_modelling()
                    
                    # Save to session state
                    st.session_state.processed_data = processed_data
                    st.session_state.modeling_data = modeling_data
                    st.session_state.pipeline_completed = True
                    st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                    st.session_state.auto_pipeline_ready = True
                    
                    # Collect information about available plots
                    self.collect_available_plots()
                    
                    # Completion
                    status_text.text("Completing...")
                    progress_bar.progress(100)
                    
                    st.success("✅ Quick test completed successfully!")
                    
                    # Show results
                    col1, col2, col3 = st.columns(3)
                    
                    with col1:
                        st.metric("Records generated", f"{synthetic_data.shape[0]:,}")
                    
                    with col2:
                        st.metric("Processed data", f"{processed_data.shape[0]:,} rows")
                    
                    with col3:
                        st.metric("Final features", f"{processed_data.shape[1]} columns")
                    
                    # Automatic transition to results
                    st.session_state.current_step = 5
                    st.rerun()
                
                else:
                    st.error("❌ Error running pipeline")
                    st.error("Check logs for more information")
            
            except Exception as e:
                st.error(f"❌ Error during quick test: {str(e)}")
                import traceback
                st.error(f"Error traceback: {traceback.format_exc()}")
                

    def render_step_1_data_loading(self):
        """Step 1: Data Loading"""
        st.header("📁 Data Loading")
        
        # Check quick test mode
        if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
            st.info("⚡ Quick test mode activated. Generating synthetic data and running pipeline...")
            self.quick_test_pipeline()
            return
        
        col1, col2 = st.columns([2, 1])
        
        with col1:
            # File upload
            uploaded_file = st.file_uploader(
                "Upload CSV file with data",
                type=['csv', 'xlsx', 'parquet'],
                help="Supported formats: CSV, Excel, Parquet"
            )
            
            if uploaded_file is not None:
                # Save file temporarily
                file_path = f"temp_data.{uploaded_file.name.split('.')[-1]}"
                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                
                st.session_state.uploaded_file = file_path
                st.session_state.config_params['data_path'] = file_path
                
                # Load and preview data
                try:
                    if file_path.endswith('.csv'):
                        data = pd.read_csv(file_path)
                    elif file_path.endswith('.xlsx'):
                        data = pd.read_excel(file_path)
                    elif file_path.endswith('.parquet'):
                        data = pd.read_parquet(file_path)
                    else:
                        st.error("Unsupported file format")
                        return
                    
                    st.session_state.data_preview = data
                    
                    # Data preview
                    st.subheader("Data Preview")
                    st.dataframe(data.head(50), width='stretch')
                    
                    # Basic information
                    st.subheader("📋 Data Information")
                    
                    info_col1, info_col2, info_col3 = st.columns(3)
                    
                    with info_col1:
                        st.metric("Rows", data.shape[0])
                        st.metric("Columns", data.shape[1])
                    
                    with info_col2:
                        numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
                        st.metric("Numeric columns", len(numeric_cols))
                        categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
                        st.metric("Categorical columns", len(categorical_cols))
                    
                    with info_col3:
                        total_missing = data.isnull().sum().sum()
                        missing_percentage = (total_missing / (data.shape[0] * data.shape[1])) * 100
                        st.metric("Missing values", f"{total_missing:,}")
                        st.metric("Missing percentage", f"{missing_percentage:.2f}%")
                    
                    # Automatic target column selection if not set
                    if 'target_column' not in st.session_state.config_params or not st.session_state.config_params['target_column']:
                        numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
                        if numeric_columns:
                            # Automatically select column with typical name
                            target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'расход']
                            selected_target = None
                            
                            for col in numeric_columns:
                                if any(keyword in col.lower() for keyword in target_keywords):
                                    selected_target = col
                                    break
                            
                            # If not found by keywords, take last numeric column
                            if not selected_target and numeric_columns:
                                selected_target = numeric_columns[-1]
                            
                            if selected_target:
                                st.session_state.config_params['target_column'] = selected_target
                                st.info(f"Target variable automatically selected: **{selected_target}**")
                                st.info("You can change it in the next step")
                    
                    # Button to proceed to next step
                    if st.button("➡️ Go to Configuration", type="primary", width='stretch'):
                        st.session_state.current_step = 2
                        st.rerun()
                        
                except Exception as e:
                    st.error(f"Error loading data: {str(e)}")
        
        with col2:
            # Demo data
            st.subheader("🎮 Demo Mode")
            
            demo_option = st.radio(
                "Choose demo data:",
                ["Synthetic Data", "Time Series Example"]
            )
            
            # Synthetic data settings
            with st.expander("⚙️ Synthetic Data Settings", expanded=False):
                data_type = st.selectbox(
                    "Data Type",
                    options=["Simple", "Medium", "Complex"],
                    index=1,
                    help="Simple: 1 year, few features\nMedium: 2 years, main features\nComplex: 3 years, all features"
                )
                
                n_days = st.slider(
                    "Number of days",
                    min_value=90,
                    max_value=1825,
                    value=1095,
                    step=30,
                    help="Number of days in synthetic data"
                )
                
                include_trend = st.checkbox("Include trend", value=True)
                include_seasonality = st.checkbox("Include seasonality", value=True)
                include_noise = st.checkbox("Include noise", value=True)
                include_exogenous = st.checkbox("Include additional features", value=True)
            
            if st.button("Generate and Load Synthetic Data", width='stretch'):
                with st.spinner("Creating synthetic data..."):
                    try:
                        # Data type mapping
                        data_type_map = {
                            "Simple": "simple",
                            "Medium": "medium", 
                            "Complex": "complex"
                        }
                        
                        # Generate synthetic data
                        synthetic_data = self.generate_synthetic_data(
                            n_days=n_days,
                            include_seasonality=include_seasonality,
                            include_trend=include_trend,
                            include_noise=include_noise,
                            include_exogenous=include_exogenous,
                            data_type=data_type_map[data_type]
                        )
                        
                        if synthetic_data is not None:
                            st.session_state.data_preview = synthetic_data
                            st.session_state.uploaded_file = "synthetic_data"
                            st.session_state.config_params['data_path'] = 'synthetic_data'
                            
                            # Automatically select target variable
                            if 'raskhodvoda' in synthetic_data.columns:
                                st.session_state.config_params['target_column'] = 'raskhodvoda'
                            
                            st.success(f"✅ Synthetic data created: {synthetic_data.shape[0]} rows, {synthetic_data.shape[1]} columns")
                            
                            # Show preview
                            st.subheader("Synthetic Data Preview")
                            st.dataframe(synthetic_data.head(20), width='stretch')
                            
                            # Statistics
                            st.subheader("📊 Synthetic Data Statistics")
                            
                            stat_col1, stat_col2 = st.columns(2)
                            
                            with stat_col1:
                                st.metric("Period", f"{synthetic_data.shape[0]} days")
                                # CORRECTION: convert dates to strings for display
                                if 'date' in synthetic_data.columns:
                                    min_date = synthetic_data['date'].min()
                                    max_date = synthetic_data['date'].max()
                                    if isinstance(min_date, (pd.Timestamp, datetime)):
                                        st.text(f"Start: {min_date.strftime('%Y-%m-%d')}")
                                    else:
                                        st.text(f"Start: {str(min_date)}")
                                    
                                    if isinstance(max_date, (pd.Timestamp, datetime)):
                                        st.text(f"End: {max_date.strftime('%Y-%m-%d')}")
                                    else:
                                        st.text(f"End: {str(max_date)}")
                            
                            with stat_col2:
                                if 'raskhodvoda' in synthetic_data.columns:
                                    st.metric("Average consumption", f"{synthetic_data['raskhodvoda'].mean():.2f}")
                                    st.metric("Max consumption", f"{synthetic_data['raskhodvoda'].max():.2f}")
                                    st.metric("Min consumption", f"{synthetic_data['raskhodvoda'].min():.2f}")
                            
                            # Quick pipeline execution
                            st.markdown("---")
                            if st.button("🚀 Quick Run Pipeline with This Data", type="primary", width='stretch'):
                                st.session_state.quick_test_mode = True
                                st.session_state.auto_pipeline_ready = False
                                st.rerun()
                            
                            st.rerun()
                        else:
                            st.error("Failed to generate synthetic data")
                        
                    except Exception as e:
                        st.error(f"Error creating synthetic data: {str(e)}")
            
            st.markdown("---")
            
            # Instructions
            st.subheader("📖 Instructions")
            st.markdown("""
            1. Upload CSV file with data **OR**
            2. Generate synthetic data for testing
            3. Check data preview
            4. Target variable will be selected automatically
            5. Go to configuration to specify parameters
            
            **Data Requirements:**
            - Date in separate column or index
            - Clean column names
            - Time series with regular intervals
            """)
            
    def render_step_2_configuration(self):
        """Step 2: Pipeline Configuration"""
        st.header("⚙️ Pipeline Configuration")
        
        # Automatic configuration for synthetic data
        if st.session_state.uploaded_file == "synthetic_data" or st.session_state.config_params['data_path'] == 'synthetic_data':
            st.info("⚡ Synthetic data detected. Optimised configuration applied.")
            
            # Automatic parameter setup for synthetic data
            if st.button("Apply Recommended Settings for Synthetic Data", width='stretch'):
                st.session_state.config_params.update({
                    'target_column': 'raskhodvoda',
                    'max_lags': 7,
                    'seasonal_period': 365,
                    'rolling_windows': [7, 30, 90],
                    'expanding_windows': [30, 90],
                    'test_size': 0.2,
                    'validation_size': 0.1,
                    'scaling_method': 'robust',
                    'feature_selection_method': 'correlation',
                    'max_features': 15,
                    'missing_threshold': 0.3,
                    'outlier_method': 'iqr',
                    'enable_validation': True
                })
                st.success("Settings applied!")
                st.rerun()
        
        # Configuration sections
        tab1, tab2, tab3, tab4 = st.tabs([
            "📊 Basic Parameters",
            "🔧 Data Processing",
            "🎯 Features and Selection",
            "📈 Temporal Parameters"
        ])
        
        with tab1:
            col1, col2 = st.columns(2)
            
            with col1:
                st.subheader("Basic Parameters")
                st.session_state.config_params['results_dir'] = st.text_input(
                    "Results Directory",
                    value=st.session_state.config_params['results_dir']
                )
                
                # CORRECTION: replace text_input with selectbox for target variable selection
                if hasattr(st.session_state, 'data_preview') and st.session_state.data_preview is not None:
                    # Get all data columns
                    all_columns = st.session_state.data_preview.columns.tolist()
                    
                    # If target variable already set and present in data, use it
                    current_target = st.session_state.config_params.get('target_column', '')
                    default_index = 0
                    
                    if current_target in all_columns:
                        default_index = all_columns.index(current_target)
                    elif len(all_columns) > 0:
                        # Try to find suitable default column
                        numeric_columns = st.session_state.data_preview.select_dtypes(include=[np.number]).columns.tolist()
                        if numeric_columns:
                            # Look for columns with typical target variable names
                            target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'расход']
                            for i, col in enumerate(all_columns):
                                if any(keyword in col.lower() for keyword in target_keywords):
                                    default_index = i
                                    break
                            # If not found by keywords, take first numeric column
                            if default_index == 0 and numeric_columns[0] in all_columns:
                                default_index = all_columns.index(numeric_columns[0])
                    
                    st.session_state.config_params['target_column'] = st.selectbox(
                        "Select Target Variable",
                        options=all_columns,
                        index=default_index,
                        help="Select column to be predicted"
                    )
                else:
                    # If data not loaded, keep text field
                    st.session_state.config_params['target_column'] = st.text_input(
                        "Target Variable",
                        value=st.session_state.config_params.get('target_column', ''),
                        help="Enter target column name"
                    )
                
                st.session_state.config_params['enable_validation'] = st.checkbox(
                    "Enable Data Validation",
                    value=st.session_state.config_params['enable_validation']
                )
            
            with col2:
                st.subheader("Data Split")
                st.session_state.config_params['test_size'] = st.slider(
                    "Test Set Size (%)",
                    min_value=5,
                    max_value=40,
                    value=int(st.session_state.config_params['test_size'] * 100),
                    step=5,
                    format="%d%%"
                ) / 100
                
                st.session_state.config_params['validation_size'] = st.slider(
                    "Validation Set Size (%)",
                    min_value=5,
                    max_value=30,
                    value=int(st.session_state.config_params['validation_size'] * 100),
                    step=5,
                    format="%d%%"
                ) / 100
                
                split_methods = ['time_based', 'random']
                st.session_state.config_params['split_method'] = st.selectbox(
                    "Split Method",
                    options=split_methods,
                    index=split_methods.index(st.session_state.config_params['split_method'])
                )
        
        with tab2:
            col1, col2 = st.columns(2)
            
            with col1:
                st.subheader("Missing Value Processing")
                st.session_state.config_params['missing_threshold'] = st.slider(
                    "Missing Value Column Removal Threshold",
                    min_value=0.0,
                    max_value=0.5,
                    value=st.session_state.config_params['missing_threshold'],
                    step=0.05,
                    format="%.2f"
                )
                
                st.subheader("Outlier Processing")
                outlier_methods = ['iqr', 'zscore', 'isolation_forest']
                st.session_state.config_params['outlier_method'] = st.selectbox(
                    "Outlier Detection Method",
                    options=outlier_methods,
                    index=outlier_methods.index(st.session_state.config_params['outlier_method'])
                )
            
            with col2:
                st.subheader("Data Scaling")
                scaling_methods = ['robust', 'standard', 'minmax', 'none']
                st.session_state.config_params['scaling_method'] = st.selectbox(
                    "Scaling Method",
                    options=scaling_methods,
                    index=scaling_methods.index(st.session_state.config_params['scaling_method'])
                )
                
                if st.session_state.config_params['scaling_method'] == 'none':
                    st.info("⚠️ Data will not be scaled")
        
        with tab3:
            col1, col2 = st.columns(2)
            
            with col1:
                st.subheader("Feature Engineering")
                st.session_state.config_params['max_lags'] = st.slider(
                    "Maximum Number of Lags",
                    min_value=1,
                    max_value=20,
                    value=st.session_state.config_params['max_lags'],
                    step=1
                )
                
                rolling_windows_input = st.text_input(
                    "Windows for Rolling Statistics (comma-separated)",
                    value=', '.join(map(str, st.session_state.config_params['rolling_windows']))
                )
                if rolling_windows_input:
                    st.session_state.config_params['rolling_windows'] = [
                        int(x.strip()) for x in rolling_windows_input.split(',') if x.strip().isdigit()
                    ]
            
            with col2:
                st.subheader("Feature Selection")
                feature_methods = ['correlation', 'variance', 'mutual_info', 'rf', 'none']
                st.session_state.config_params['feature_selection_method'] = st.selectbox(
                    "Feature Selection Method",
                    options=feature_methods,
                    index=feature_methods.index(st.session_state.config_params['feature_selection_method'])
                )
                
                st.session_state.config_params['max_features'] = st.slider(
                    "Maximum Number of Features",
                    min_value=5,
                    max_value=100,
                    value=st.session_state.config_params['max_features'],
                    step=5
                )
        
        with tab4:
            col1, col2 = st.columns(2)
            
            with col1:
                st.subheader("Temporal Parameters")
                
                # If there is data for preview, show date range
                if hasattr(st.session_state, 'data_preview'):
                    if 'date' in st.session_state.data_preview.columns:
                        date_col = 'date'
                    elif isinstance(st.session_state.data_preview.index, pd.DatetimeIndex):
                        dates = st.session_state.data_preview.index
                    else:
                        # Try to find date column
                        date_cols = [col for col in st.session_state.data_preview.columns 
                                   if 'date' in col.lower() or 'time' in col.lower()]
                        date_col = date_cols[0] if date_cols else None
                    
                    if date_col:
                        if date_col in st.session_state.data_preview.columns:
                            dates = pd.to_datetime(st.session_state.data_preview[date_col])
                        else:
                            dates = st.session_state.data_preview.index
                        
                        if len(dates) > 0:
                            min_date = dates.min()
                            max_date = dates.max()
                            
                            col1_date, col2_date = st.columns(2)
                            with col1_date:
                                st.session_state.config_params['start_year'] = st.number_input(
                                    "Start Year",
                                    min_value=1900,
                                    max_value=2100,
                                    value=min_date.year,
                                    step=1
                                )
                            with col2_date:
                                st.session_state.config_params['end_year'] = st.number_input(
                                    "End Year",
                                    min_value=1900,
                                    max_value=2100,
                                    value=max_date.year,
                                    step=1
                                )
            
            with col2:
                st.subheader("Seasonality")
                st.session_state.config_params['seasonal_period'] = st.selectbox(
                    "Seasonal Period",
                    options=[7, 30, 90, 365, 12, 24],
                    index=[7, 30, 90, 365, 12, 24].index(
                        st.session_state.config_params['seasonal_period']
                    ) if st.session_state.config_params['seasonal_period'] in [7, 30, 90, 365, 12, 24] else 0
                )
                
                expanding_windows_input = st.text_input(
                    "Windows for Expanding Statistics (comma-separated)",
                    value=', '.join(map(str, st.session_state.config_params['expanding_windows']))
                )
                if expanding_windows_input:
                    st.session_state.config_params['expanding_windows'] = [
                        int(x.strip()) for x in expanding_windows_input.split(',') if x.strip().isdigit()
                    ]
        
        # Navigation buttons
        col1, col2, col3 = st.columns([1, 1, 1])
        
        with col1:
            if st.button("⬅️ Back to Loading", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        
        with col3:
            if st.button("Go to Analysis ➡️", type="primary", width='stretch'):
                st.session_state.current_step = 3
                st.rerun()
    
    def render_step_3_data_analysis(self):
        """Step 3: Data Analysis"""
        st.header("🔍 Data Analysis")
        
        if not hasattr(st.session_state, 'data_preview') or st.session_state.data_preview is None:
            st.warning("First load data in Step 1")
            if st.button("Return to Data Loading"):
                st.session_state.current_step = 1
                st.rerun()
            return
        
        data = st.session_state.data_preview
        
        # Analysis tabs
        tab1, tab2, tab3, tab4 = st.tabs([
            "📈 Statistics",
            "🔍 Distributions",
            "📅 Temporal Analysis",
            "❓ Missing Values and Outliers"
        ])
        
        with tab1:
            col1, col2 = st.columns(2)
            
            with col1:
                st.subheader("Basic Statistics")
                st.dataframe(data.describe().round(2), width='stretch')
            
            with col2:
                st.subheader("Data Types")
                dtype_info = pd.DataFrame({
                    'Column': data.columns,
                    'Type': data.dtypes.values,
                    'Unique Values': [data[col].nunique() for col in data.columns]
                })
                st.dataframe(dtype_info, width='stretch')
        
        with tab2:
            # Select column for visualisation
            numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
            
            if numeric_cols:
                selected_col = st.selectbox(
                    "Select Column for Analysis",
                    options=numeric_cols
                )
                
                col1, col2 = st.columns(2)
                
                with col1:
                    # Histogram
                    fig = px.histogram(
                        data, 
                        x=selected_col,
                        title=f"Distribution of {selected_col}",
                        nbins=50,
                        color_discrete_sequence=['#636EFA']
                    )
                    st.plotly_chart(fig, width='stretch')
                
                with col2:
                    # Box plot
                    fig = go.Figure()
                    fig.add_trace(go.Box(
                        y=data[selected_col],
                        name=selected_col,
                        boxpoints='outliers',
                        marker_color='#EF553B'
                    ))
                    fig.update_layout(
                        title=f"Box plot {selected_col}",
                        yaxis_title=selected_col
                    )
                    st.plotly_chart(fig, width='stretch')
            else:
                st.warning("No numeric columns for distribution analysis")
        
        with tab3:
            # Time series analysis
            date_cols = [col for col in data.columns if 'date' in col.lower()]
            
            if date_cols or isinstance(data.index, pd.DatetimeIndex):
                if date_cols:
                    date_col = date_cols[0]
                    dates = pd.to_datetime(data[date_col])
                else:
                    dates = data.index
                    date_col = 'index'
                
                # Check for numeric columns
                if len(numeric_cols) > 0:
                    # Select column for time series
                    ts_col = st.selectbox(
                        "Select Column for Time Series",
                        options=numeric_cols
                    )
                    
                    # Time series
                    fig = go.Figure()
                    fig.add_trace(go.Scatter(
                        x=dates,
                        y=data[ts_col],
                        mode='lines',
                        name=ts_col,
                        line=dict(color='#636EFA', width=2)
                    ))
                    
                    fig.update_layout(
                        title=f"Time Series: {ts_col}",
                        xaxis_title="Date",
                        yaxis_title=ts_col,
                        hovermode='x unified'
                    )
                    
                    st.plotly_chart(fig, width='stretch')
                    
                    # Seasonality (if sufficient data)
                    if len(dates) > 30:
                        # Monthly trend
                        if hasattr(dates, 'month'):
                            monthly_data = data.groupby(dates.dt.month)[ts_col].mean()
                            
                            fig2 = px.bar(
                                x=monthly_data.index,
                                y=monthly_data.values,
                                title=f"Monthly Seasonality: {ts_col}",
                                labels={'x': 'Month', 'y': 'Average Value'}
                            )
                            st.plotly_chart(fig2, width='stretch')
                else:
                    st.warning("No numeric columns for temporal analysis")
            else:
                st.info("For temporal analysis, date column or DatetimeIndex required")
        
        with tab4:
            col1, col2 = st.columns(2)
            
            with col1:
                # Missing value analysis
                st.subheader("Missing Values")
                missing_data = data.isnull().sum()
                missing_percentage = (missing_data / len(data)) * 100
                
                missing_df = pd.DataFrame({
                    'Column': missing_data.index,
                    'Missing Count': missing_data.values,
                    'Missing Percentage': missing_percentage.values
                }).sort_values('Missing Count', ascending=False)
                
                st.dataframe(missing_df, width='stretch')
                
                # Missing values visualisation
                if missing_data.sum() > 0:
                    fig = px.bar(
                        missing_df,
                        x='Column',
                        y='Missing Percentage',
                        title="Missing Percentage by Column",
                        color='Missing Percentage',
                        color_continuous_scale='Reds'
                    )
                    st.plotly_chart(fig, width='stretch')
            
            with col2:
                # Quick outlier analysis
                st.subheader("Quick Outlier Analysis")
                
                if len(numeric_cols) > 0:
                    outlier_summary = []
                    
                    for col in numeric_cols[:5]:  # Limit to 5 columns for speed
                        q1 = data[col].quantile(0.25)
                        q3 = data[col].quantile(0.75)
                        iqr = q3 - q1
                        lower_bound = q1 - 1.5 * iqr
                        upper_bound = q3 + 1.5 * iqr
                        
                        outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
                        outlier_pct = (len(outliers) / len(data)) * 100
                        
                        outlier_summary.append({
                            'Column': col,
                            'Outliers': len(outliers),
                            'Percentage': f"{outlier_pct:.2f}%"
                        })
                    
                    outlier_df = pd.DataFrame(outlier_summary)
                    st.dataframe(outlier_df, width='stretch')
                else:
                    st.warning("No numeric columns for outlier analysis")
        
        # Navigation buttons
        col1, col2, col3 = st.columns([1, 1, 1])
        
        with col1:
            if st.button("⬅️ Back to Configuration", width='stretch'):
                st.session_state.current_step = 2
                st.rerun()
        
        with col3:
            if st.button("Run Pipeline ➡️", type="primary", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()
                  
    def render_step_4_pipeline_execution(self):
        """Step 4: Pipeline Execution"""
        st.header("⚡ Pipeline Execution")
        
        # Readiness check
        ready_to_run = True
        issues = []
        
        if not st.session_state.uploaded_file and st.session_state.config_params['data_path'] != 'demo' and st.session_state.config_params['data_path'] != 'synthetic_data':
            issues.append("Data not loaded")
            ready_to_run = False
        
        if not st.session_state.config_params['target_column']:
            issues.append("Target variable not selected")
            ready_to_run = False
        
        # Automatic synthetic data generation if quick test enabled
        if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
            st.info("⚡ Quick test mode activated. Generating synthetic data...")
            self.quick_test_pipeline()
            return
        
        # Display warnings
        if issues:
            st.error("⚠️ Fix before running:")
            for issue in issues:
                st.write(f"- {issue}")
            
            # Suggest using synthetic data
            st.markdown("---")
            st.subheader("🎮 Quick Solution")
            
            col1, col2 = st.columns(2)
            
            with col1:
                if st.button("Generate Synthetic Data", width='stretch'):
                    st.session_state.current_step = 1
                    st.rerun()
            
            with col2:
                if st.button("To Data Loading", width='stretch'):
                    st.session_state.current_step = 1
                    st.rerun()
            
            col3, col4 = st.columns(2)
            with col3:
                if st.button("To Configuration", width='stretch'):
                    st.session_state.current_step = 2
                    st.rerun()
            
            return
        
        # Display configuration
        st.subheader("Execution Configuration")
        
        config_col1, config_col2 = st.columns(2)
        
        with config_col1:
            st.metric("Target Variable", st.session_state.config_params['target_column'])
            st.metric("Test Set", f"{st.session_state.config_params['test_size']*100:.0f}%")
            st.metric("Scaling Method", st.session_state.config_params['scaling_method'])
        
        with config_col2:
            st.metric("Max Lags", st.session_state.config_params['max_lags'])
            st.metric("Feature Selection Method", st.session_state.config_params['feature_selection_method'])
            st.metric("Validation Enabled", "Yes" if st.session_state.config_params['enable_validation'] else "No")
        
        # Execution options
        st.subheader("Execution Options")
        
        col1, col2 = st.columns(2)
        
        with col1:
            use_synthetic = st.checkbox(
                "Use Synthetic Data",
                value=(st.session_state.config_params['data_path'] == 'demo' or 
                       st.session_state.config_params['data_path'] == 'synthetic_data'),
                disabled=(st.session_state.config_params['data_path'] == 'demo' or 
                         st.session_state.config_params['data_path'] == 'synthetic_data')
            )
            
            save_intermediate = st.checkbox(
                "Save Intermediate Results",
                value=True
            )
        
        with col2:
            create_reports = st.checkbox(
                "Create Reports",
                value=True
            )
            
            create_visualisations = st.checkbox(
                "Create Visualisations",
                value=True,
                help="Create data analysis plots"
            )
        
        # Run button
        if st.button("🚀 Run Preprocessing Pipeline", type="primary", width='stretch'):
            
            # Create progress bar
            progress_bar = st.progress(0)
            status_text = st.empty()
            
            try:
                # Create configuration
                status_text.text("Creating configuration...")
                progress_bar.progress(10)
                
                config = Config(**st.session_state.config_params)
                
                # Create pipeline
                status_text.text("Initialising pipeline...")
                progress_bar.progress(20)
                
                self.pipeline = EnhancedDataPreprocessingPipeline(config)
                
                # Determine whether to use synthetic data
                use_synthetic_flag = (use_synthetic or 
                                     st.session_state.config_params['data_path'] == 'demo' or 
                                     st.session_state.config_params['data_path'] == 'synthetic_data')
                
                # Run pipeline
                status_text.text("Running preprocessing pipeline...")
                progress_bar.progress(30)
                
                processed_data = self.pipeline.run_full_pipeline(
                    use_synthetic=use_synthetic_flag,
                    save_intermediate=save_intermediate,
                    create_reports=create_reports
                )
                
                # Update progress
                if processed_data is not None:
                    status_text.text("Getting data for modelling...")
                    progress_bar.progress(80)
                    
                    modeling_data = self.pipeline.get_final_data_for_modelling()
                    
                    # Save to session state
                    st.session_state.processed_data = processed_data
                    st.session_state.modeling_data = modeling_data
                    st.session_state.pipeline_completed = True
                    st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                    
                    # Collect information about available plots
                    self.collect_available_plots()
                    
                    # Completion
                    status_text.text("Completing...")
                    progress_bar.progress(100)
                    
                    st.success("✅ Pipeline completed successfully!")
                    
                    # Show results
                    col1, col2, col3 = st.columns(3)
                    
                    with col1:
                        if hasattr(self.pipeline, 'results') and 'data_loading' in self.pipeline.results:
                            st.metric("Original Data", f"{self.pipeline.results['data_loading']['shape'][0]:,} rows")
                        else:
                            st.metric("Original Data", "Information unavailable")
                    
                    with col2:
                        st.metric("Processed Data", f"{processed_data.shape[0]:,} rows")
                    
                    with col3:
                        st.metric("Final Features", f"{processed_data.shape[1]} columns")
                    
                    # Button to proceed to results
                    if st.button("📊 Go to Results", type="primary", width='stretch'):
                        st.session_state.current_step = 5
                        st.rerun()
                
                else:
                    st.error("❌ Error executing pipeline")
                    st.error("Check logs for more information")
            
            except Exception as e:
                progress_bar.progress(0)
                status_text.text("")
                st.error(f"❌ Error: {str(e)}")
                st.exception(e)
        
        # Back button
        if st.button("⬅️ Back to Analysis", width='stretch'):
            st.session_state.current_step = 3
            st.rerun()
    
    def collect_available_plots(self):
        """Collect information about available plots"""
        if not st.session_state.plots_path or not os.path.exists(st.session_state.plots_path):
            st.session_state.available_plots = {}
            return
        
        plots_categories = {
            'summary': ['summary_dashboard.png'],
            'missing_values': ['missing_values_analysis.png'],
            'outliers': ['outliers_analysis.png', 'outlier_handling_results.png', 'temporal_outliers.png'],
            'stationarity': ['stationarity_*.png'],
            'data_split': ['data_split.png'],
            'scaling': ['scaling_results.png'],
            'feature_selection': ['feature_selection_*.png'],
            'correlations': ['correlation_matrix.png', 'high_correlations.png', 'target_correlations.png', 'vif_scores.png']
        }
        
        available_plots = {}
        
        for category, patterns in plots_categories.items():
            category_plots = []
            
            # Search for files for each pattern
            for pattern in patterns:
                # For general patterns
                if '*' in pattern:
                    search_path = os.path.join(st.session_state.plots_path, pattern)
                    files = glob.glob(search_path)
                    
                    # Also search in subfolders
                    for root, dirs, filenames in os.walk(st.session_state.plots_path):
                        for filename in filenames:
                            if pattern.replace('*', '') in filename and filename.endswith('.png'):
                                full_path = os.path.join(root, filename)
                                if full_path not in files:
                                    files.append(full_path)
                else:
                    # For specific file names
                    file_path = os.path.join(st.session_state.plots_path, pattern)
                    
                    # Check in main folder
                    if os.path.exists(file_path):
                        files = [file_path]
                    else:
                        # Check in subfolders
                        files = []
                        for root, dirs, filenames in os.walk(st.session_state.plots_path):
                            for filename in filenames:
                                if filename == pattern:
                                    files.append(os.path.join(root, filename))
                
                for file in files:
                    if os.path.exists(file):
                        # Get relative path for display
                        rel_path = os.path.relpath(file, st.session_state.plots_path)
                        category_plots.append({
                            'path': file,
                            'name': os.path.basename(file),
                            'rel_path': rel_path,
                            'size': os.path.getsize(file)
                        })
            
            if category_plots:
                available_plots[category] = category_plots
        
        # Also add all found PNG files in general folder
        all_png_files = []
        for root, dirs, filenames in os.walk(st.session_state.plots_path):
            for filename in filenames:
                if filename.endswith('.png'):
                    file_path = os.path.join(root, filename)
                    # Check if this file already added
                    already_added = False
                    for category_plots in available_plots.values():
                        for plot in category_plots:
                            if plot['path'] == file_path:
                                already_added = True
                                break
                    
                    if not already_added:
                        rel_path = os.path.relpath(file_path, st.session_state.plots_path)
                        all_png_files.append({
                            'path': file_path,
                            'name': filename,
                            'rel_path': rel_path,
                            'size': os.path.getsize(file_path)
                        })
        
        if all_png_files:
            available_plots['other'] = all_png_files
        
        st.session_state.available_plots = available_plots
    
    def render_step_5_results(self):
        """Step 5: Results"""
        st.header("📊 Pipeline Results")
        
        if not st.session_state.pipeline_completed or st.session_state.processed_data is None:
            st.warning("Pipeline not yet run or not completed successfully")
            
            # Suggest using quick test
            st.markdown("---")
            st.subheader("🎮 Quick Start")
            
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                    st.session_state.quick_test_mode = True
                    st.session_state.current_step = 1
                    st.rerun()
            
            with col2:
                if st.button("Load Data", width='stretch'):
                    st.session_state.current_step = 1
                    st.rerun()
            
            return
        
        data = st.session_state.processed_data
        modeling_data = st.session_state.modeling_data
        
        # Results tabs
        tab1, tab2, tab3, tab4 = st.tabs([
            "📈 Data Overview",
            "📊 Feature Analysis",
            "📉 Validation",
            "💾 Export"
        ])
        
        with tab1:
            st.subheader("Processed Data")
            
            # Basic information
            info_col1, info_col2, info_col3, info_col4 = st.columns(4)
            
            with info_col1:
                st.metric("Total Records", f"{data.shape[0]:,}")
            with info_col2:
                st.metric("Total Features", data.shape[1])
            with info_col3:
                numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
                st.metric("Numeric Features", len(numeric_cols))
            with info_col4:
                missing_total = data.isnull().sum().sum()
                st.metric("Missing Values", missing_total)
            
            # Data preview
            st.subheader("Data Preview")
            st.dataframe(data.head(100), width='stretch')
            
            # Statistics
            st.subheader("Processed Data Statistics")
            st.dataframe(data.describe().round(4), width='stretch')
        
        with tab2:
            st.subheader("Feature Analysis")
            
            if modeling_data and 'feature_names' in modeling_data:
                features = modeling_data['feature_names']
                
                # Feature list
                st.write(f"**Selected Features:** {len(features)}")
                
                # Display features as cards
                cols_per_row = 4
                for i in range(0, len(features), cols_per_row):
                    cols = st.columns(cols_per_row)
                    for j in range(cols_per_row):
                        idx = i + j
                        if idx < len(features):
                            with cols[j]:
                                st.info(features[idx])
                
                # Feature importance (if available)
                if (self.pipeline is not None and 
                    hasattr(self.pipeline, 'feature_selector') and 
                    self.pipeline.feature_selector is not None):
                    
                    # Check for feature_importances_
                    if hasattr(self.pipeline.feature_selector, 'feature_importances_'):
                        importances = self.pipeline.feature_selector.feature_importances_
                        
                        if importances is not None and len(importances) > 0:
                            importance_df = pd.DataFrame({
                                'Feature': features[:len(importances)] if len(features) >= len(importances) else features,
                                'Importance': importances[:len(features)] if len(importances) >= len(features) else importances
                            }).sort_values('Importance', ascending=False)
                            
                            st.subheader("Feature Importance")
                            
                            fig = px.bar(
                                importance_df.head(20),
                                x='Importance',
                                y='Feature',
                                orientation='h',
                                title="Top-20 Features by Importance",
                                color='Importance',
                                color_continuous_scale='Viridis'
                            )
                            st.plotly_chart(fig, width='stretch')
            
            # Correlation matrix (limited for performance)
            if data.shape[1] <= 50:  # Performance limit
                st.subheader("Correlation Matrix (first 20 features)")
                
                # Select only numeric columns and limit quantity
                numeric_data = data.select_dtypes(include=[np.number])
                if len(numeric_data.columns) > 20:
                    numeric_data = numeric_data.iloc[:, :20]
                
                if not numeric_data.empty and len(numeric_data.columns) > 1:
                    corr_matrix = numeric_data.corr()
                    
                    fig = go.Figure(data=go.Heatmap(
                        z=corr_matrix.values,
                        x=corr_matrix.columns,
                        y=corr_matrix.columns,
                        colorscale='RdBu',
                        zmin=-1,
                        zmax=1,
                        text=corr_matrix.round(2).values,
                        texttemplate='%{text}',
                        textfont={"size": 10}
                    ))
                    
                    fig.update_layout(
                        title="Correlation Matrix",
                        width=800,
                        height=800
                    )
                    
                    st.plotly_chart(fig, width='stretch')
                else:
                    st.info("Insufficient data for correlation matrix")
        
        with tab3:
            st.subheader("Validation Results")
            
            # Improved validation result availability check
            validation_available = False
            validation_data = None
            
            if self.pipeline is not None:
                # Check for results in pipeline
                if hasattr(self.pipeline, 'results'):
                    # Look for validation results under different keys
                    validation_keys = ['final_validation', 'validation_results', 'validation', 'validation_checks']
                    for key in validation_keys:
                        if key in self.pipeline.results:
                            validation_data = self.pipeline.results[key]
                            validation_available = True
                            break
                
                # If not found in results, check other attributes
                if not validation_available and hasattr(self.pipeline, 'validation_report'):
                    validation_data = self.pipeline.validation_report
                    validation_available = True
                
                # Or check processing results
                if not validation_available and hasattr(self.pipeline, 'get_validation_summary'):
                    try:
                        validation_data = self.pipeline.get_validation_summary()
                        validation_available = True
                    except:
                        pass
            
            # If validation results available
            if validation_available and validation_data:
                st.success("✅ Validation results available")
                
                # Check validation data format
                if isinstance(validation_data, dict):
                    # Display as dictionary
                    col1, col2 = st.columns(2)
                    
                    with col1:
                        # Status
                        status = validation_data.get('status', 'UNKNOWN')
                        if status == 'PASS':
                            st.success(f"Status: {status}")
                        elif status == 'WARNING':
                            st.warning(f"Status: {status}")
                        else:
                            st.error(f"Status: {status}")
                        
                        # Overall score
                        score = validation_data.get('overall_score', validation_data.get('score', 0))
                        if score:
                            st.metric("Overall Score", f"{score}/100")
                    
                    with col2:
                        # Check counters
                        if 'checks' in validation_data:
                            checks = validation_data['checks']
                        elif 'basic_checks' in validation_data:
                            checks = validation_data['basic_checks']
                        else:
                            checks = validation_data
                        
                        if isinstance(checks, dict):
                            passed = sum(1 for check in checks.values() 
                                       if isinstance(check, dict) and check.get('passed', False))
                            total = len(checks)
                            st.metric("Checks Passed", f"{passed}/{total}")
                    
                    # Check details
                    st.subheader("Check Details")
                    
                    # Determine where checks are located
                    checks_to_display = None
                    if 'checks' in validation_data:
                        checks_to_display = validation_data['checks']
                    elif 'basic_checks' in validation_data:
                        checks_to_display = validation_data['basic_checks']
                    elif any(isinstance(v, dict) and 'passed' in v for v in validation_data.values()):
                        checks_to_display = validation_data
                    
                    if checks_to_display and isinstance(checks_to_display, dict):
                        for check_name, check_info in checks_to_display.items():
                            if isinstance(check_info, dict):
                                col1, col2, col3 = st.columns([3, 1, 3])
                                
                                with col1:
                                    # Check description
                                    description = check_info.get('description', check_name)
                                    st.write(f"**{description}**")
                                
                                with col2:
                                    # Status
                                    if check_info.get('passed', False):
                                        st.success("✅")
                                    else:
                                        st.error("❌")
                                
                                with col3:
                                    # Message
                                    if 'message' in check_info:
                                        st.caption(check_info['message'])
                            else:
                                # Simple format
                                st.write(f"**{check_name}**: {check_info}")
                    else:
                        # Display all validation data
                        st.json(validation_data)
                else:
                    # If not dictionary, display as is
                    st.write("Validation results:")
                    st.write(validation_data)
            else:
                # If no validation results, show pipeline information
                st.info("Validation results in report format not available, but pipeline execution statistics presented below")
                
                # Pipeline stage statistics
                st.subheader("Pipeline Execution Statistics")
                
                # Create stage table
                stages = [
                    ("Data Loading", "✅ Successful" if data is not None else "❌ Error"),
                    ("Missing Value Processing", "✅ Completed"),
                    ("Outlier Processing", "✅ Completed"),
                    ("Feature Engineering", "✅ Completed"),
                    ("Scaling", "✅ Completed"),
                    ("Feature Selection", "✅ Completed"),
                    ("Data Split", "✅ Completed" if modeling_data else "❌ Not completed")
                ]
                
                for stage_name, status in stages:
                    col1, col2 = st.columns([3, 1])
                    with col1:
                        st.write(f"**{stage_name}**")
                    with col2:
                        if "✅" in status:
                            st.success(status)
                        else:
                            st.error(status)
                
                # If pipeline exists, show available metrics
                if self.pipeline is not None:
                    # Check for various metrics
                    st.subheader("Data Quality Metrics")
                    
                    col1, col2, col3 = st.columns(3)
                    
                    with col1:
                        # Data quality
                        if data is not None:
                            missing_pct = (data.isnull().sum().sum() / (data.shape[0] * data.shape[1])) * 100
                            st.metric("Missing Values", f"{missing_pct:.2f}%")
                    
                    with col2:
                        # Feature information
                        if data is not None:
                            numeric_cols = len(data.select_dtypes(include=[np.number]).columns)
                            st.metric("Numeric Features", numeric_cols)
                    
                    with col3:
                        # Split information
                        if modeling_data and 'X_train' in modeling_data:
                            train_size = len(modeling_data['X_train'])
                            total_size = train_size
                            if 'X_test' in modeling_data:
                                total_size += len(modeling_data['X_test'])
                            if 'X_val' in modeling_data:
                                total_size += len(modeling_data['X_val'])
                            
                            if total_size > 0:
                                train_pct = (train_size / total_size) * 100
                                st.metric("Training Set", f"{train_pct:.1f}%")
        
        with tab4:
            st.subheader("Data Export")
            
            # Export formats
            export_format = st.radio(
                "Export Format",
                options=['CSV', 'Parquet', 'Excel'],
                horizontal=True
            )
            
            # Export buttons
            if data is not None:
                # Export processed data
                st.write("**Processed Data**")
                
                if export_format == 'CSV':
                    csv = data.to_csv(index=True)
                    st.download_button(
                        label="📥 Download CSV",
                        data=csv,
                        file_name="streamlit_processed_data.csv",
                        mime="text/csv",
                        width='stretch'
                    )
                
                elif export_format == 'Parquet':
                    # For Parquet need to save to temporary file
                    import io
                    buffer = io.BytesIO()
                    data.to_parquet(buffer)
                    buffer.seek(0)
                    
                    st.download_button(
                        label="📥 Download Parquet",
                        data=buffer,
                        file_name="streamlit_processed_data.parquet",
                        mime="application/octet-stream",
                        width='stretch'
                    )
                
                elif export_format == 'Excel':
                    import io
                    buffer = io.BytesIO()
                    with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                        data.to_excel(writer, sheet_name='Processed_Data')
                    
                    buffer.seek(0)
                    
                    st.download_button(
                        label="📥 Download Excel",
                        data=buffer,
                        file_name="streamlit_processed_data.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        width='stretch'
                    )
            
            # Export modeling data
            if modeling_data:
                st.write("**Modeling Data**")
                
                col1, col2, col3 = st.columns(3)
                
                with col1:
                    if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
                        train_df = pd.concat([
                            modeling_data['X_train'], 
                            modeling_data['y_train'].rename('target')
                        ], axis=1) if 'y_train' in modeling_data else modeling_data['X_train']
                        
                        st.download_button(
                            label="📥 Training Set",
                            data=train_df.to_csv(),
                            file_name="train_data.csv",
                            mime="text/csv",
                            width='stretch'
                        )
                
                with col2:
                    if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
                        val_df = pd.concat([
                            modeling_data['X_val'], 
                            modeling_data['y_val'].rename('target')
                        ], axis=1) if 'y_val' in modeling_data else modeling_data['X_val']
                        
                        st.download_button(
                            label="📥 Validation Set",
                            data=val_df.to_csv(),
                            file_name="validation_data.csv",
                            mime="text/csv",
                            width='stretch'
                        )
                
                with col3:
                    if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
                        test_df = pd.concat([
                            modeling_data['X_test'], 
                            modeling_data['y_test'].rename('target')
                        ], axis=1) if 'y_test' in modeling_data else modeling_data['X_test']
                        
                        st.download_button(
                            label="📥 Test Set",
                            data=test_df.to_csv(),
                            file_name="test_data.csv",
                            mime="text/csv",
                            width='stretch'
                        )
        
        # Navigation
        st.markdown("---")
        col1, col2, col3 = st.columns([1, 1, 1])
        
        with col1:
            if st.button("⬅️ Back to Pipeline", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()
        
        with col3:
            if st.button("Go to Visualisations ➡️", type="primary", width='stretch'):
                st.session_state.current_step = 6
                st.rerun()
    
    def render_step_6_visualisations(self):
        """Step 6: Visualisations"""
        st.header("📈 Pipeline Visualisations")
        
        if not st.session_state.pipeline_completed:
            st.warning("First run pipeline in Step 4")
            
            # Suggest quick test
            st.markdown("---")
            st.subheader("🎮 Quick Test")
            
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                    st.session_state.quick_test_mode = True
                    st.session_state.current_step = 1
                    st.rerun()
            
            with col2:
                if st.button("Run Pipeline", width='stretch'):
                    st.session_state.current_step = 4
                    st.rerun()
            
            return
        
        # Check for plots
        if not st.session_state.available_plots:
            st.warning("Plots not found. Ensure pipeline was run with visualisation option enabled.")
            
            # Try to collect plots again
            if st.button("Try to Find Plots", width='stretch'):
                self.collect_available_plots()
                st.rerun()
            
            return
        
        # Plot statistics
        total_plots = sum(len(plots) for plots in st.session_state.available_plots.values())
        st.success(f"✅ Found {total_plots} plots")
        
        # Plot category tabs
        categories = list(st.session_state.available_plots.keys())
        
        if 'summary' in categories:
            categories.remove('summary')
            categories.insert(0, 'summary')
        
        tabs = st.tabs([cat.capitalize().replace('_', ' ') for cat in categories])
        
        for i, category in enumerate(categories):
            with tabs[i]:
                self.display_category_plots(category)
        
        # All plots in one gallery
        st.markdown("---")
        st.subheader("🖼️ All Plots Gallery")
        
        # Collect all plots
        all_plots = []
        for category, plots in st.session_state.available_plots.items():
            for plot in plots:
                all_plots.append((category, plot))
        
        # Display plots in grid
        cols_per_row = 3
        for i in range(0, len(all_plots), cols_per_row):
            cols = st.columns(cols_per_row)
            for j in range(cols_per_row):
                idx = i + j
                if idx < len(all_plots):
                    category, plot_info = all_plots[idx]
                    with cols[j]:
                        self.display_plot_card(plot_info, category)
    
    def display_category_plots(self, category):
        """Display plots in category"""
        plots = st.session_state.available_plots.get(category, [])
        
        if not plots:
            st.info(f"No plots in category '{category}'")
            return
        
        st.subheader(f"{category.capitalize().replace('_', ' ')} ({len(plots)} plots)")
        
        # Sort plots by name
        plots_sorted = sorted(plots, key=lambda x: x['name'])
        
        # Display plots in accordions for convenience
        for plot_info in plots_sorted:
            with st.expander(f"📊 {plot_info['name'].replace('_', ' ').replace('.png', '')}", expanded=True):
                self.display_plot_image(plot_info)
    
    def display_plot_card(self, plot_info, category):
        """Display plot card"""
        try:
            # Load image
            image = Image.open(plot_info['path'])
            
            # Create safe key for state
            safe_key = plot_info['path'].replace('/', '_').replace('\\', '_').replace('.', '_')
            
            # Initialise state for this plot if not exists
            if f"show_{safe_key}" not in st.session_state:
                st.session_state[f"show_{safe_key}"] = False
            
            # Create card
            with st.container():
                st.markdown(f"**{plot_info['name'].replace('_', ' ').replace('.png', '')}**")
                st.image(image, width='stretch', caption=plot_info['rel_path'])
                
                # File information
                size_kb = plot_info['size'] / 1024
                st.caption(f"Size: {size_kb:.1f} KB | Category: {category}")
                
                # Zoom control buttons
                col1, col2 = st.columns(2)
                
                with col1:
                    # Zoom button
                    if st.button("🔍 Zoom", key=f"zoom_{safe_key}", width='stretch'):
                        st.session_state[f"show_{safe_key}"] = True
                        # Don't use st.rerun() here
                
                with col2:
                    # Hide zoomed image button (if shown)
                    if st.session_state[f"show_{safe_key}"]:
                        if st.button("✕ Hide", key=f"hide_{safe_key}", width='stretch'):
                            st.session_state[f"show_{safe_key}"] = False
                            # Don't use st.rerun() here
                
                # If zoom button clicked, show zoomed image
                if st.session_state[f"show_{safe_key}"]:
                    st.markdown("---")
                    st.subheader(f"🔍 {plot_info['name'].replace('_', ' ').replace('.png', '')}")
                    st.image(image, width='stretch')
            
        except Exception as e:
            st.error(f"Error loading plot: {str(e)}")
            st.code(f"Path: {plot_info['path']}")
            

    def display_plot_image(self, plot_info):
        """Display plot image"""
        try:
            # Load image
            image = Image.open(plot_info['path'])
            
            # Display with information
            col1, col2 = st.columns([3, 1])
            
            with col1:
                st.image(image, width='stretch')
            
            with col2:
                # File information
                st.metric("Size", f"{plot_info['size'] / 1024:.1f} KB")
                st.metric("Resolution", f"{image.width}×{image.height}")
                
                # File format
                st.write(f"**Format:** {image.format}")
                
                # Download button
                with open(plot_info['path'], 'rb') as file:
                    btn = st.download_button(
                        label="📥 Download",
                        data=file,
                        file_name=plot_info['name'],
                        mime="image/png",
                        width='stretch'
                    )
        
        except Exception as e:
            st.error(f"Error loading plot: {str(e)}")
            st.code(f"Path: {plot_info['path']}")
    
    def render_step_7_modeling(self):
        """Step 7: Modelling Preparation"""
        st.header("🤖 Modelling Preparation")
        
        if not st.session_state.pipeline_completed or st.session_state.modeling_data is None:
            st.warning("First run pipeline in Step 4")
            
            # Suggest quick test
            st.markdown("---")
            st.subheader("🎮 Quick Test")
            
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                    st.session_state.quick_test_mode = True
                    st.session_state.current_step = 1
                    st.rerun()
            
            with col2:
                if st.button("Run Pipeline", width='stretch'):
                    st.session_state.current_step = 4
                    st.rerun()
            
            return
        
        modeling_data = st.session_state.modeling_data
        
        # Basic information
        col1, col2, col3, col4 = st.columns(4)
        
        with col1:
            if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
                st.metric("Training Set", f"{modeling_data['X_train'].shape[0]:,} records")
        with col2:
            if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
                st.metric("Validation Set", f"{modeling_data['X_val'].shape[0]:,} records")
        with col3:
            if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
                st.metric("Test Set", f"{modeling_data['X_test'].shape[0]:,} records")
        with col4:
            if 'feature_names' in modeling_data and modeling_data['feature_names'] is not None:
                st.metric("Number of Features", len(modeling_data['feature_names']))
        
        # Tabs
        tab1, tab2, tab3 = st.tabs([
            "📐 Data Structure",
            "📊 Target Variable Distribution",
            "🔗 ML Integration"
        ])
        
        with tab1:
            st.subheader("Modeling Data Structure")
            
            # Information table
            data_info = []
            
            if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
                data_info.append({
                    'Dataset': 'Training',
                    'Samples': modeling_data['X_train'].shape[0],
                    'Features': modeling_data['X_train'].shape[1],
                    'Target Variable': 'Yes' if 'y_train' in modeling_data and modeling_data['y_train'] is not None else 'No'
                })
            
            if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
                data_info.append({
                    'Dataset': 'Validation',
                    'Samples': modeling_data['X_val'].shape[0],
                    'Features': modeling_data['X_val'].shape[1],
                    'Target Variable': 'Yes' if 'y_val' in modeling_data and modeling_data['y_val'] is not None else 'No'
                })
            
            if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
                data_info.append({
                    'Dataset': 'Test',
                    'Samples': modeling_data['X_test'].shape[0],
                    'Features': modeling_data['X_test'].shape[1],
                    'Target Variable': 'Yes' if 'y_test' in modeling_data and modeling_data['y_test'] is not None else 'No'
                })
            
            if data_info:
                st.table(pd.DataFrame(data_info))
            else:
                st.info("Modeling data not available")
            
            # Data sample
            st.subheader("Training Data Sample")
            
            if ('X_train' in modeling_data and modeling_data['X_train'] is not None and 
                'y_train' in modeling_data and modeling_data['y_train'] is not None):
                sample_data = pd.concat([
                    modeling_data['X_train'].head(10),
                    modeling_data['y_train'].head(10).rename('target')
                ], axis=1)
                
                st.dataframe(sample_data, width='stretch')
        
        with tab2:
            st.subheader("Target Variable Distribution")
            
            if 'y_train' in modeling_data and modeling_data['y_train'] is not None:
                # Target variable histogram
                fig = px.histogram(
                    x=modeling_data['y_train'],
                    nbins=50,
                    title="Target Variable Distribution (Training Set)",
                    labels={'x': 'Target Variable', 'y': 'Frequency'},
                    color_discrete_sequence=['#00CC96']
                )
                
                st.plotly_chart(fig, width='stretch')
                
                # Statistics
                col1, col2, col3, col4 = st.columns(4)
                
                with col1:
                    st.metric("Mean", f"{modeling_data['y_train'].mean():.2f}")
                with col2:
                    st.metric("Standard Deviation", f"{modeling_data['y_train'].std():.2f}")
                with col3:
                    st.metric("Minimum", f"{modeling_data['y_train'].min():.2f}")
                with col4:
                    st.metric("Maximum", f"{modeling_data['y_train'].max():.2f}")
            else:
                st.info("Target variable not available")
        
        with tab3:
            st.subheader("Machine Learning Library Integration")
            
            st.info("""
            Your data is ready for use with any Python ML libraries.
            Below are code examples for various libraries.
            """)
            
            # Library selection
            ml_library = st.selectbox(
                "Select ML Library",
                options=["Scikit-learn", "XGBoost", "LightGBM", "CatBoost", "PyTorch", "TensorFlow"]
            )
            
            # Code generation
            code_placeholder = st.empty()
            
            if ml_library == "Scikit-learn":
                code = """# Example usage with Scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Use prepared data
X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']

# Create and train model
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_val)

print(f"RMSE: {np.sqrt(mean_squared_error(y_val, y_pred)):.4f}")
print(f"R² Score: {r2_score(y_val, y_pred):.4f}")
print(f"Feature Importance: {model.feature_importances_}")"""
                
            elif ml_library == "XGBoost":
                code = """# Example usage with XGBoost
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Prepare data in DMatrix format
dtrain = xgb.DMatrix(modeling_data['X_train'], label=modeling_data['y_train'])
dval = xgb.DMatrix(modeling_data['X_val'], label=modeling_data['y_val'])

# Model parameters
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Train model
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dval, 'validation')],
    early_stopping_rounds=10,
    verbose_eval=False
)

# Predictions
y_pred = model.predict(dval)

print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Number of Trees: {model.best_ntree_limit}")"""
            
            elif ml_library == "LightGBM":
                code = """# Example usage with LightGBM
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Prepare data
train_data = lgb.Dataset(
    modeling_data['X_train'],
    label=modeling_data['y_train']
)

val_data = lgb.Dataset(
    modeling_data['X_val'],
    label=modeling_data['y_val'],
    reference=train_data
)

# Model parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Train model
model = lgb.train(
    params,
    train_data,
    valid_sets=[val_data],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(10)]
)

# Predictions
y_pred = model.predict(modeling_data['X_val'])

print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Best Iteration: {model.best_iteration}")"""
            
            else:
                code = f"""# Template for {ml_library}
# Your data available in modeling_data variable

X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']
X_test = modeling_data['X_test']
y_test = modeling_data['y_test']

# Code for {ml_library}...
print(f"Data sizes:")
print(f"  X_train: {{X_train.shape}}")
print(f"  y_train: {{y_train.shape}}")
print(f"  X_val: {{X_val.shape}}")
print(f"  X_test: {{X_test.shape}}")"""
            
            # Display code
            code_placeholder.code(code, language='python')
            
            # Copy code button
            try:
                import pyperclip
                if st.button("📋 Copy Code", width='stretch'):
                    try:
                        pyperclip.copy(code)
                        st.success("Code copied to clipboard!")
                    except:
                        st.warning("Failed to copy code. Copy manually.")
            except:
                st.warning("To copy code, install pyperclip library: pip install pyperclip")
        
        # Final information
        st.markdown("---")
        st.success("""
        🎉 Congratulations! You have successfully prepared data for machine learning.
        
        **Next Steps:**
        1. Use code above for integration with chosen ML library
        2. Experiment with various models
        3. Optimise hyperparameters
        4. Evaluate results on test set
        """)
        
        # Navigation
        col1, col2 = st.columns([1, 1])
        
        with col1:
            if st.button("⬅️ Back to Visualisations", width='stretch'):
                st.session_state.current_step = 6
                st.rerun()
        
        with col2:
            if st.button("🔄 Run New Pipeline", type="primary", width='stretch'):
                # Reset state
                st.session_state.pipeline_completed = False
                st.session_state.processed_data = None
                st.session_state.modeling_data = None
                st.session_state.current_step = 1
                st.session_state.uploaded_file = None
                st.session_state.plots_path = None
                st.session_state.available_plots = {}
                st.session_state.synthetic_data_generated = False
                st.session_state.auto_pipeline_ready = False
                st.session_state.quick_test_mode = False
                st.rerun()
    
    def render_footer(self):
        """Application footer"""
        st.markdown("---")
        
        col1, col2, col3 = st.columns(3)
        
        with col1:
            st.markdown("**TimeFlowPro** v1.1.0")
            st.caption("Added synthetic data generation")
        
        with col2:
            st.markdown("📧 Contacts: cool.araby@gmail.com")
        
        with col3:
            st.markdown("© 2026 All Rights Reserved")
    
    def run(self):
        """Run application"""
        # Header
        st.title("📊 TimeFlow Pro - Data Analysis and Preprocessing")
        st.markdown("---")
        
        # Sidebar
        self.create_sidebar()
        
        # Main content depending on step
        if st.session_state.current_step == 1:
            self.render_step_1_data_loading()
        elif st.session_state.current_step == 2:
            self.render_step_2_configuration()
        elif st.session_state.current_step == 3:
            self.render_step_3_data_analysis()
        elif st.session_state.current_step == 4:
            self.render_step_4_pipeline_execution()
        elif st.session_state.current_step == 5:
            self.render_step_5_results()
        elif st.session_state.current_step == 6:
            self.render_step_6_visualisations()
        elif st.session_state.current_step == 7:
            self.render_step_7_modeling()
        
        # Footer
        self.render_footer()

# ============================================
# APPLICATION LAUNCH
# ============================================
if __name__ == "__main__":
    app = StreamlitApp()
    app.run()