# TimeFlowPro / app.py
# Author: ArabovMK — "Update app, run_pipeline." (commit d0a9de5)
# ============================================
# TimeFlow Pro - Data Analysis and Preprocessing
# ============================================
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import streamlit as st
import pandas as pd
import numpy as np
import os
import sys
import glob
import re
from datetime import datetime, timedelta
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from PIL import Image
import matplotlib.pyplot as plt
import warnings
from pipeline.main_pipeline import EnhancedDataPreprocessingPipeline
warnings.filterwarnings('ignore')
# Add project path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config.config import Config
from data_loader.data_loader import DataLoader
from visualization.visualization_manager import VisualisationManager
# ============================================
# PAGE CONFIGURATION
# ============================================
# Configure the Streamlit page; must run before any other st.* call renders.
st.set_page_config(
    page_title="TimeFlow Pro - Data Analysis and Preprocessing",
    page_icon="๐Ÿ“Š",
    layout="wide",
    initial_sidebar_state="expanded"
)
# ============================================
# STATE MANAGEMENT CLASS
# ============================================
class StreamlitApp:
"""Main Streamlit application class"""
def __init__(self):
    """Initialise session state and reset the per-run handles."""
    self.init_session_state()
    # Populated later, once the pipeline is configured and executed.
    for handle in ('config', 'pipeline', 'data'):
        setattr(self, handle, None)
def init_session_state(self):
    """Seed st.session_state with defaults for every key the app uses.

    Existing keys are left untouched so values survive Streamlit reruns.
    """
    defaults = {
        'pipeline_completed': False,
        'processed_data': None,
        'modeling_data': None,
        'current_step': 1,
        'uploaded_file': None,
        'plots_path': None,
        'available_plots': {},
        'synthetic_data_generated': False,
        'auto_pipeline_ready': False,
        'quick_test_mode': False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
    # Built lazily: only compute the default config when it is missing.
    if 'config_params' not in st.session_state:
        st.session_state.config_params = self.get_default_config()
def get_default_config(self):
    """Return the default pipeline configuration as a plain dict."""
    defaults = dict(
        # Data source and output
        data_path='',
        results_dir='streamlit_results',
        target_column='',
        # Time window
        start_year=1970,
        end_year=1990,
        # Feature engineering
        max_lags=5,
        seasonal_period=365,
        rolling_windows=[7, 30, 90],
        expanding_windows=[30, 90],
        # Train / validation / test splitting
        test_size=0.2,
        validation_size=0.1,
        # Preprocessing
        scaling_method='robust',
        feature_selection_method='correlation',
        max_features=20,
        missing_threshold=0.3,
        outlier_method='iqr',
        enable_validation=True,
        split_method='time_based',
    )
    return defaults
def create_sidebar(self):
    """Render the sidebar: step navigation, quick-test launcher and info."""
    with st.sidebar:
        st.title("๐ŸŽฏ TimeFlowPro")
        st.markdown("---")
        # Navigation: one button per wizard step; the active step is primary.
        st.subheader("Navigation")
        steps = [
            (1, "๐Ÿ“ Data Loading"),
            (2, "โš™๏ธ Configuration"),
            (3, "๐Ÿ” Data Analysis"),
            (4, "โšก Pipeline Execution"),
            (5, "๐Ÿ“Š Results"),
            (6, "๐Ÿ“ˆ Visualisations"),
            (7, "๐Ÿค– Modelling"),
        ]
        for idx, label in steps:
            is_active = st.session_state.current_step == idx
            if st.button(
                f"{label}",
                key=f"nav_{idx}",
                type="primary" if is_active else "secondary",
                width='stretch'
            ):
                st.session_state.current_step = idx
                st.rerun()
        st.markdown("---")
        # Quick start with synthetic data
        st.subheader("โšก Quick Test")
        if st.button("๐Ÿš€ Quick Start with Synthetic Data",
                     type="primary",
                     width='stretch',
                     help="Generate synthetic data and run pipeline immediately"):
            st.session_state.quick_test_mode = True
            st.session_state.current_step = 1
            st.rerun()
        st.markdown("---")
        # Project information
        st.subheader("๐Ÿ“ˆ About the Project")
        st.info("""
        TimeFlow Pro - Data Analysis and Preprocessing.
        **New Features:**
        - Synthetic data generation for testing
        - Automatic pipeline execution
        - Quick testing without file upload
        **Standard Features:**
        - Missing data analysis and processing
        - Outlier detection
        - Feature engineering
        - Stationarity analysis
        - Data scaling
        - Feature selection
        """)
        # Progress indicator
        if st.session_state.pipeline_completed:
            st.success("โœ… Pipeline completed")
        else:
            st.warning("โš ๏ธ Pipeline not started")
        # Quick test indicator
        if st.session_state.quick_test_mode:
            st.info("โšก Quick test mode active")
def generate_synthetic_data(self, n_days=1095, include_seasonality=True, include_trend=True,
                            include_noise=True, include_exogenous=True, data_type="complex",
                            random_state=None):
    """
    Generate synthetic daily time-series data for testing.

    Args:
        n_days (int): Number of days of data (capped per ``data_type``)
        include_seasonality (bool): Include seasonality
        include_trend (bool): Include trend
        include_noise (bool): Include noise
        include_exogenous (bool): Include exogenous variables
        data_type (str): Data type (simple, medium, complex)
        random_state (int | None): Optional seed for reproducible output.
            ``None`` (the default) keeps the original non-deterministic
            behaviour, so existing callers are unaffected.

    Returns:
        pd.DataFrame | None: Generated synthetic data, or None on error.
    """
    try:
        # Seed the global RNG only when explicitly requested.
        if random_state is not None:
            np.random.seed(random_state)
        # Base parameters depending on data type
        if data_type == "simple":
            n_days = min(n_days, 365)  # Limit for simple type
            trend_strength = 0.005
            noise_std = 2
            include_exogenous = False
        elif data_type == "medium":
            n_days = min(n_days, 730)  # Limit for medium type
            trend_strength = 0.01
            noise_std = 5
            include_exogenous = True
        else:  # complex
            n_days = min(n_days, 1095)  # Limit for complex type
            trend_strength = 0.02
            noise_std = 10
            include_exogenous = True
        # Create dates (relative to "now", not a fixed calendar year)
        start_date = datetime.now() - timedelta(days=n_days)
        dates = pd.date_range(start=start_date, periods=n_days, freq='D')
        # Base trend
        if include_trend:
            trend = np.linspace(0, trend_strength * n_days, n_days)
        else:
            trend = np.zeros(n_days)
        # Seasonality: annual + quarterly + monthly + weekly sine components
        if include_seasonality:
            seasonal = 10 * np.sin(2 * np.pi * np.arange(n_days) / 365)
            seasonal += 5 * np.sin(2 * np.pi * np.arange(n_days) / 90)
            seasonal += 3 * np.sin(2 * np.pi * np.arange(n_days) / 30)
            seasonal += 2 * np.sin(2 * np.pi * np.arange(n_days) / 7)
        else:
            seasonal = np.zeros(n_days)
        # Main target variable (water consumption)
        base_value = 100
        raskhodvoda = base_value + trend + seasonal
        # Add noise
        if include_noise:
            noise = np.random.normal(0, noise_std, n_days)
            raskhodvoda += noise
        # Create DataFrame
        data = pd.DataFrame({
            'date': dates,
            'raskhodvoda': raskhodvoda
        })
        # Add exogenous variables
        if include_exogenous:
            # Temperature (seasonal)
            data['temperature'] = 15 + 10 * np.sin(2 * np.pi * np.arange(n_days) / 365) + np.random.normal(0, 3, n_days)
            # Precipitation (random spikes plus a seasonal component, clipped at 0)
            precipitation = np.random.exponential(2, n_days)
            precipitation_seasonality = 5 * np.sin(2 * np.pi * np.arange(n_days) / 365 + np.pi/2)
            data['precipitation'] = np.maximum(0, precipitation + precipitation_seasonality)
            # Pressure
            data['pressure'] = 760 + np.random.normal(0, 5, n_days)
            # Humidity
            data['humidity'] = 60 + 20 * np.sin(2 * np.pi * np.arange(n_days) / 180) + np.random.normal(0, 10, n_days)
            # Electricity consumption (correlated with target variable)
            data['electricity_consumption'] = raskhodvoda * 0.8 + np.random.normal(0, 5, n_days)
            # Day of week (categorical variable)
            data['day_of_week'] = dates.dayofweek
            data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)
            # Holidays (random, ~5% of days)
            holidays = np.random.choice([0, 1], size=n_days, p=[0.95, 0.05])
            data['is_holiday'] = holidays
            # Lag variables
            for lag in [1, 7, 30]:
                data[f'raskhodvoda_lag_{lag}'] = data['raskhodvoda'].shift(lag)
            # Moving averages
            for window in [7, 30]:
                data[f'raskhodvoda_ma_{window}'] = data['raskhodvoda'].rolling(window=window).mean()
        # Add missing values for realism (5% random missing values per column)
        for col in data.columns:
            if col != 'date':  # Don't add missing values to dates
                mask = np.random.random(len(data)) < 0.05
                data.loc[mask, col] = np.nan
        # Add outliers (1% of data): shift the value 5 standard deviations
        # away from the column mean. The column mean/std are computed once
        # per column (the original recomputed them for every outlier index,
        # which was O(n) per cell and let injected outliers skew later ones).
        numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
        for col in numeric_cols:
            outlier_mask = np.random.random(len(data)) < 0.01
            if outlier_mask.any():
                mean_val = data[col].mean(skipna=True)
                std_val = data[col].std(skipna=True)
                if not np.isnan(mean_val) and not np.isnan(std_val) and std_val > 0:
                    for idx in data.index[outlier_mask]:
                        data.at[idx, col] = mean_val + 5 * std_val * np.random.choice([-1, 1])
        # Reset index
        data.reset_index(drop=True, inplace=True)
        st.session_state.synthetic_data_generated = True
        return data
    except Exception as e:
        st.error(f"Error generating synthetic data: {str(e)}")
        import traceback
        st.error(f"Error traceback: {traceback.format_exc()}")
        return None
def quick_test_pipeline(self):
    """Quick pipeline execution with synthetic data.

    Generates a medium-complexity synthetic dataset, writes it to a
    temporary CSV, builds a ``Config``/pipeline from the session
    parameters, runs the full preprocessing pipeline, stores the results
    in ``st.session_state`` and jumps to the Results step on success.
    """
    with st.spinner("๐Ÿš€ Running quick test with synthetic data..."):
        try:
            # Step 1: Generate synthetic data
            st.info("Step 1: Generating synthetic data...")
            synthetic_data = self.generate_synthetic_data(
                n_days=365,  # Reduced for speed
                include_seasonality=True,
                include_trend=True,
                include_noise=True,
                include_exogenous=True,
                data_type="medium"  # Changed to medium for balance between speed and quality
            )
            if synthetic_data is None:
                st.error("Failed to generate synthetic data")
                return
            # Save data to temporary file so the pipeline can read it from disk
            temp_file = "temp_synthetic_data.csv"
            synthetic_data.to_csv(temp_file, index=False)
            # Step 2: Configure settings
            st.info("Step 2: Configuring settings...")
            config_params = st.session_state.config_params.copy()
            # NOTE(review): start_year/end_year are hard-coded to 2020-2023,
            # but the synthetic dates are generated relative to "now" — confirm
            # the pipeline does not filter all rows out once dates drift
            # outside that window.
            config_params.update({
                'data_path': temp_file,
                'target_column': 'raskhodvoda',
                'start_year': 2020,
                'end_year': 2023,
                'max_lags': 7,
                'seasonal_period': 365,
                'rolling_windows': [7, 30],
                'expanding_windows': [30],
                'test_size': 0.2,
                'validation_size': 0.1,
                'scaling_method': 'robust',
                'feature_selection_method': 'correlation',
                'max_features': 10,  # Reduced for speed
                'missing_threshold': 0.3,
                'outlier_method': 'iqr',
                'enable_validation': True,
                'split_method': 'time_based'
            })
            # Step 3: Create and run pipeline
            st.info("Step 3: Creating and running pipeline...")
            # Create progress bar
            progress_bar = st.progress(0)
            status_text = st.empty()
            # Update session state so later wizard steps see this data/config
            st.session_state.config_params = config_params
            st.session_state.uploaded_file = temp_file
            st.session_state.data_preview = synthetic_data
            # Create configuration
            status_text.text("Creating configuration...")
            progress_bar.progress(20)
            config = Config(**config_params)
            # Create pipeline
            status_text.text("Initialising pipeline...")
            progress_bar.progress(40)
            self.pipeline = EnhancedDataPreprocessingPipeline(config)
            # Run pipeline
            status_text.text("Running preprocessing pipeline...")
            progress_bar.progress(60)
            processed_data = self.pipeline.run_full_pipeline(
                use_synthetic=False,  # Synthetic data already loaded
                save_intermediate=True,
                create_reports=True
            )
            # Update progress
            if processed_data is not None:
                status_text.text("Getting data for modelling...")
                progress_bar.progress(80)
                modeling_data = self.pipeline.get_final_data_for_modelling()
                # Save to session state
                st.session_state.processed_data = processed_data
                st.session_state.modeling_data = modeling_data
                st.session_state.pipeline_completed = True
                st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                st.session_state.auto_pipeline_ready = True
                # Collect information about available plots
                # (method defined elsewhere in this class)
                self.collect_available_plots()
                # Completion
                status_text.text("Completing...")
                progress_bar.progress(100)
                st.success("โœ… Quick test completed successfully!")
                # Show results
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Records generated", f"{synthetic_data.shape[0]:,}")
                with col2:
                    st.metric("Processed data", f"{processed_data.shape[0]:,} rows")
                with col3:
                    st.metric("Final features", f"{processed_data.shape[1]} columns")
                # Automatic transition to results
                st.session_state.current_step = 5
                st.rerun()
            else:
                st.error("โŒ Error running pipeline")
                st.error("Check logs for more information")
        except Exception as e:
            st.error(f"โŒ Error during quick test: {str(e)}")
            import traceback
            st.error(f"Error traceback: {traceback.format_exc()}")
def render_step_1_data_loading(self):
    """Step 1: Data Loading.

    Left column: upload a CSV/Excel/Parquet file, preview it and
    auto-select a target column. Right column: generate synthetic demo
    data instead. Quick-test mode bypasses both and runs the pipeline.
    """
    st.header("๐Ÿ“ Data Loading")
    # Check quick test mode
    if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
        st.info("โšก Quick test mode activated. Generating synthetic data and running pipeline...")
        self.quick_test_pipeline()
        return
    col1, col2 = st.columns([2, 1])
    with col1:
        # File upload
        uploaded_file = st.file_uploader(
            "Upload CSV file with data",
            type=['csv', 'xlsx', 'parquet'],
            help="Supported formats: CSV, Excel, Parquet"
        )
        if uploaded_file is not None:
            # Save file temporarily (extension taken from the uploaded name)
            file_path = f"temp_data.{uploaded_file.name.split('.')[-1]}"
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            st.session_state.uploaded_file = file_path
            st.session_state.config_params['data_path'] = file_path
            # Load and preview data
            try:
                if file_path.endswith('.csv'):
                    data = pd.read_csv(file_path)
                elif file_path.endswith('.xlsx'):
                    data = pd.read_excel(file_path)
                elif file_path.endswith('.parquet'):
                    data = pd.read_parquet(file_path)
                else:
                    st.error("Unsupported file format")
                    return
                st.session_state.data_preview = data
                # Data preview
                st.subheader("Data Preview")
                st.dataframe(data.head(50), width='stretch')
                # Basic information
                st.subheader("๐Ÿ“‹ Data Information")
                info_col1, info_col2, info_col3 = st.columns(3)
                with info_col1:
                    st.metric("Rows", data.shape[0])
                    st.metric("Columns", data.shape[1])
                with info_col2:
                    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
                    st.metric("Numeric columns", len(numeric_cols))
                    categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
                    st.metric("Categorical columns", len(categorical_cols))
                with info_col3:
                    total_missing = data.isnull().sum().sum()
                    missing_percentage = (total_missing / (data.shape[0] * data.shape[1])) * 100
                    st.metric("Missing values", f"{total_missing:,}")
                    st.metric("Missing percentage", f"{missing_percentage:.2f}%")
                # Automatic target column selection if not set
                if 'target_column' not in st.session_state.config_params or not st.session_state.config_params['target_column']:
                    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
                    if numeric_columns:
                        # Automatically select column with typical name
                        target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'ั€ะฐัั…ะพะด']
                        selected_target = None
                        for col in numeric_columns:
                            if any(keyword in col.lower() for keyword in target_keywords):
                                selected_target = col
                                break
                        # If not found by keywords, take last numeric column
                        if not selected_target and numeric_columns:
                            selected_target = numeric_columns[-1]
                        if selected_target:
                            st.session_state.config_params['target_column'] = selected_target
                            st.info(f"Target variable automatically selected: **{selected_target}**")
                            st.info("You can change it in the next step")
                # Button to proceed to next step
                if st.button("โžก๏ธ Go to Configuration", type="primary", width='stretch'):
                    st.session_state.current_step = 2
                    st.rerun()
            except Exception as e:
                st.error(f"Error loading data: {str(e)}")
    with col2:
        # Demo data
        st.subheader("๐ŸŽฎ Demo Mode")
        demo_option = st.radio(
            "Choose demo data:",
            ["Synthetic Data", "Time Series Example"]
        )
        # Synthetic data settings
        with st.expander("โš™๏ธ Synthetic Data Settings", expanded=False):
            data_type = st.selectbox(
                "Data Type",
                options=["Simple", "Medium", "Complex"],
                index=1,
                help="Simple: 1 year, few features\nMedium: 2 years, main features\nComplex: 3 years, all features"
            )
            n_days = st.slider(
                "Number of days",
                min_value=90,
                max_value=1825,
                value=1095,
                step=30,
                help="Number of days in synthetic data"
            )
            include_trend = st.checkbox("Include trend", value=True)
            include_seasonality = st.checkbox("Include seasonality", value=True)
            include_noise = st.checkbox("Include noise", value=True)
            include_exogenous = st.checkbox("Include additional features", value=True)
        if st.button("Generate and Load Synthetic Data", width='stretch'):
            with st.spinner("Creating synthetic data..."):
                try:
                    # Data type mapping (UI label -> generator keyword)
                    data_type_map = {
                        "Simple": "simple",
                        "Medium": "medium",
                        "Complex": "complex"
                    }
                    # Generate synthetic data
                    synthetic_data = self.generate_synthetic_data(
                        n_days=n_days,
                        include_seasonality=include_seasonality,
                        include_trend=include_trend,
                        include_noise=include_noise,
                        include_exogenous=include_exogenous,
                        data_type=data_type_map[data_type]
                    )
                    if synthetic_data is not None:
                        st.session_state.data_preview = synthetic_data
                        st.session_state.uploaded_file = "synthetic_data"
                        st.session_state.config_params['data_path'] = 'synthetic_data'
                        # Automatically select target variable
                        if 'raskhodvoda' in synthetic_data.columns:
                            st.session_state.config_params['target_column'] = 'raskhodvoda'
                        st.success(f"โœ… Synthetic data created: {synthetic_data.shape[0]} rows, {synthetic_data.shape[1]} columns")
                        # Show preview
                        st.subheader("Synthetic Data Preview")
                        st.dataframe(synthetic_data.head(20), width='stretch')
                        # Statistics
                        st.subheader("๐Ÿ“Š Synthetic Data Statistics")
                        stat_col1, stat_col2 = st.columns(2)
                        with stat_col1:
                            st.metric("Period", f"{synthetic_data.shape[0]} days")
                            # CORRECTION: convert dates to strings for display
                            if 'date' in synthetic_data.columns:
                                min_date = synthetic_data['date'].min()
                                max_date = synthetic_data['date'].max()
                                if isinstance(min_date, (pd.Timestamp, datetime)):
                                    st.text(f"Start: {min_date.strftime('%Y-%m-%d')}")
                                else:
                                    st.text(f"Start: {str(min_date)}")
                                if isinstance(max_date, (pd.Timestamp, datetime)):
                                    st.text(f"End: {max_date.strftime('%Y-%m-%d')}")
                                else:
                                    st.text(f"End: {str(max_date)}")
                        with stat_col2:
                            if 'raskhodvoda' in synthetic_data.columns:
                                st.metric("Average consumption", f"{synthetic_data['raskhodvoda'].mean():.2f}")
                                st.metric("Max consumption", f"{synthetic_data['raskhodvoda'].max():.2f}")
                                st.metric("Min consumption", f"{synthetic_data['raskhodvoda'].min():.2f}")
                        # Quick pipeline execution
                        st.markdown("---")
                        # NOTE(review): a button rendered inside another
                        # button's branch does not survive Streamlit's rerun,
                        # and the unconditional st.rerun() below discards this
                        # preview immediately — confirm the intended UX.
                        if st.button("๐Ÿš€ Quick Run Pipeline with This Data", type="primary", width='stretch'):
                            st.session_state.quick_test_mode = True
                            st.session_state.auto_pipeline_ready = False
                            st.rerun()
                        st.rerun()
                    else:
                        st.error("Failed to generate synthetic data")
                except Exception as e:
                    st.error(f"Error creating synthetic data: {str(e)}")
        st.markdown("---")
        # Instructions
        st.subheader("๐Ÿ“– Instructions")
        st.markdown("""
        1. Upload CSV file with data **OR**
        2. Generate synthetic data for testing
        3. Check data preview
        4. Target variable will be selected automatically
        5. Go to configuration to specify parameters
        **Data Requirements:**
        - Date in separate column or index
        - Clean column names
        - Time series with regular intervals
        """)
def render_step_2_configuration(self):
    """Step 2: Pipeline Configuration.

    Renders four tabs of widgets bound to ``st.session_state.config_params``
    (basic parameters / data processing / features / temporal parameters)
    plus navigation buttons.
    """
    st.header("โš™๏ธ Pipeline Configuration")
    # Automatic configuration for synthetic data
    if st.session_state.uploaded_file == "synthetic_data" or st.session_state.config_params['data_path'] == 'synthetic_data':
        st.info("โšก Synthetic data detected. Optimised configuration applied.")
        # Automatic parameter setup for synthetic data
        if st.button("Apply Recommended Settings for Synthetic Data", width='stretch'):
            st.session_state.config_params.update({
                'target_column': 'raskhodvoda',
                'max_lags': 7,
                'seasonal_period': 365,
                'rolling_windows': [7, 30, 90],
                'expanding_windows': [30, 90],
                'test_size': 0.2,
                'validation_size': 0.1,
                'scaling_method': 'robust',
                'feature_selection_method': 'correlation',
                'max_features': 15,
                'missing_threshold': 0.3,
                'outlier_method': 'iqr',
                'enable_validation': True
            })
            st.success("Settings applied!")
            st.rerun()
    # Configuration sections
    tab1, tab2, tab3, tab4 = st.tabs([
        "๐Ÿ“Š Basic Parameters",
        "๐Ÿ”ง Data Processing",
        "๐ŸŽฏ Features and Selection",
        "๐Ÿ“ˆ Temporal Parameters"
    ])
    with tab1:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Basic Parameters")
            st.session_state.config_params['results_dir'] = st.text_input(
                "Results Directory",
                value=st.session_state.config_params['results_dir']
            )
            # Target variable: selectbox when data is loaded, free text otherwise
            if hasattr(st.session_state, 'data_preview') and st.session_state.data_preview is not None:
                # Get all data columns
                all_columns = st.session_state.data_preview.columns.tolist()
                # If target variable already set and present in data, use it
                current_target = st.session_state.config_params.get('target_column', '')
                default_index = 0
                if current_target in all_columns:
                    default_index = all_columns.index(current_target)
                elif len(all_columns) > 0:
                    # Try to find suitable default column
                    numeric_columns = st.session_state.data_preview.select_dtypes(include=[np.number]).columns.tolist()
                    if numeric_columns:
                        # Look for columns with typical target variable names
                        target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'ั€ะฐัั…ะพะด']
                        for i, col in enumerate(all_columns):
                            if any(keyword in col.lower() for keyword in target_keywords):
                                default_index = i
                                break
                        # If not found by keywords, take first numeric column
                        if default_index == 0 and numeric_columns[0] in all_columns:
                            default_index = all_columns.index(numeric_columns[0])
                st.session_state.config_params['target_column'] = st.selectbox(
                    "Select Target Variable",
                    options=all_columns,
                    index=default_index,
                    help="Select column to be predicted"
                )
            else:
                # If data not loaded, keep text field
                st.session_state.config_params['target_column'] = st.text_input(
                    "Target Variable",
                    value=st.session_state.config_params.get('target_column', ''),
                    help="Enter target column name"
                )
            st.session_state.config_params['enable_validation'] = st.checkbox(
                "Enable Data Validation",
                value=st.session_state.config_params['enable_validation']
            )
        with col2:
            st.subheader("Data Split")
            # Sliders operate in whole percent; stored values are fractions
            st.session_state.config_params['test_size'] = st.slider(
                "Test Set Size (%)",
                min_value=5,
                max_value=40,
                value=int(st.session_state.config_params['test_size'] * 100),
                step=5,
                format="%d%%"
            ) / 100
            st.session_state.config_params['validation_size'] = st.slider(
                "Validation Set Size (%)",
                min_value=5,
                max_value=30,
                value=int(st.session_state.config_params['validation_size'] * 100),
                step=5,
                format="%d%%"
            ) / 100
            split_methods = ['time_based', 'random']
            st.session_state.config_params['split_method'] = st.selectbox(
                "Split Method",
                options=split_methods,
                index=split_methods.index(st.session_state.config_params['split_method'])
            )
    with tab2:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Missing Value Processing")
            st.session_state.config_params['missing_threshold'] = st.slider(
                "Missing Value Column Removal Threshold",
                min_value=0.0,
                max_value=0.5,
                value=st.session_state.config_params['missing_threshold'],
                step=0.05,
                format="%.2f"
            )
            st.subheader("Outlier Processing")
            outlier_methods = ['iqr', 'zscore', 'isolation_forest']
            st.session_state.config_params['outlier_method'] = st.selectbox(
                "Outlier Detection Method",
                options=outlier_methods,
                index=outlier_methods.index(st.session_state.config_params['outlier_method'])
            )
        with col2:
            st.subheader("Data Scaling")
            scaling_methods = ['robust', 'standard', 'minmax', 'none']
            st.session_state.config_params['scaling_method'] = st.selectbox(
                "Scaling Method",
                options=scaling_methods,
                index=scaling_methods.index(st.session_state.config_params['scaling_method'])
            )
            if st.session_state.config_params['scaling_method'] == 'none':
                st.info("โš ๏ธ Data will not be scaled")
    with tab3:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Feature Engineering")
            st.session_state.config_params['max_lags'] = st.slider(
                "Maximum Number of Lags",
                min_value=1,
                max_value=20,
                value=st.session_state.config_params['max_lags'],
                step=1
            )
            rolling_windows_input = st.text_input(
                "Windows for Rolling Statistics (comma-separated)",
                value=', '.join(map(str, st.session_state.config_params['rolling_windows']))
            )
            # Parse "7, 30, 90"-style input; non-numeric tokens are dropped
            if rolling_windows_input:
                st.session_state.config_params['rolling_windows'] = [
                    int(x.strip()) for x in rolling_windows_input.split(',') if x.strip().isdigit()
                ]
        with col2:
            st.subheader("Feature Selection")
            feature_methods = ['correlation', 'variance', 'mutual_info', 'rf', 'none']
            st.session_state.config_params['feature_selection_method'] = st.selectbox(
                "Feature Selection Method",
                options=feature_methods,
                index=feature_methods.index(st.session_state.config_params['feature_selection_method'])
            )
            st.session_state.config_params['max_features'] = st.slider(
                "Maximum Number of Features",
                min_value=5,
                max_value=100,
                value=st.session_state.config_params['max_features'],
                step=5
            )
    with tab4:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Temporal Parameters")
            # If there is data for preview, derive the year range from it.
            # BUGFIX: the original left `date_col` unbound when the data used
            # a DatetimeIndex (NameError on the later `if date_col:` check)
            # and crashed when data_preview existed but was None.
            preview = getattr(st.session_state, 'data_preview', None)
            if preview is not None:
                date_col = None
                dates = None
                if 'date' in preview.columns:
                    date_col = 'date'
                elif isinstance(preview.index, pd.DatetimeIndex):
                    dates = preview.index
                else:
                    # Try to find date column
                    date_cols = [col for col in preview.columns
                                 if 'date' in col.lower() or 'time' in col.lower()]
                    date_col = date_cols[0] if date_cols else None
                if date_col:
                    if date_col in preview.columns:
                        dates = pd.to_datetime(preview[date_col])
                    else:
                        dates = preview.index
                if dates is not None and len(dates) > 0:
                    min_date = dates.min()
                    max_date = dates.max()
                    col1_date, col2_date = st.columns(2)
                    with col1_date:
                        st.session_state.config_params['start_year'] = st.number_input(
                            "Start Year",
                            min_value=1900,
                            max_value=2100,
                            value=min_date.year,
                            step=1
                        )
                    with col2_date:
                        st.session_state.config_params['end_year'] = st.number_input(
                            "End Year",
                            min_value=1900,
                            max_value=2100,
                            value=max_date.year,
                            step=1
                        )
        with col2:
            st.subheader("Seasonality")
            st.session_state.config_params['seasonal_period'] = st.selectbox(
                "Seasonal Period",
                options=[7, 30, 90, 365, 12, 24],
                index=[7, 30, 90, 365, 12, 24].index(
                    st.session_state.config_params['seasonal_period']
                ) if st.session_state.config_params['seasonal_period'] in [7, 30, 90, 365, 12, 24] else 0
            )
            expanding_windows_input = st.text_input(
                "Windows for Expanding Statistics (comma-separated)",
                value=', '.join(map(str, st.session_state.config_params['expanding_windows']))
            )
            if expanding_windows_input:
                st.session_state.config_params['expanding_windows'] = [
                    int(x.strip()) for x in expanding_windows_input.split(',') if x.strip().isdigit()
                ]
    # Navigation buttons
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("โฌ…๏ธ Back to Loading", width='stretch'):
            st.session_state.current_step = 1
            st.rerun()
    with col3:
        if st.button("Go to Analysis โžก๏ธ", type="primary", width='stretch'):
            st.session_state.current_step = 3
            st.rerun()
def render_step_3_data_analysis(self):
    """Step 3: Data Analysis.

    Shows descriptive statistics, distributions, a temporal view and a
    missing-value / outlier summary for the previewed data.
    """
    st.header("๐Ÿ” Data Analysis")
    if not hasattr(st.session_state, 'data_preview') or st.session_state.data_preview is None:
        st.warning("First load data in Step 1")
        if st.button("Return to Data Loading"):
            st.session_state.current_step = 1
            st.rerun()
        return
    data = st.session_state.data_preview
    # Computed once: shared by the Distributions, Temporal Analysis and
    # Outlier tabs (previously defined inside tab2 and reused across tabs
    # only by accident of scope).
    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    # Analysis tabs
    tab1, tab2, tab3, tab4 = st.tabs([
        "๐Ÿ“ˆ Statistics",
        "๐Ÿ” Distributions",
        "๐Ÿ“… Temporal Analysis",
        "โ“ Missing Values and Outliers"
    ])
    with tab1:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Basic Statistics")
            st.dataframe(data.describe().round(2), width='stretch')
        with col2:
            st.subheader("Data Types")
            dtype_info = pd.DataFrame({
                'Column': data.columns,
                'Type': data.dtypes.values,
                'Unique Values': [data[col].nunique() for col in data.columns]
            })
            st.dataframe(dtype_info, width='stretch')
    with tab2:
        # Select column for visualisation
        if numeric_cols:
            selected_col = st.selectbox(
                "Select Column for Analysis",
                options=numeric_cols
            )
            col1, col2 = st.columns(2)
            with col1:
                # Histogram
                fig = px.histogram(
                    data,
                    x=selected_col,
                    title=f"Distribution of {selected_col}",
                    nbins=50,
                    color_discrete_sequence=['#636EFA']
                )
                st.plotly_chart(fig, width='stretch')
            with col2:
                # Box plot
                fig = go.Figure()
                fig.add_trace(go.Box(
                    y=data[selected_col],
                    name=selected_col,
                    boxpoints='outliers',
                    marker_color='#EF553B'
                ))
                fig.update_layout(
                    title=f"Box plot {selected_col}",
                    yaxis_title=selected_col
                )
                st.plotly_chart(fig, width='stretch')
        else:
            st.warning("No numeric columns for distribution analysis")
    with tab3:
        # Time series analysis
        date_cols = [col for col in data.columns if 'date' in col.lower()]
        if date_cols or isinstance(data.index, pd.DatetimeIndex):
            if date_cols:
                date_col = date_cols[0]
                dates = pd.to_datetime(data[date_col])
            else:
                dates = data.index
                date_col = 'index'
            # Check for numeric columns
            if len(numeric_cols) > 0:
                # Select column for time series
                ts_col = st.selectbox(
                    "Select Column for Time Series",
                    options=numeric_cols
                )
                # Time series
                fig = go.Figure()
                fig.add_trace(go.Scatter(
                    x=dates,
                    y=data[ts_col],
                    mode='lines',
                    name=ts_col,
                    line=dict(color='#636EFA', width=2)
                ))
                fig.update_layout(
                    title=f"Time Series: {ts_col}",
                    xaxis_title="Date",
                    yaxis_title=ts_col,
                    hovermode='x unified'
                )
                st.plotly_chart(fig, width='stretch')
                # Seasonality (if sufficient data)
                if len(dates) > 30:
                    # Monthly trend.
                    # BUGFIX: `dates` is either a Series (month via .dt.month)
                    # or a DatetimeIndex (month via .month); the original
                    # checked hasattr(dates, 'month') but then used
                    # dates.dt.month, so the chart was silently skipped for
                    # Series and raised AttributeError for a DatetimeIndex.
                    months = dates.dt.month if hasattr(dates, 'dt') else dates.month
                    monthly_data = data.groupby(months)[ts_col].mean()
                    fig2 = px.bar(
                        x=monthly_data.index,
                        y=monthly_data.values,
                        title=f"Monthly Seasonality: {ts_col}",
                        labels={'x': 'Month', 'y': 'Average Value'}
                    )
                    st.plotly_chart(fig2, width='stretch')
            else:
                st.warning("No numeric columns for temporal analysis")
        else:
            st.info("For temporal analysis, date column or DatetimeIndex required")
    with tab4:
        col1, col2 = st.columns(2)
        with col1:
            # Missing value analysis
            st.subheader("Missing Values")
            missing_data = data.isnull().sum()
            missing_percentage = (missing_data / len(data)) * 100
            missing_df = pd.DataFrame({
                'Column': missing_data.index,
                'Missing Count': missing_data.values,
                'Missing Percentage': missing_percentage.values
            }).sort_values('Missing Count', ascending=False)
            st.dataframe(missing_df, width='stretch')
            # Missing values visualisation
            if missing_data.sum() > 0:
                fig = px.bar(
                    missing_df,
                    x='Column',
                    y='Missing Percentage',
                    title="Missing Percentage by Column",
                    color='Missing Percentage',
                    color_continuous_scale='Reds'
                )
                st.plotly_chart(fig, width='stretch')
        with col2:
            # Quick outlier analysis (IQR rule over the first 5 numeric cols)
            st.subheader("Quick Outlier Analysis")
            if len(numeric_cols) > 0:
                outlier_summary = []
                for col in numeric_cols[:5]:  # Limit to 5 columns for speed
                    q1 = data[col].quantile(0.25)
                    q3 = data[col].quantile(0.75)
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
                    outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
                    outlier_pct = (len(outliers) / len(data)) * 100
                    outlier_summary.append({
                        'Column': col,
                        'Outliers': len(outliers),
                        'Percentage': f"{outlier_pct:.2f}%"
                    })
                outlier_df = pd.DataFrame(outlier_summary)
                st.dataframe(outlier_df, width='stretch')
            else:
                st.warning("No numeric columns for outlier analysis")
    # Navigation buttons
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("โฌ…๏ธ Back to Configuration", width='stretch'):
            st.session_state.current_step = 2
            st.rerun()
    with col3:
        if st.button("Run Pipeline โžก๏ธ", type="primary", width='stretch'):
            st.session_state.current_step = 4
            st.rerun()
def render_step_4_pipeline_execution(self):
    """Step 4: Pipeline Execution.

    Checks readiness (data loaded, target selected), shows the current
    configuration and execution options, then runs the preprocessing
    pipeline with a progress bar and stores the results in session state.
    """
    st.header("⚡ Pipeline Execution")
    # Readiness check. NOTE: the original also kept a `ready_to_run` flag,
    # but it was never read — the `issues` list alone drives the UI.
    issues = []
    data_path = st.session_state.config_params['data_path']
    # 'demo' / 'synthetic_data' sources need no uploaded file.
    uses_synthetic_source = data_path in ('demo', 'synthetic_data')
    if not st.session_state.uploaded_file and not uses_synthetic_source:
        issues.append("Data not loaded")
    if not st.session_state.config_params['target_column']:
        issues.append("Target variable not selected")
    # Automatic synthetic data generation if quick test enabled
    if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
        st.info("⚡ Quick test mode activated. Generating synthetic data...")
        self.quick_test_pipeline()
        return
    # Display blocking issues and offer navigation shortcuts instead of running.
    if issues:
        st.error("⚠️ Fix before running:")
        for issue in issues:
            st.write(f"- {issue}")
        # Suggest using synthetic data
        st.markdown("---")
        st.subheader("🎮 Quick Solution")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("Generate Synthetic Data", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("To Data Loading", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        col3, col4 = st.columns(2)
        with col3:
            if st.button("To Configuration", width='stretch'):
                st.session_state.current_step = 2
                st.rerun()
        return
    # Display configuration summary (read-only metrics).
    st.subheader("Execution Configuration")
    config_col1, config_col2 = st.columns(2)
    with config_col1:
        st.metric("Target Variable", st.session_state.config_params['target_column'])
        st.metric("Test Set", f"{st.session_state.config_params['test_size']*100:.0f}%")
        st.metric("Scaling Method", st.session_state.config_params['scaling_method'])
    with config_col2:
        st.metric("Max Lags", st.session_state.config_params['max_lags'])
        st.metric("Feature Selection Method", st.session_state.config_params['feature_selection_method'])
        st.metric("Validation Enabled", "Yes" if st.session_state.config_params['enable_validation'] else "No")
    # Execution options
    st.subheader("Execution Options")
    col1, col2 = st.columns(2)
    with col1:
        use_synthetic = st.checkbox(
            "Use Synthetic Data",
            # Pre-checked and locked when the data source is already synthetic.
            value=uses_synthetic_source,
            disabled=uses_synthetic_source
        )
        save_intermediate = st.checkbox(
            "Save Intermediate Results",
            value=True
        )
    with col2:
        create_reports = st.checkbox(
            "Create Reports",
            value=True
        )
        # NOTE(review): this flag is rendered but not forwarded to
        # run_full_pipeline below — presumably the pipeline always creates
        # visualisations; confirm against EnhancedDataPreprocessingPipeline.
        create_visualisations = st.checkbox(
            "Create Visualisations",
            value=True,
            help="Create data analysis plots"
        )
    # Run button
    if st.button("🚀 Run Preprocessing Pipeline", type="primary", width='stretch'):
        # Create progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()
        try:
            # Create configuration
            status_text.text("Creating configuration...")
            progress_bar.progress(10)
            config = Config(**st.session_state.config_params)
            # Create pipeline
            status_text.text("Initialising pipeline...")
            progress_bar.progress(20)
            self.pipeline = EnhancedDataPreprocessingPipeline(config)
            # Determine whether to use synthetic data
            use_synthetic_flag = use_synthetic or uses_synthetic_source
            # Run pipeline
            status_text.text("Running preprocessing pipeline...")
            progress_bar.progress(30)
            processed_data = self.pipeline.run_full_pipeline(
                use_synthetic=use_synthetic_flag,
                save_intermediate=save_intermediate,
                create_reports=create_reports
            )
            # Update progress
            if processed_data is not None:
                status_text.text("Getting data for modelling...")
                progress_bar.progress(80)
                modeling_data = self.pipeline.get_final_data_for_modelling()
                # Save to session state
                st.session_state.processed_data = processed_data
                st.session_state.modeling_data = modeling_data
                st.session_state.pipeline_completed = True
                st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                # Collect information about available plots
                self.collect_available_plots()
                # Completion
                status_text.text("Completing...")
                progress_bar.progress(100)
                st.success("✅ Pipeline completed successfully!")
                # Show results
                col1, col2, col3 = st.columns(3)
                with col1:
                    if hasattr(self.pipeline, 'results') and 'data_loading' in self.pipeline.results:
                        st.metric("Original Data", f"{self.pipeline.results['data_loading']['shape'][0]:,} rows")
                    else:
                        st.metric("Original Data", "Information unavailable")
                with col2:
                    st.metric("Processed Data", f"{processed_data.shape[0]:,} rows")
                with col3:
                    st.metric("Final Features", f"{processed_data.shape[1]} columns")
                # NOTE(review): a button nested inside another button's branch
                # is lost on rerun in Streamlit — clicking it may appear to do
                # nothing; users can still navigate via the sidebar/step 5.
                if st.button("📊 Go to Results", type="primary", width='stretch'):
                    st.session_state.current_step = 5
                    st.rerun()
            else:
                st.error("❌ Error executing pipeline")
                st.error("Check logs for more information")
        except Exception as e:
            # Reset the progress UI and surface the full traceback.
            progress_bar.progress(0)
            status_text.text("")
            st.error(f"❌ Error: {str(e)}")
            st.exception(e)
    # Back button
    if st.button("⬅️ Back to Analysis", width='stretch'):
        st.session_state.current_step = 3
        st.rerun()
def collect_available_plots(self):
    """Scan the plots directory and index generated PNG files by category.

    Walks ``st.session_state.plots_path`` (top level and subfolders),
    matches files against per-category name patterns, and stores the
    result in ``st.session_state.available_plots`` as
    ``{category: [{'path', 'name', 'rel_path', 'size'}, ...]}``.
    PNG files not claimed by any category are grouped under 'other'.
    """
    import fnmatch  # local import: only this method needs wildcard matching
    if not st.session_state.plots_path or not os.path.exists(st.session_state.plots_path):
        st.session_state.available_plots = {}
        return
    # Expected plot file names (or glob patterns) per logical category.
    plots_categories = {
        'summary': ['summary_dashboard.png'],
        'missing_values': ['missing_values_analysis.png'],
        'outliers': ['outliers_analysis.png', 'outlier_handling_results.png', 'temporal_outliers.png'],
        'stationarity': ['stationarity_*.png'],
        'data_split': ['data_split.png'],
        'scaling': ['scaling_results.png'],
        'feature_selection': ['feature_selection_*.png'],
        'correlations': ['correlation_matrix.png', 'high_correlations.png', 'target_correlations.png', 'vif_scores.png']
    }
    available_plots = {}
    for category, patterns in plots_categories.items():
        category_plots = []
        # Search for files matching each pattern
        for pattern in patterns:
            if '*' in pattern:
                # Wildcard pattern: glob the top level first...
                search_path = os.path.join(st.session_state.plots_path, pattern)
                files = glob.glob(search_path)
                # ...then walk subfolders for additional matches.
                # BUG FIX: the original substring test
                # `pattern.replace('*', '') in filename` turned
                # 'stationarity_*.png' into the literal 'stationarity_.png',
                # which can never occur inside a real name such as
                # 'stationarity_price.png'. Use proper glob matching.
                for root, dirs, filenames in os.walk(st.session_state.plots_path):
                    for filename in filenames:
                        if fnmatch.fnmatch(filename, pattern):
                            full_path = os.path.join(root, filename)
                            if full_path not in files:
                                files.append(full_path)
            else:
                # Exact file name: check the main folder first.
                file_path = os.path.join(st.session_state.plots_path, pattern)
                if os.path.exists(file_path):
                    files = [file_path]
                else:
                    # Fall back to searching subfolders.
                    files = []
                    for root, dirs, filenames in os.walk(st.session_state.plots_path):
                        for filename in filenames:
                            if filename == pattern:
                                files.append(os.path.join(root, filename))
            for file in files:
                if os.path.exists(file):
                    # Relative path is used for display in the gallery.
                    rel_path = os.path.relpath(file, st.session_state.plots_path)
                    category_plots.append({
                        'path': file,
                        'name': os.path.basename(file),
                        'rel_path': rel_path,
                        'size': os.path.getsize(file)
                    })
        if category_plots:
            available_plots[category] = category_plots
    # Collect any remaining PNG files that no category claimed.
    all_png_files = []
    for root, dirs, filenames in os.walk(st.session_state.plots_path):
        for filename in filenames:
            if filename.endswith('.png'):
                file_path = os.path.join(root, filename)
                # Skip files already assigned to a category.
                already_added = any(
                    plot['path'] == file_path
                    for plots in available_plots.values()
                    for plot in plots
                )
                if not already_added:
                    rel_path = os.path.relpath(file_path, st.session_state.plots_path)
                    all_png_files.append({
                        'path': file_path,
                        'name': filename,
                        'rel_path': rel_path,
                        'size': os.path.getsize(file_path)
                    })
    if all_png_files:
        available_plots['other'] = all_png_files
    st.session_state.available_plots = available_plots
def render_step_5_results(self):
    """Step 5: Results.

    Shows the processed data, selected features, validation summary and
    export options across four tabs. Requires a completed pipeline run
    (otherwise offers quick-start shortcuts and returns early).
    """
    st.header("📊 Pipeline Results")
    # Guard: nothing to show until the pipeline has produced data.
    if not st.session_state.pipeline_completed or st.session_state.processed_data is None:
        st.warning("Pipeline not yet run or not completed successfully")
        # Suggest using quick test
        st.markdown("---")
        st.subheader("🎮 Quick Start")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("Load Data", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        return
    data = st.session_state.processed_data
    modeling_data = st.session_state.modeling_data
    # Results tabs
    tab1, tab2, tab3, tab4 = st.tabs([
        "📈 Data Overview",
        "📊 Feature Analysis",
        "📉 Validation",
        "💾 Export"
    ])
    with tab1:
        st.subheader("Processed Data")
        # Basic information
        info_col1, info_col2, info_col3, info_col4 = st.columns(4)
        with info_col1:
            st.metric("Total Records", f"{data.shape[0]:,}")
        with info_col2:
            st.metric("Total Features", data.shape[1])
        with info_col3:
            numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
            st.metric("Numeric Features", len(numeric_cols))
        with info_col4:
            missing_total = data.isnull().sum().sum()
            st.metric("Missing Values", missing_total)
        # Data preview (first 100 rows only, to keep rendering fast)
        st.subheader("Data Preview")
        st.dataframe(data.head(100), width='stretch')
        # Statistics
        st.subheader("Processed Data Statistics")
        st.dataframe(data.describe().round(4), width='stretch')
    with tab2:
        st.subheader("Feature Analysis")
        if modeling_data and 'feature_names' in modeling_data:
            features = modeling_data['feature_names']
            # Feature list
            st.write(f"**Selected Features:** {len(features)}")
            # Display features as cards, 4 per row
            cols_per_row = 4
            for i in range(0, len(features), cols_per_row):
                cols = st.columns(cols_per_row)
                for j in range(cols_per_row):
                    idx = i + j
                    if idx < len(features):
                        with cols[j]:
                            st.info(features[idx])
            # Feature importance (only if the selector exposes it)
            if (self.pipeline is not None and
                hasattr(self.pipeline, 'feature_selector') and
                self.pipeline.feature_selector is not None):
                # Check for feature_importances_
                if hasattr(self.pipeline.feature_selector, 'feature_importances_'):
                    importances = self.pipeline.feature_selector.feature_importances_
                    if importances is not None and len(importances) > 0:
                        # Truncate to the shorter of the two lists so the
                        # DataFrame columns have equal length.
                        importance_df = pd.DataFrame({
                            'Feature': features[:len(importances)] if len(features) >= len(importances) else features,
                            'Importance': importances[:len(features)] if len(importances) >= len(features) else importances
                        }).sort_values('Importance', ascending=False)
                        st.subheader("Feature Importance")
                        fig = px.bar(
                            importance_df.head(20),
                            x='Importance',
                            y='Feature',
                            orientation='h',
                            title="Top-20 Features by Importance",
                            color='Importance',
                            color_continuous_scale='Viridis'
                        )
                        st.plotly_chart(fig, width='stretch')
        # Correlation matrix (limited for performance)
        if data.shape[1] <= 50:  # Performance limit
            st.subheader("Correlation Matrix (first 20 features)")
            # Select only numeric columns and limit quantity
            numeric_data = data.select_dtypes(include=[np.number])
            if len(numeric_data.columns) > 20:
                numeric_data = numeric_data.iloc[:, :20]
            if not numeric_data.empty and len(numeric_data.columns) > 1:
                corr_matrix = numeric_data.corr()
                fig = go.Figure(data=go.Heatmap(
                    z=corr_matrix.values,
                    x=corr_matrix.columns,
                    y=corr_matrix.columns,
                    colorscale='RdBu',
                    zmin=-1,
                    zmax=1,
                    text=corr_matrix.round(2).values,
                    texttemplate='%{text}',
                    textfont={"size": 10}
                ))
                fig.update_layout(
                    title="Correlation Matrix",
                    width=800,
                    height=800
                )
                st.plotly_chart(fig, width='stretch')
            else:
                st.info("Insufficient data for correlation matrix")
    with tab3:
        st.subheader("Validation Results")
        # The pipeline may expose validation results under several names,
        # so probe each known location in turn.
        validation_available = False
        validation_data = None
        if self.pipeline is not None:
            # Check for results in pipeline
            if hasattr(self.pipeline, 'results'):
                # Look for validation results under different keys
                validation_keys = ['final_validation', 'validation_results', 'validation', 'validation_checks']
                for key in validation_keys:
                    if key in self.pipeline.results:
                        validation_data = self.pipeline.results[key]
                        validation_available = True
                        break
            # If not found in results, check other attributes
            if not validation_available and hasattr(self.pipeline, 'validation_report'):
                validation_data = self.pipeline.validation_report
                validation_available = True
            # Or check processing results
            if not validation_available and hasattr(self.pipeline, 'get_validation_summary'):
                try:
                    validation_data = self.pipeline.get_validation_summary()
                    validation_available = True
                except:
                    # Best-effort probe; absence of a summary is not an error.
                    pass
        # If validation results available
        if validation_available and validation_data:
            st.success("✅ Validation results available")
            # Check validation data format
            if isinstance(validation_data, dict):
                # Display as dictionary
                col1, col2 = st.columns(2)
                with col1:
                    # Status
                    status = validation_data.get('status', 'UNKNOWN')
                    if status == 'PASS':
                        st.success(f"Status: {status}")
                    elif status == 'WARNING':
                        st.warning(f"Status: {status}")
                    else:
                        st.error(f"Status: {status}")
                    # Overall score
                    score = validation_data.get('overall_score', validation_data.get('score', 0))
                    if score:
                        st.metric("Overall Score", f"{score}/100")
                with col2:
                    # Check counters: the checks dict may be nested or flat.
                    if 'checks' in validation_data:
                        checks = validation_data['checks']
                    elif 'basic_checks' in validation_data:
                        checks = validation_data['basic_checks']
                    else:
                        checks = validation_data
                    if isinstance(checks, dict):
                        passed = sum(1 for check in checks.values()
                                     if isinstance(check, dict) and check.get('passed', False))
                        total = len(checks)
                        st.metric("Checks Passed", f"{passed}/{total}")
                # Check details
                st.subheader("Check Details")
                # Determine where checks are located
                checks_to_display = None
                if 'checks' in validation_data:
                    checks_to_display = validation_data['checks']
                elif 'basic_checks' in validation_data:
                    checks_to_display = validation_data['basic_checks']
                elif any(isinstance(v, dict) and 'passed' in v for v in validation_data.values()):
                    checks_to_display = validation_data
                if checks_to_display and isinstance(checks_to_display, dict):
                    for check_name, check_info in checks_to_display.items():
                        if isinstance(check_info, dict):
                            col1, col2, col3 = st.columns([3, 1, 3])
                            with col1:
                                # Check description
                                description = check_info.get('description', check_name)
                                st.write(f"**{description}**")
                            with col2:
                                # Pass/fail status
                                if check_info.get('passed', False):
                                    st.success("✅")
                                else:
                                    st.error("❌")
                            with col3:
                                # Optional message
                                if 'message' in check_info:
                                    st.caption(check_info['message'])
                        else:
                            # Simple format: value is not a dict
                            st.write(f"**{check_name}**: {check_info}")
                else:
                    # Fall back to raw JSON display
                    st.json(validation_data)
            else:
                # If not dictionary, display as is
                st.write("Validation results:")
                st.write(validation_data)
        else:
            # If no validation results, show pipeline information
            st.info("Validation results in report format not available, but pipeline execution statistics presented below")
            # Pipeline stage statistics
            st.subheader("Pipeline Execution Statistics")
            # Stage table (static list; statuses inferred from available data)
            stages = [
                ("Data Loading", "✅ Successful" if data is not None else "❌ Error"),
                ("Missing Value Processing", "✅ Completed"),
                ("Outlier Processing", "✅ Completed"),
                ("Feature Engineering", "✅ Completed"),
                ("Scaling", "✅ Completed"),
                ("Feature Selection", "✅ Completed"),
                ("Data Split", "✅ Completed" if modeling_data else "❌ Not completed")
            ]
            for stage_name, status in stages:
                col1, col2 = st.columns([3, 1])
                with col1:
                    st.write(f"**{stage_name}**")
                with col2:
                    if "✅" in status:
                        st.success(status)
                    else:
                        st.error(status)
            # If pipeline exists, show available metrics
            if self.pipeline is not None:
                st.subheader("Data Quality Metrics")
                col1, col2, col3 = st.columns(3)
                with col1:
                    # Overall percentage of missing cells
                    if data is not None:
                        missing_pct = (data.isnull().sum().sum() / (data.shape[0] * data.shape[1])) * 100
                        st.metric("Missing Values", f"{missing_pct:.2f}%")
                with col2:
                    # Feature information
                    if data is not None:
                        numeric_cols = len(data.select_dtypes(include=[np.number]).columns)
                        st.metric("Numeric Features", numeric_cols)
                with col3:
                    # Train share of the total split
                    if modeling_data and 'X_train' in modeling_data:
                        train_size = len(modeling_data['X_train'])
                        total_size = train_size
                        if 'X_test' in modeling_data:
                            total_size += len(modeling_data['X_test'])
                        if 'X_val' in modeling_data:
                            total_size += len(modeling_data['X_val'])
                        if total_size > 0:
                            train_pct = (train_size / total_size) * 100
                            st.metric("Training Set", f"{train_pct:.1f}%")
    with tab4:
        st.subheader("Data Export")
        # Export formats
        export_format = st.radio(
            "Export Format",
            options=['CSV', 'Parquet', 'Excel'],
            horizontal=True
        )
        # Export buttons
        if data is not None:
            # Export processed data
            st.write("**Processed Data**")
            if export_format == 'CSV':
                csv = data.to_csv(index=True)
                st.download_button(
                    label="📥 Download CSV",
                    data=csv,
                    file_name="streamlit_processed_data.csv",
                    mime="text/csv",
                    width='stretch'
                )
            elif export_format == 'Parquet':
                # Parquet must be serialised into an in-memory buffer
                import io
                buffer = io.BytesIO()
                data.to_parquet(buffer)
                buffer.seek(0)
                st.download_button(
                    label="📥 Download Parquet",
                    data=buffer,
                    file_name="streamlit_processed_data.parquet",
                    mime="application/octet-stream",
                    width='stretch'
                )
            elif export_format == 'Excel':
                import io
                buffer = io.BytesIO()
                with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                    data.to_excel(writer, sheet_name='Processed_Data')
                buffer.seek(0)
                st.download_button(
                    label="📥 Download Excel",
                    data=buffer,
                    file_name="streamlit_processed_data.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    width='stretch'
                )
        # Export modeling data (train/val/test splits as CSV)
        if modeling_data:
            st.write("**Modeling Data**")
            col1, col2, col3 = st.columns(3)
            with col1:
                if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
                    # Append the target column when present
                    train_df = pd.concat([
                        modeling_data['X_train'],
                        modeling_data['y_train'].rename('target')
                    ], axis=1) if 'y_train' in modeling_data else modeling_data['X_train']
                    st.download_button(
                        label="📥 Training Set",
                        data=train_df.to_csv(),
                        file_name="train_data.csv",
                        mime="text/csv",
                        width='stretch'
                    )
            with col2:
                if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
                    val_df = pd.concat([
                        modeling_data['X_val'],
                        modeling_data['y_val'].rename('target')
                    ], axis=1) if 'y_val' in modeling_data else modeling_data['X_val']
                    st.download_button(
                        label="📥 Validation Set",
                        data=val_df.to_csv(),
                        file_name="validation_data.csv",
                        mime="text/csv",
                        width='stretch'
                    )
            with col3:
                if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
                    test_df = pd.concat([
                        modeling_data['X_test'],
                        modeling_data['y_test'].rename('target')
                    ], axis=1) if 'y_test' in modeling_data else modeling_data['X_test']
                    st.download_button(
                        label="📥 Test Set",
                        data=test_df.to_csv(),
                        file_name="test_data.csv",
                        mime="text/csv",
                        width='stretch'
                    )
    # Navigation
    st.markdown("---")
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("⬅️ Back to Pipeline", width='stretch'):
            st.session_state.current_step = 4
            st.rerun()
    with col3:
        if st.button("Go to Visualisations ➡️", type="primary", width='stretch'):
            st.session_state.current_step = 6
            st.rerun()
def render_step_6_visualisations(self):
    """Step 6: Visualisations.

    Presents the plot files collected by ``collect_available_plots`` as
    per-category tabs plus a combined grid gallery. Requires a completed
    pipeline run and a non-empty ``st.session_state.available_plots``.
    """
    st.header("📈 Pipeline Visualisations")
    # Guard: plots only exist after a pipeline run.
    if not st.session_state.pipeline_completed:
        st.warning("First run pipeline in Step 4")
        # Suggest quick test
        st.markdown("---")
        st.subheader("🎮 Quick Test")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("Run Pipeline", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()
        return
    # Check for plots
    if not st.session_state.available_plots:
        st.warning("Plots not found. Ensure pipeline was run with visualisation option enabled.")
        # Offer a manual rescan of the plots directory
        if st.button("Try to Find Plots", width='stretch'):
            self.collect_available_plots()
            st.rerun()
        return
    # Plot statistics
    total_plots = sum(len(plots) for plots in st.session_state.available_plots.values())
    st.success(f"✅ Found {total_plots} plots")
    # Plot category tabs; 'summary' is moved to the front if present.
    categories = list(st.session_state.available_plots.keys())
    if 'summary' in categories:
        categories.remove('summary')
        categories.insert(0, 'summary')
    tabs = st.tabs([cat.capitalize().replace('_', ' ') for cat in categories])
    for i, category in enumerate(categories):
        with tabs[i]:
            self.display_category_plots(category)
    # All plots in one gallery
    st.markdown("---")
    st.subheader("🖼️ All Plots Gallery")
    # Flatten (category, plot) pairs for the grid
    all_plots = []
    for category, plots in st.session_state.available_plots.items():
        for plot in plots:
            all_plots.append((category, plot))
    # Display plots in a 3-column grid
    cols_per_row = 3
    for i in range(0, len(all_plots), cols_per_row):
        cols = st.columns(cols_per_row)
        for j in range(cols_per_row):
            idx = i + j
            if idx < len(all_plots):
                category, plot_info = all_plots[idx]
                with cols[j]:
                    self.display_plot_card(plot_info, category)
def display_category_plots(self, category):
    """Render every plot belonging to *category* inside expandable panels."""
    category_plots = st.session_state.available_plots.get(category, [])
    if not category_plots:
        st.info(f"No plots in category '{category}'")
        return
    pretty_category = category.capitalize().replace('_', ' ')
    st.subheader(f"{pretty_category} ({len(category_plots)} plots)")
    # Stable alphabetical order by file name; each plot gets its own expander.
    for plot_info in sorted(category_plots, key=lambda entry: entry['name']):
        panel_title = plot_info['name'].replace('_', ' ').replace('.png', '')
        with st.expander(f"📊 {panel_title}", expanded=True):
            self.display_plot_image(plot_info)
def display_plot_card(self, plot_info, category):
    """Display a single plot as a compact gallery card.

    The card shows a thumbnail, file metadata, and Zoom/Hide buttons whose
    toggle state is persisted in ``st.session_state`` under a key derived
    from the plot's file path.
    """
    try:
        # Load image
        image = Image.open(plot_info['path'])
        # Derive a widget-safe session-state key from the file path
        # (slashes and dots are not reliable in widget keys).
        safe_key = plot_info['path'].replace('/', '_').replace('\\', '_').replace('.', '_')
        # Initialise toggle state for this plot if not present yet
        if f"show_{safe_key}" not in st.session_state:
            st.session_state[f"show_{safe_key}"] = False
        # Create card
        with st.container():
            st.markdown(f"**{plot_info['name'].replace('_', ' ').replace('.png', '')}**")
            st.image(image, width='stretch', caption=plot_info['rel_path'])
            # File information
            size_kb = plot_info['size'] / 1024
            st.caption(f"Size: {size_kb:.1f} KB | Category: {category}")
            # Zoom control buttons
            col1, col2 = st.columns(2)
            with col1:
                # Zoom button — sets the toggle; the zoomed image renders
                # further down in this same pass (no st.rerun needed).
                if st.button("🔍 Zoom", key=f"zoom_{safe_key}", width='stretch'):
                    st.session_state[f"show_{safe_key}"] = True
                    # Don't use st.rerun() here
            with col2:
                # Hide button is only offered while the zoom is shown
                if st.session_state[f"show_{safe_key}"]:
                    if st.button("✕ Hide", key=f"hide_{safe_key}", width='stretch'):
                        st.session_state[f"show_{safe_key}"] = False
                        # Don't use st.rerun() here
            # If zoom toggle is on, show the full-size image below the card
            if st.session_state[f"show_{safe_key}"]:
                st.markdown("---")
                st.subheader(f"🔍 {plot_info['name'].replace('_', ' ').replace('.png', '')}")
                st.image(image, width='stretch')
    except Exception as e:
        # Surface load failures (missing/corrupt file) with the path for debugging
        st.error(f"Error loading plot: {str(e)}")
        st.code(f"Path: {plot_info['path']}")
def display_plot_image(self, plot_info):
    """Show a single plot full-width with its file metadata and a download button."""
    plot_path = plot_info['path']
    try:
        img = Image.open(plot_path)
        # Wide column for the image, narrow one for metadata.
        image_col, meta_col = st.columns([3, 1])
        with image_col:
            st.image(img, width='stretch')
        with meta_col:
            st.metric("Size", f"{plot_info['size'] / 1024:.1f} KB")
            st.metric("Resolution", f"{img.width}×{img.height}")
            st.write(f"**Format:** {img.format}")
            # Offer the raw PNG file for download.
            with open(plot_path, 'rb') as fh:
                st.download_button(
                    label="📥 Download",
                    data=fh,
                    file_name=plot_info['name'],
                    mime="image/png",
                    width='stretch'
                )
    except Exception as exc:
        st.error(f"Error loading plot: {str(exc)}")
        st.code(f"Path: {plot_info['path']}")
def render_step_7_modeling(self):
    """Step 7: Modelling Preparation.

    Summarises the train/val/test splits, shows the target distribution,
    and generates copy-paste integration snippets for popular ML
    libraries. Requires completed pipeline with modeling data available.
    """
    st.header("🤖 Modelling Preparation")
    # Guard: modeling data only exists after a successful pipeline run.
    if not st.session_state.pipeline_completed or st.session_state.modeling_data is None:
        st.warning("First run pipeline in Step 4")
        # Suggest quick test
        st.markdown("---")
        st.subheader("🎮 Quick Test")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("Run Pipeline", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()
        return
    modeling_data = st.session_state.modeling_data
    # Basic information: sizes of each split and the feature count
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
            st.metric("Training Set", f"{modeling_data['X_train'].shape[0]:,} records")
    with col2:
        if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
            st.metric("Validation Set", f"{modeling_data['X_val'].shape[0]:,} records")
    with col3:
        if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
            st.metric("Test Set", f"{modeling_data['X_test'].shape[0]:,} records")
    with col4:
        if 'feature_names' in modeling_data and modeling_data['feature_names'] is not None:
            st.metric("Number of Features", len(modeling_data['feature_names']))
    # Tabs
    tab1, tab2, tab3 = st.tabs([
        "📝 Data Structure",
        "📊 Target Variable Distribution",
        "🔗 ML Integration"
    ])
    with tab1:
        st.subheader("Modeling Data Structure")
        # Build a summary row per available split
        data_info = []
        if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
            data_info.append({
                'Dataset': 'Training',
                'Samples': modeling_data['X_train'].shape[0],
                'Features': modeling_data['X_train'].shape[1],
                'Target Variable': 'Yes' if 'y_train' in modeling_data and modeling_data['y_train'] is not None else 'No'
            })
        if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
            data_info.append({
                'Dataset': 'Validation',
                'Samples': modeling_data['X_val'].shape[0],
                'Features': modeling_data['X_val'].shape[1],
                'Target Variable': 'Yes' if 'y_val' in modeling_data and modeling_data['y_val'] is not None else 'No'
            })
        if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
            data_info.append({
                'Dataset': 'Test',
                'Samples': modeling_data['X_test'].shape[0],
                'Features': modeling_data['X_test'].shape[1],
                'Target Variable': 'Yes' if 'y_test' in modeling_data and modeling_data['y_test'] is not None else 'No'
            })
        if data_info:
            st.table(pd.DataFrame(data_info))
        else:
            st.info("Modeling data not available")
        # First ten training rows with the target appended
        st.subheader("Training Data Sample")
        if ('X_train' in modeling_data and modeling_data['X_train'] is not None and
            'y_train' in modeling_data and modeling_data['y_train'] is not None):
            sample_data = pd.concat([
                modeling_data['X_train'].head(10),
                modeling_data['y_train'].head(10).rename('target')
            ], axis=1)
            st.dataframe(sample_data, width='stretch')
    with tab2:
        st.subheader("Target Variable Distribution")
        if 'y_train' in modeling_data and modeling_data['y_train'] is not None:
            # Target variable histogram
            fig = px.histogram(
                x=modeling_data['y_train'],
                nbins=50,
                title="Target Variable Distribution (Training Set)",
                labels={'x': 'Target Variable', 'y': 'Frequency'},
                color_discrete_sequence=['#00CC96']
            )
            st.plotly_chart(fig, width='stretch')
            # Summary statistics of the training target
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Mean", f"{modeling_data['y_train'].mean():.2f}")
            with col2:
                st.metric("Standard Deviation", f"{modeling_data['y_train'].std():.2f}")
            with col3:
                st.metric("Minimum", f"{modeling_data['y_train'].min():.2f}")
            with col4:
                st.metric("Maximum", f"{modeling_data['y_train'].max():.2f}")
        else:
            st.info("Target variable not available")
    with tab3:
        st.subheader("Machine Learning Library Integration")
        st.info("""
Your data is ready for use with any Python ML libraries.
Below are code examples for various libraries.
""")
        # Library selection
        ml_library = st.selectbox(
            "Select ML Library",
            options=["Scikit-learn", "XGBoost", "LightGBM", "CatBoost", "PyTorch", "TensorFlow"]
        )
        # Code snippet generation — each branch is a plain string template,
        # displayed (not executed) below.
        code_placeholder = st.empty()
        if ml_library == "Scikit-learn":
            code = """# Example usage with Scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Use prepared data
X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']
# Create and train model
model = RandomForestRegressor(
n_estimators=100,
max_depth=10,
random_state=42
)
model.fit(X_train, y_train)
# Predictions and evaluation
y_pred = model.predict(X_val)
print(f"RMSE: {np.sqrt(mean_squared_error(y_val, y_pred)):.4f}")
print(f"R² Score: {r2_score(y_val, y_pred):.4f}")
print(f"Feature Importance: {model.feature_importances_}")"""
        elif ml_library == "XGBoost":
            code = """# Example usage with XGBoost
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
# Prepare data in DMatrix format
dtrain = xgb.DMatrix(modeling_data['X_train'], label=modeling_data['y_train'])
dval = xgb.DMatrix(modeling_data['X_val'], label=modeling_data['y_val'])
# Model parameters
params = {
'objective': 'reg:squarederror',
'max_depth': 6,
'learning_rate': 0.1,
'subsample': 0.8,
'colsample_bytree': 0.8,
'seed': 42
}
# Train model
model = xgb.train(
params,
dtrain,
num_boost_round=100,
evals=[(dval, 'validation')],
early_stopping_rounds=10,
verbose_eval=False
)
# Predictions
y_pred = model.predict(dval)
print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Number of Trees: {model.best_ntree_limit}")"""
        elif ml_library == "LightGBM":
            code = """# Example usage with LightGBM
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np
# Prepare data
train_data = lgb.Dataset(
modeling_data['X_train'],
label=modeling_data['y_train']
)
val_data = lgb.Dataset(
modeling_data['X_val'],
label=modeling_data['y_val'],
reference=train_data
)
# Model parameters
params = {
'objective': 'regression',
'metric': 'rmse',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
}
# Train model
model = lgb.train(
params,
train_data,
valid_sets=[val_data],
num_boost_round=100,
callbacks=[lgb.early_stopping(10)]
)
# Predictions
y_pred = model.predict(modeling_data['X_val'])
print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Best Iteration: {model.best_iteration}")"""
        else:
            # Generic template for libraries without a dedicated example
            code = f"""# Template for {ml_library}
# Your data available in modeling_data variable
X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']
X_test = modeling_data['X_test']
y_test = modeling_data['y_test']
# Code for {ml_library}...
print(f"Data sizes:")
print(f" X_train: {{X_train.shape}}")
print(f" y_train: {{y_train.shape}}")
print(f" X_val: {{X_val.shape}}")
print(f" X_test: {{X_test.shape}}")"""
        # Display code
        code_placeholder.code(code, language='python')
        # Copy code button — pyperclip is optional; fall back to a hint
        # when the library (or a system clipboard) is unavailable.
        try:
            import pyperclip
            if st.button("📋 Copy Code", width='stretch'):
                try:
                    pyperclip.copy(code)
                    st.success("Code copied to clipboard!")
                except:
                    st.warning("Failed to copy code. Copy manually.")
        except:
            st.warning("To copy code, install pyperclip library: pip install pyperclip")
    # Final information
    st.markdown("---")
    st.success("""
🎉 Congratulations! You have successfully prepared data for machine learning.
**Next Steps:**
1. Use code above for integration with chosen ML library
2. Experiment with various models
3. Optimise hyperparameters
4. Evaluate results on test set
""")
    # Navigation
    col1, col2 = st.columns([1, 1])
    with col1:
        if st.button("⬅️ Back to Visualisations", width='stretch'):
            st.session_state.current_step = 6
            st.rerun()
    with col2:
        if st.button("🔄 Run New Pipeline", type="primary", width='stretch'):
            # Reset all pipeline-related state back to a fresh session
            st.session_state.pipeline_completed = False
            st.session_state.processed_data = None
            st.session_state.modeling_data = None
            st.session_state.current_step = 1
            st.session_state.uploaded_file = None
            st.session_state.plots_path = None
            st.session_state.available_plots = {}
            st.session_state.synthetic_data_generated = False
            st.session_state.auto_pipeline_ready = False
            st.session_state.quick_test_mode = False
            st.rerun()
def render_footer(self):
    """Render the page footer: version, contact, and copyright columns."""
    st.markdown("---")
    version_col, contact_col, rights_col = st.columns(3)
    with version_col:
        st.markdown("**TimeFlowPro** v1.1.0")
        st.caption("Added synthetic data generation")
    with contact_col:
        st.markdown("📧 Contacts: cool.araby@gmail.com")
    with rights_col:
        st.markdown("© 2026 All Rights Reserved")
def run(self):
    """Application entry point: header, sidebar, current step content, footer."""
    st.title("📊 TimeFlow Pro - Data Analysis and Preprocessing")
    st.markdown("---")
    # Sidebar navigation
    self.create_sidebar()
    # Dispatch table replaces the original if/elif chain; an unknown step
    # renders nothing (same as the original fall-through behaviour).
    step_renderers = {
        1: self.render_step_1_data_loading,
        2: self.render_step_2_configuration,
        3: self.render_step_3_data_analysis,
        4: self.render_step_4_pipeline_execution,
        5: self.render_step_5_results,
        6: self.render_step_6_visualisations,
        7: self.render_step_7_modeling,
    }
    renderer = step_renderers.get(st.session_state.current_step)
    if renderer is not None:
        renderer()
    # Footer is always shown below the step content
    self.render_footer()
# ============================================
# APPLICATION LAUNCH
# ============================================
if __name__ == "__main__":
    # Instantiate and run the app in one step (behaviour unchanged).
    StreamlitApp().run()