diff --git "a/app.py" "b/app.py" new file mode 100644--- /dev/null +++ "b/app.py" @@ -0,0 +1,2403 @@ +# ============================================ +# TimeFlow Pro - Data Analysis and Preprocessing +# ============================================ +import streamlit as st +import pandas as pd +import numpy as np +import os +import sys +import glob +import re +from datetime import datetime, timedelta +import plotly.graph_objects as go +import plotly.express as px +from plotly.subplots import make_subplots +from PIL import Image +import matplotlib.pyplot as plt +import warnings + +from pipeline.main_pipeline import EnhancedDataPreprocessingPipeline + +warnings.filterwarnings('ignore') + +# Add project path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from config.config import Config +from data_loader.data_loader import DataLoader +from visualization.visualization_manager import VisualisationManager + +# ============================================ +# PAGE CONFIGURATION +# ============================================ +st.set_page_config( + page_title="TimeFlow Pro - Data Analysis and Preprocessing", + page_icon="๐Ÿ“Š", + layout="wide", + initial_sidebar_state="expanded" +) + +# ============================================ +# STATE MANAGEMENT CLASS +# ============================================ +class StreamlitApp: + """Main Streamlit application class""" + + def __init__(self): + self.init_session_state() + self.config = None + self.pipeline = None + self.data = None + + def init_session_state(self): + """Initialise session state""" + if 'pipeline_completed' not in st.session_state: + st.session_state.pipeline_completed = False + if 'processed_data' not in st.session_state: + st.session_state.processed_data = None + if 'modeling_data' not in st.session_state: + st.session_state.modeling_data = None + if 'current_step' not in st.session_state: + st.session_state.current_step = 1 + if 'uploaded_file' not in st.session_state: + 
st.session_state.uploaded_file = None + if 'config_params' not in st.session_state: + st.session_state.config_params = self.get_default_config() + if 'plots_path' not in st.session_state: + st.session_state.plots_path = None + if 'available_plots' not in st.session_state: + st.session_state.available_plots = {} + if 'synthetic_data_generated' not in st.session_state: + st.session_state.synthetic_data_generated = False + if 'auto_pipeline_ready' not in st.session_state: + st.session_state.auto_pipeline_ready = False + if 'quick_test_mode' not in st.session_state: + st.session_state.quick_test_mode = False + + def get_default_config(self): + """Get default configuration""" + return { + 'data_path': '', + 'results_dir': 'streamlit_results', + 'target_column': '', + 'start_year': 1970, + 'end_year': 1990, + 'max_lags': 5, + 'seasonal_period': 365, + 'rolling_windows': [7, 30, 90], + 'expanding_windows': [30, 90], + 'test_size': 0.2, + 'validation_size': 0.1, + 'scaling_method': 'robust', + 'feature_selection_method': 'correlation', + 'max_features': 20, + 'missing_threshold': 0.3, + 'outlier_method': 'iqr', + 'enable_validation': True, + 'split_method': 'time_based' + } + + def create_sidebar(self): + """Create sidebar""" + with st.sidebar: + st.title("๐ŸŽฏ TimeFlowPro") + st.markdown("---") + + # Navigation + st.subheader("Navigation") + steps = { + 1: "๐Ÿ“ Data Loading", + 2: "โš™๏ธ Configuration", + 3: "๐Ÿ” Data Analysis", + 4: "โšก Pipeline Execution", + 5: "๐Ÿ“Š Results", + 6: "๐Ÿ“ˆ Visualisations", + 7: "๐Ÿค– Modelling" + } + + for step_num, step_name in steps.items(): + if st.button( + f"{step_name}", + key=f"nav_{step_num}", + type="primary" if st.session_state.current_step == step_num else "secondary", + width='stretch' + ): + st.session_state.current_step = step_num + st.rerun() + + st.markdown("---") + + # Quick start with synthetic data + st.subheader("โšก Quick Test") + + if st.button("๐Ÿš€ Quick Start with Synthetic Data", + type="primary", + 
width='stretch', + help="Generate synthetic data and run pipeline immediately"): + st.session_state.quick_test_mode = True + st.session_state.current_step = 1 + st.rerun() + + st.markdown("---") + + # Project information + st.subheader("๐Ÿ“ˆ About the Project") + st.info(""" + TimeFlow Pro - Data Analysis and Preprocessing. + + **New Features:** + - Synthetic data generation for testing + - Automatic pipeline execution + - Quick testing without file upload + + **Standard Features:** + - Missing data analysis and processing + - Outlier detection + - Feature engineering + - Stationarity analysis + - Data scaling + - Feature selection + """) + + # Progress indicator + if st.session_state.pipeline_completed: + st.success("โœ… Pipeline completed") + else: + st.warning("โš ๏ธ Pipeline not started") + + # Quick test indicator + if st.session_state.quick_test_mode: + st.info("โšก Quick test mode active") + + def generate_synthetic_data(self, n_days=1095, include_seasonality=True, include_trend=True, + include_noise=True, include_exogenous=True, data_type="complex"): + """ + Generate synthetic data for testing + + Args: + n_days (int): Number of days of data + include_seasonality (bool): Include seasonality + include_trend (bool): Include trend + include_noise (bool): Include noise + include_exogenous (bool): Include exogenous variables + data_type (str): Data type (simple, medium, complex) + + Returns: + pd.DataFrame: Generated synthetic data + """ + try: + # Base parameters depending on data type + if data_type == "simple": + n_days = min(n_days, 365) # Limit for simple type + trend_strength = 0.005 + noise_std = 2 + include_exogenous = False + elif data_type == "medium": + n_days = min(n_days, 730) # Limit for medium type + trend_strength = 0.01 + noise_std = 5 + include_exogenous = True + else: # complex + n_days = min(n_days, 1095) # Limit for complex type + trend_strength = 0.02 + noise_std = 10 + include_exogenous = True + + # Create dates + start_date = 
datetime.now() - timedelta(days=n_days) + dates = pd.date_range(start=start_date, periods=n_days, freq='D') + + # Base trend + if include_trend: + trend = np.linspace(0, trend_strength * n_days, n_days) + else: + trend = np.zeros(n_days) + + # Seasonality + if include_seasonality: + # Annual seasonality + seasonal = 10 * np.sin(2 * np.pi * np.arange(n_days) / 365) + # Quarterly seasonality + seasonal += 5 * np.sin(2 * np.pi * np.arange(n_days) / 90) + # Monthly seasonality + seasonal += 3 * np.sin(2 * np.pi * np.arange(n_days) / 30) + # Weekly seasonality + seasonal += 2 * np.sin(2 * np.pi * np.arange(n_days) / 7) + else: + seasonal = np.zeros(n_days) + + # Main target variable (water consumption) + base_value = 100 + raskhodvoda = base_value + trend + seasonal + + # Add noise + if include_noise: + noise = np.random.normal(0, noise_std, n_days) + raskhodvoda += noise + + # Create DataFrame + data = pd.DataFrame({ + 'date': dates, + 'raskhodvoda': raskhodvoda + }) + + # Add exogenous variables + if include_exogenous: + # Temperature (seasonal) + data['temperature'] = 15 + 10 * np.sin(2 * np.pi * np.arange(n_days) / 365) + np.random.normal(0, 3, n_days) + + # Precipitation (random spikes) + precipitation = np.random.exponential(2, n_days) + # Add seasonality to precipitation + precipitation_seasonality = 5 * np.sin(2 * np.pi * np.arange(n_days) / 365 + np.pi/2) + data['precipitation'] = np.maximum(0, precipitation + precipitation_seasonality) + + # Pressure + data['pressure'] = 760 + np.random.normal(0, 5, n_days) + + # Humidity + data['humidity'] = 60 + 20 * np.sin(2 * np.pi * np.arange(n_days) / 180) + np.random.normal(0, 10, n_days) + + # Electricity consumption (correlated with target variable) + data['electricity_consumption'] = raskhodvoda * 0.8 + np.random.normal(0, 5, n_days) + + # Day of week (categorical variable) + data['day_of_week'] = dates.dayofweek + data['is_weekend'] = (data['day_of_week'] >= 5).astype(int) + + # Holidays (random) + holidays = 
np.random.choice([0, 1], size=n_days, p=[0.95, 0.05]) + data['is_holiday'] = holidays + + # Lag variables + for lag in [1, 7, 30]: + data[f'raskhodvoda_lag_{lag}'] = data['raskhodvoda'].shift(lag) + + # Moving averages + for window in [7, 30]: + data[f'raskhodvoda_ma_{window}'] = data['raskhodvoda'].rolling(window=window).mean() + + # Add missing values for realism (5% random missing values) + # CORRECTION: proper creation of missing value mask + for col in data.columns: + if col != 'date': # Don't add missing values to dates + mask = np.random.random(len(data)) < 0.05 + data.loc[mask, col] = np.nan + + # Add outliers (1% of data) + # CORRECTION: proper creation of outlier mask + numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist() + for col in numeric_cols: + outlier_mask = np.random.random(len(data)) < 0.01 + if outlier_mask.any(): + # Find outlier indices + outlier_indices = data.index[outlier_mask] + for idx in outlier_indices: + if col in data.columns: + mean_val = data[col].mean(skipna=True) + std_val = data[col].std(skipna=True) + if not np.isnan(mean_val) and not np.isnan(std_val) and std_val > 0: + outlier_value = mean_val + 5 * std_val * np.random.choice([-1, 1]) + data.at[idx, col] = outlier_value + + # Reset index + data.reset_index(drop=True, inplace=True) + + st.session_state.synthetic_data_generated = True + return data + + except Exception as e: + st.error(f"Error generating synthetic data: {str(e)}") + import traceback + st.error(f"Error traceback: {traceback.format_exc()}") + return None + + def quick_test_pipeline(self): + """Quick pipeline execution with synthetic data""" + with st.spinner("๐Ÿš€ Running quick test with synthetic data..."): + try: + # Step 1: Generate synthetic data + st.info("Step 1: Generating synthetic data...") + synthetic_data = self.generate_synthetic_data( + n_days=365, # Reduced for speed + include_seasonality=True, + include_trend=True, + include_noise=True, + include_exogenous=True, + 
data_type="medium" # Changed to medium for balance between speed and quality + ) + + if synthetic_data is None: + st.error("Failed to generate synthetic data") + return + + # Save data to temporary file + temp_file = "temp_synthetic_data.csv" + synthetic_data.to_csv(temp_file, index=False) + + # Step 2: Configure settings + st.info("Step 2: Configuring settings...") + config_params = st.session_state.config_params.copy() + config_params.update({ + 'data_path': temp_file, + 'target_column': 'raskhodvoda', + 'start_year': 2020, + 'end_year': 2023, + 'max_lags': 7, + 'seasonal_period': 365, + 'rolling_windows': [7, 30], + 'expanding_windows': [30], + 'test_size': 0.2, + 'validation_size': 0.1, + 'scaling_method': 'robust', + 'feature_selection_method': 'correlation', + 'max_features': 10, # Reduced for speed + 'missing_threshold': 0.3, + 'outlier_method': 'iqr', + 'enable_validation': True, + 'split_method': 'time_based' + }) + + # Step 3: Create and run pipeline + st.info("Step 3: Creating and running pipeline...") + + # Create progress bar + progress_bar = st.progress(0) + status_text = st.empty() + + # Update configuration + st.session_state.config_params = config_params + st.session_state.uploaded_file = temp_file + st.session_state.data_preview = synthetic_data + + # Create configuration + status_text.text("Creating configuration...") + progress_bar.progress(20) + + config = Config(**config_params) + + # Create pipeline + status_text.text("Initialising pipeline...") + progress_bar.progress(40) + + self.pipeline = EnhancedDataPreprocessingPipeline(config) + + # Run pipeline + status_text.text("Running preprocessing pipeline...") + progress_bar.progress(60) + + processed_data = self.pipeline.run_full_pipeline( + use_synthetic=False, # Synthetic data already loaded + save_intermediate=True, + create_reports=True + ) + + # Update progress + if processed_data is not None: + status_text.text("Getting data for modelling...") + progress_bar.progress(80) + + modeling_data 
= self.pipeline.get_final_data_for_modeling() + + # Save to session state + st.session_state.processed_data = processed_data + st.session_state.modeling_data = modeling_data + st.session_state.pipeline_completed = True + st.session_state.plots_path = os.path.join(config.results_dir, 'plots') + st.session_state.auto_pipeline_ready = True + + # Collect information about available plots + self.collect_available_plots() + + # Completion + status_text.text("Completing...") + progress_bar.progress(100) + + st.success("โœ… Quick test completed successfully!") + + # Show results + col1, col2, col3 = st.columns(3) + + with col1: + st.metric("Records generated", f"{synthetic_data.shape[0]:,}") + + with col2: + st.metric("Processed data", f"{processed_data.shape[0]:,} rows") + + with col3: + st.metric("Final features", f"{processed_data.shape[1]} columns") + + # Automatic transition to results + st.session_state.current_step = 5 + st.rerun() + + else: + st.error("โŒ Error running pipeline") + st.error("Check logs for more information") + + except Exception as e: + st.error(f"โŒ Error during quick test: {str(e)}") + import traceback + st.error(f"Error traceback: {traceback.format_exc()}") + + + def render_step_1_data_loading(self): + """Step 1: Data Loading""" + st.header("๐Ÿ“ Data Loading") + + # Check quick test mode + if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready: + st.info("โšก Quick test mode activated. 
Generating synthetic data and running pipeline...") + self.quick_test_pipeline() + return + + col1, col2 = st.columns([2, 1]) + + with col1: + # File upload + uploaded_file = st.file_uploader( + "Upload CSV file with data", + type=['csv', 'xlsx', 'parquet'], + help="Supported formats: CSV, Excel, Parquet" + ) + + if uploaded_file is not None: + # Save file temporarily + file_path = f"temp_data.{uploaded_file.name.split('.')[-1]}" + with open(file_path, "wb") as f: + f.write(uploaded_file.getbuffer()) + + st.session_state.uploaded_file = file_path + st.session_state.config_params['data_path'] = file_path + + # Load and preview data + try: + if file_path.endswith('.csv'): + data = pd.read_csv(file_path) + elif file_path.endswith('.xlsx'): + data = pd.read_excel(file_path) + elif file_path.endswith('.parquet'): + data = pd.read_parquet(file_path) + else: + st.error("Unsupported file format") + return + + st.session_state.data_preview = data + + # Data preview + st.subheader("Data Preview") + st.dataframe(data.head(50), width='stretch') + + # Basic information + st.subheader("๐Ÿ“‹ Data Information") + + info_col1, info_col2, info_col3 = st.columns(3) + + with info_col1: + st.metric("Rows", data.shape[0]) + st.metric("Columns", data.shape[1]) + + with info_col2: + numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist() + st.metric("Numeric columns", len(numeric_cols)) + categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist() + st.metric("Categorical columns", len(categorical_cols)) + + with info_col3: + total_missing = data.isnull().sum().sum() + missing_percentage = (total_missing / (data.shape[0] * data.shape[1])) * 100 + st.metric("Missing values", f"{total_missing:,}") + st.metric("Missing percentage", f"{missing_percentage:.2f}%") + + # Automatic target column selection if not set + if 'target_column' not in st.session_state.config_params or not st.session_state.config_params['target_column']: + numeric_columns = 
data.select_dtypes(include=[np.number]).columns.tolist() + if numeric_columns: + # Automatically select column with typical name + target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'ั€ะฐัั…ะพะด'] + selected_target = None + + for col in numeric_columns: + if any(keyword in col.lower() for keyword in target_keywords): + selected_target = col + break + + # If not found by keywords, take last numeric column + if not selected_target and numeric_columns: + selected_target = numeric_columns[-1] + + if selected_target: + st.session_state.config_params['target_column'] = selected_target + st.info(f"Target variable automatically selected: **{selected_target}**") + st.info("You can change it in the next step") + + # Button to proceed to next step + if st.button("โžก๏ธ Go to Configuration", type="primary", width='stretch'): + st.session_state.current_step = 2 + st.rerun() + + except Exception as e: + st.error(f"Error loading data: {str(e)}") + + with col2: + # Demo data + st.subheader("๐ŸŽฎ Demo Mode") + + demo_option = st.radio( + "Choose demo data:", + ["Synthetic Data", "Time Series Example"] + ) + + # Synthetic data settings + with st.expander("โš™๏ธ Synthetic Data Settings", expanded=False): + data_type = st.selectbox( + "Data Type", + options=["Simple", "Medium", "Complex"], + index=1, + help="Simple: 1 year, few features\nMedium: 2 years, main features\nComplex: 3 years, all features" + ) + + n_days = st.slider( + "Number of days", + min_value=90, + max_value=1825, + value=1095, + step=30, + help="Number of days in synthetic data" + ) + + include_trend = st.checkbox("Include trend", value=True) + include_seasonality = st.checkbox("Include seasonality", value=True) + include_noise = st.checkbox("Include noise", value=True) + include_exogenous = st.checkbox("Include additional features", value=True) + + if st.button("Generate and Load Synthetic Data", width='stretch'): + with st.spinner("Creating synthetic data..."): + try: + # Data type 
mapping + data_type_map = { + "Simple": "simple", + "Medium": "medium", + "Complex": "complex" + } + + # Generate synthetic data + synthetic_data = self.generate_synthetic_data( + n_days=n_days, + include_seasonality=include_seasonality, + include_trend=include_trend, + include_noise=include_noise, + include_exogenous=include_exogenous, + data_type=data_type_map[data_type] + ) + + if synthetic_data is not None: + st.session_state.data_preview = synthetic_data + st.session_state.uploaded_file = "synthetic_data" + st.session_state.config_params['data_path'] = 'synthetic_data' + + # Automatically select target variable + if 'raskhodvoda' in synthetic_data.columns: + st.session_state.config_params['target_column'] = 'raskhodvoda' + + st.success(f"โœ… Synthetic data created: {synthetic_data.shape[0]} rows, {synthetic_data.shape[1]} columns") + + # Show preview + st.subheader("Synthetic Data Preview") + st.dataframe(synthetic_data.head(20), width='stretch') + + # Statistics + st.subheader("๐Ÿ“Š Synthetic Data Statistics") + + stat_col1, stat_col2 = st.columns(2) + + with stat_col1: + st.metric("Period", f"{synthetic_data.shape[0]} days") + # CORRECTION: convert dates to strings for display + if 'date' in synthetic_data.columns: + min_date = synthetic_data['date'].min() + max_date = synthetic_data['date'].max() + if isinstance(min_date, (pd.Timestamp, datetime)): + st.text(f"Start: {min_date.strftime('%Y-%m-%d')}") + else: + st.text(f"Start: {str(min_date)}") + + if isinstance(max_date, (pd.Timestamp, datetime)): + st.text(f"End: {max_date.strftime('%Y-%m-%d')}") + else: + st.text(f"End: {str(max_date)}") + + with stat_col2: + if 'raskhodvoda' in synthetic_data.columns: + st.metric("Average consumption", f"{synthetic_data['raskhodvoda'].mean():.2f}") + st.metric("Max consumption", f"{synthetic_data['raskhodvoda'].max():.2f}") + st.metric("Min consumption", f"{synthetic_data['raskhodvoda'].min():.2f}") + + # Quick pipeline execution + st.markdown("---") + if 
st.button("๐Ÿš€ Quick Run Pipeline with This Data", type="primary", width='stretch'): + st.session_state.quick_test_mode = True + st.session_state.auto_pipeline_ready = False + st.rerun() + + st.rerun() + else: + st.error("Failed to generate synthetic data") + + except Exception as e: + st.error(f"Error creating synthetic data: {str(e)}") + + st.markdown("---") + + # Instructions + st.subheader("๐Ÿ“– Instructions") + st.markdown(""" + 1. Upload CSV file with data **OR** + 2. Generate synthetic data for testing + 3. Check data preview + 4. Target variable will be selected automatically + 5. Go to configuration to specify parameters + + **Data Requirements:** + - Date in separate column or index + - Clean column names + - Time series with regular intervals + """) + + def render_step_2_configuration(self): + """Step 2: Pipeline Configuration""" + st.header("โš™๏ธ Pipeline Configuration") + + # Automatic configuration for synthetic data + if st.session_state.uploaded_file == "synthetic_data" or st.session_state.config_params['data_path'] == 'synthetic_data': + st.info("โšก Synthetic data detected. 
Optimised configuration applied.") + + # Automatic parameter setup for synthetic data + if st.button("Apply Recommended Settings for Synthetic Data", width='stretch'): + st.session_state.config_params.update({ + 'target_column': 'raskhodvoda', + 'max_lags': 7, + 'seasonal_period': 365, + 'rolling_windows': [7, 30, 90], + 'expanding_windows': [30, 90], + 'test_size': 0.2, + 'validation_size': 0.1, + 'scaling_method': 'robust', + 'feature_selection_method': 'correlation', + 'max_features': 15, + 'missing_threshold': 0.3, + 'outlier_method': 'iqr', + 'enable_validation': True + }) + st.success("Settings applied!") + st.rerun() + + # Configuration sections + tab1, tab2, tab3, tab4 = st.tabs([ + "๐Ÿ“Š Basic Parameters", + "๐Ÿ”ง Data Processing", + "๐ŸŽฏ Features and Selection", + "๐Ÿ“ˆ Temporal Parameters" + ]) + + with tab1: + col1, col2 = st.columns(2) + + with col1: + st.subheader("Basic Parameters") + st.session_state.config_params['results_dir'] = st.text_input( + "Results Directory", + value=st.session_state.config_params['results_dir'] + ) + + # CORRECTION: replace text_input with selectbox for target variable selection + if hasattr(st.session_state, 'data_preview') and st.session_state.data_preview is not None: + # Get all data columns + all_columns = st.session_state.data_preview.columns.tolist() + + # If target variable already set and present in data, use it + current_target = st.session_state.config_params.get('target_column', '') + default_index = 0 + + if current_target in all_columns: + default_index = all_columns.index(current_target) + elif len(all_columns) > 0: + # Try to find suitable default column + numeric_columns = st.session_state.data_preview.select_dtypes(include=[np.number]).columns.tolist() + if numeric_columns: + # Look for columns with typical target variable names + target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'ั€ะฐัั…ะพะด'] + for i, col in enumerate(all_columns): + if any(keyword in col.lower() for 
keyword in target_keywords): + default_index = i + break + # If not found by keywords, take first numeric column + if default_index == 0 and numeric_columns[0] in all_columns: + default_index = all_columns.index(numeric_columns[0]) + + st.session_state.config_params['target_column'] = st.selectbox( + "Select Target Variable", + options=all_columns, + index=default_index, + help="Select column to be predicted" + ) + else: + # If data not loaded, keep text field + st.session_state.config_params['target_column'] = st.text_input( + "Target Variable", + value=st.session_state.config_params.get('target_column', ''), + help="Enter target column name" + ) + + st.session_state.config_params['enable_validation'] = st.checkbox( + "Enable Data Validation", + value=st.session_state.config_params['enable_validation'] + ) + + with col2: + st.subheader("Data Split") + st.session_state.config_params['test_size'] = st.slider( + "Test Set Size (%)", + min_value=5, + max_value=40, + value=int(st.session_state.config_params['test_size'] * 100), + step=5, + format="%d%%" + ) / 100 + + st.session_state.config_params['validation_size'] = st.slider( + "Validation Set Size (%)", + min_value=5, + max_value=30, + value=int(st.session_state.config_params['validation_size'] * 100), + step=5, + format="%d%%" + ) / 100 + + split_methods = ['time_based', 'random'] + st.session_state.config_params['split_method'] = st.selectbox( + "Split Method", + options=split_methods, + index=split_methods.index(st.session_state.config_params['split_method']) + ) + + with tab2: + col1, col2 = st.columns(2) + + with col1: + st.subheader("Missing Value Processing") + st.session_state.config_params['missing_threshold'] = st.slider( + "Missing Value Column Removal Threshold", + min_value=0.0, + max_value=0.5, + value=st.session_state.config_params['missing_threshold'], + step=0.05, + format="%.2f" + ) + + st.subheader("Outlier Processing") + outlier_methods = ['iqr', 'zscore', 'isolation_forest'] + 
st.session_state.config_params['outlier_method'] = st.selectbox( + "Outlier Detection Method", + options=outlier_methods, + index=outlier_methods.index(st.session_state.config_params['outlier_method']) + ) + + with col2: + st.subheader("Data Scaling") + scaling_methods = ['robust', 'standard', 'minmax', 'none'] + st.session_state.config_params['scaling_method'] = st.selectbox( + "Scaling Method", + options=scaling_methods, + index=scaling_methods.index(st.session_state.config_params['scaling_method']) + ) + + if st.session_state.config_params['scaling_method'] == 'none': + st.info("โš ๏ธ Data will not be scaled") + + with tab3: + col1, col2 = st.columns(2) + + with col1: + st.subheader("Feature Engineering") + st.session_state.config_params['max_lags'] = st.slider( + "Maximum Number of Lags", + min_value=1, + max_value=20, + value=st.session_state.config_params['max_lags'], + step=1 + ) + + rolling_windows_input = st.text_input( + "Windows for Rolling Statistics (comma-separated)", + value=', '.join(map(str, st.session_state.config_params['rolling_windows'])) + ) + if rolling_windows_input: + st.session_state.config_params['rolling_windows'] = [ + int(x.strip()) for x in rolling_windows_input.split(',') if x.strip().isdigit() + ] + + with col2: + st.subheader("Feature Selection") + feature_methods = ['correlation', 'variance', 'mutual_info', 'rf', 'none'] + st.session_state.config_params['feature_selection_method'] = st.selectbox( + "Feature Selection Method", + options=feature_methods, + index=feature_methods.index(st.session_state.config_params['feature_selection_method']) + ) + + st.session_state.config_params['max_features'] = st.slider( + "Maximum Number of Features", + min_value=5, + max_value=100, + value=st.session_state.config_params['max_features'], + step=5 + ) + + with tab4: + col1, col2 = st.columns(2) + + with col1: + st.subheader("Temporal Parameters") + + # If there is data for preview, show date range + if hasattr(st.session_state, 
'data_preview'): + if 'date' in st.session_state.data_preview.columns: + date_col = 'date' + elif isinstance(st.session_state.data_preview.index, pd.DatetimeIndex): + dates = st.session_state.data_preview.index + else: + # Try to find date column + date_cols = [col for col in st.session_state.data_preview.columns + if 'date' in col.lower() or 'time' in col.lower()] + date_col = date_cols[0] if date_cols else None + + if date_col: + if date_col in st.session_state.data_preview.columns: + dates = pd.to_datetime(st.session_state.data_preview[date_col]) + else: + dates = st.session_state.data_preview.index + + if len(dates) > 0: + min_date = dates.min() + max_date = dates.max() + + col1_date, col2_date = st.columns(2) + with col1_date: + st.session_state.config_params['start_year'] = st.number_input( + "Start Year", + min_value=1900, + max_value=2100, + value=min_date.year, + step=1 + ) + with col2_date: + st.session_state.config_params['end_year'] = st.number_input( + "End Year", + min_value=1900, + max_value=2100, + value=max_date.year, + step=1 + ) + + with col2: + st.subheader("Seasonality") + st.session_state.config_params['seasonal_period'] = st.selectbox( + "Seasonal Period", + options=[7, 30, 90, 365, 12, 24], + index=[7, 30, 90, 365, 12, 24].index( + st.session_state.config_params['seasonal_period'] + ) if st.session_state.config_params['seasonal_period'] in [7, 30, 90, 365, 12, 24] else 0 + ) + + expanding_windows_input = st.text_input( + "Windows for Expanding Statistics (comma-separated)", + value=', '.join(map(str, st.session_state.config_params['expanding_windows'])) + ) + if expanding_windows_input: + st.session_state.config_params['expanding_windows'] = [ + int(x.strip()) for x in expanding_windows_input.split(',') if x.strip().isdigit() + ] + + # Navigation buttons + col1, col2, col3 = st.columns([1, 1, 1]) + + with col1: + if st.button("โฌ…๏ธ Back to Loading", width='stretch'): + st.session_state.current_step = 1 + st.rerun() + + with col3: + if 
st.button("Go to Analysis โžก๏ธ", type="primary", width='stretch'): + st.session_state.current_step = 3 + st.rerun() + + def render_step_3_data_analysis(self): + """Step 3: Data Analysis""" + st.header("๐Ÿ” Data Analysis") + + if not hasattr(st.session_state, 'data_preview') or st.session_state.data_preview is None: + st.warning("First load data in Step 1") + if st.button("Return to Data Loading"): + st.session_state.current_step = 1 + st.rerun() + return + + data = st.session_state.data_preview + + # Analysis tabs + tab1, tab2, tab3, tab4 = st.tabs([ + "๐Ÿ“ˆ Statistics", + "๐Ÿ” Distributions", + "๐Ÿ“… Temporal Analysis", + "โ“ Missing Values and Outliers" + ]) + + with tab1: + col1, col2 = st.columns(2) + + with col1: + st.subheader("Basic Statistics") + st.dataframe(data.describe().round(2), width='stretch') + + with col2: + st.subheader("Data Types") + dtype_info = pd.DataFrame({ + 'Column': data.columns, + 'Type': data.dtypes.values, + 'Unique Values': [data[col].nunique() for col in data.columns] + }) + st.dataframe(dtype_info, width='stretch') + + with tab2: + # Select column for visualisation + numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist() + + if numeric_cols: + selected_col = st.selectbox( + "Select Column for Analysis", + options=numeric_cols + ) + + col1, col2 = st.columns(2) + + with col1: + # Histogram + fig = px.histogram( + data, + x=selected_col, + title=f"Distribution of {selected_col}", + nbins=50, + color_discrete_sequence=['#636EFA'] + ) + st.plotly_chart(fig, width='stretch') + + with col2: + # Box plot + fig = go.Figure() + fig.add_trace(go.Box( + y=data[selected_col], + name=selected_col, + boxpoints='outliers', + marker_color='#EF553B' + )) + fig.update_layout( + title=f"Box plot {selected_col}", + yaxis_title=selected_col + ) + st.plotly_chart(fig, width='stretch') + else: + st.warning("No numeric columns for distribution analysis") + + with tab3: + # Time series analysis + date_cols = [col for col in 
data.columns if 'date' in col.lower()] + + if date_cols or isinstance(data.index, pd.DatetimeIndex): + if date_cols: + date_col = date_cols[0] + dates = pd.to_datetime(data[date_col]) + else: + dates = data.index + date_col = 'index' + + # Check for numeric columns + if len(numeric_cols) > 0: + # Select column for time series + ts_col = st.selectbox( + "Select Column for Time Series", + options=numeric_cols + ) + + # Time series + fig = go.Figure() + fig.add_trace(go.Scatter( + x=dates, + y=data[ts_col], + mode='lines', + name=ts_col, + line=dict(color='#636EFA', width=2) + )) + + fig.update_layout( + title=f"Time Series: {ts_col}", + xaxis_title="Date", + yaxis_title=ts_col, + hovermode='x unified' + ) + + st.plotly_chart(fig, width='stretch') + + # Seasonality (if sufficient data) + if len(dates) > 30: + # Monthly trend + if hasattr(dates, 'month'): + monthly_data = data.groupby(dates.dt.month)[ts_col].mean() + + fig2 = px.bar( + x=monthly_data.index, + y=monthly_data.values, + title=f"Monthly Seasonality: {ts_col}", + labels={'x': 'Month', 'y': 'Average Value'} + ) + st.plotly_chart(fig2, width='stretch') + else: + st.warning("No numeric columns for temporal analysis") + else: + st.info("For temporal analysis, date column or DatetimeIndex required") + + with tab4: + col1, col2 = st.columns(2) + + with col1: + # Missing value analysis + st.subheader("Missing Values") + missing_data = data.isnull().sum() + missing_percentage = (missing_data / len(data)) * 100 + + missing_df = pd.DataFrame({ + 'Column': missing_data.index, + 'Missing Count': missing_data.values, + 'Missing Percentage': missing_percentage.values + }).sort_values('Missing Count', ascending=False) + + st.dataframe(missing_df, width='stretch') + + # Missing values visualisation + if missing_data.sum() > 0: + fig = px.bar( + missing_df, + x='Column', + y='Missing Percentage', + title="Missing Percentage by Column", + color='Missing Percentage', + color_continuous_scale='Reds' + ) + st.plotly_chart(fig, 
                                 width='stretch')

            with col2:
                # Quick outlier analysis
                st.subheader("Quick Outlier Analysis")

                if len(numeric_cols) > 0:
                    outlier_summary = []

                    for col in numeric_cols[:5]:  # Limit to 5 columns for speed
                        # Classic IQR fence: anything outside [Q1-1.5*IQR, Q3+1.5*IQR]
                        q1 = data[col].quantile(0.25)
                        q3 = data[col].quantile(0.75)
                        iqr = q3 - q1
                        lower_bound = q1 - 1.5 * iqr
                        upper_bound = q3 + 1.5 * iqr

                        outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
                        outlier_pct = (len(outliers) / len(data)) * 100

                        outlier_summary.append({
                            'Column': col,
                            'Outliers': len(outliers),
                            'Percentage': f"{outlier_pct:.2f}%"
                        })

                    outlier_df = pd.DataFrame(outlier_summary)
                    st.dataframe(outlier_df, width='stretch')
                else:
                    st.warning("No numeric columns for outlier analysis")

        # Navigation buttons
        col1, col2, col3 = st.columns([1, 1, 1])

        with col1:
            if st.button("โฌ…๏ธ Back to Configuration", width='stretch'):
                st.session_state.current_step = 2
                st.rerun()

        with col3:
            if st.button("Run Pipeline โžก๏ธ", type="primary", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()

    def render_step_4_pipeline_execution(self) -> None:
        """Step 4: Pipeline Execution.

        Validates that data and a target column are configured, shows the
        current configuration, and runs ``EnhancedDataPreprocessingPipeline``
        with a progress bar.  Results are stored in ``st.session_state``
        (``processed_data``, ``modeling_data``, ``pipeline_completed``,
        ``plots_path``).
        """
        st.header("โšก Pipeline Execution")

        # Readiness check
        # NOTE(review): ready_to_run is assigned here but never read in the
        # visible code — the gate below tests `issues` instead; confirm and
        # remove if truly unused.
        ready_to_run = True
        issues = []

        if not st.session_state.uploaded_file and st.session_state.config_params['data_path'] != 'demo' and st.session_state.config_params['data_path'] != 'synthetic_data':
            issues.append("Data not loaded")
            ready_to_run = False

        if not st.session_state.config_params['target_column']:
            issues.append("Target variable not selected")
            ready_to_run = False

        # Automatic synthetic data generation if quick test enabled
        if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
            st.info("โšก Quick test mode activated. Generating synthetic data...")
            self.quick_test_pipeline()
            return

        # Display warnings and bail out — pipeline cannot run yet
        if issues:
            st.error("โš ๏ธ Fix before running:")
            for issue in issues:
                st.write(f"- {issue}")

            # Suggest using synthetic data
            st.markdown("---")
            st.subheader("๐ŸŽฎ Quick Solution")

            col1, col2 = st.columns(2)

            with col1:
                if st.button("Generate Synthetic Data", width='stretch'):
                    st.session_state.current_step = 1
                    st.rerun()

            with col2:
                if st.button("To Data Loading", width='stretch'):
                    st.session_state.current_step = 1
                    st.rerun()

            col3, col4 = st.columns(2)
            with col3:
                if st.button("To Configuration", width='stretch'):
                    st.session_state.current_step = 2
                    st.rerun()

            return

        # Display configuration
        st.subheader("Execution Configuration")

        config_col1, config_col2 = st.columns(2)

        with config_col1:
            st.metric("Target Variable", st.session_state.config_params['target_column'])
            st.metric("Test Set", f"{st.session_state.config_params['test_size']*100:.0f}%")
            st.metric("Scaling Method", st.session_state.config_params['scaling_method'])

        with config_col2:
            st.metric("Max Lags", st.session_state.config_params['max_lags'])
            st.metric("Feature Selection Method", st.session_state.config_params['feature_selection_method'])
            st.metric("Validation Enabled", "Yes" if st.session_state.config_params['enable_validation'] else "No")

        # Execution options
        st.subheader("Execution Options")

        col1, col2 = st.columns(2)

        with col1:
            # Forced on (and locked) when the configured data source is
            # already synthetic.
            use_synthetic = st.checkbox(
                "Use Synthetic Data",
                value=(st.session_state.config_params['data_path'] == 'demo' or
                       st.session_state.config_params['data_path'] == 'synthetic_data'),
                disabled=(st.session_state.config_params['data_path'] == 'demo' or
                          st.session_state.config_params['data_path'] == 'synthetic_data')
            )

            save_intermediate = st.checkbox(
                "Save Intermediate Results",
                value=True
            )

        with col2:
            create_reports = st.checkbox(
                "Create Reports",
                value=True
            )

            # NOTE(review): create_visualisations is collected here but is not
            # passed to run_full_pipeline() below — confirm whether the
            # pipeline honours it some other way or the flag is dead.
            create_visualisations = st.checkbox(
                "Create Visualisations",
                value=True,
                help="Create data analysis plots"
            )

        # Run button
        if st.button("๐Ÿš€ Run Preprocessing Pipeline", type="primary", width='stretch'):

            # Create progress bar
            progress_bar = st.progress(0)
            status_text = st.empty()

            try:
                # Create configuration
                status_text.text("Creating configuration...")
                progress_bar.progress(10)

                config = Config(**st.session_state.config_params)

                # Create pipeline
                status_text.text("Initialising pipeline...")
                progress_bar.progress(20)

                self.pipeline = EnhancedDataPreprocessingPipeline(config)

                # Determine whether to use synthetic data
                use_synthetic_flag = (use_synthetic or
                                      st.session_state.config_params['data_path'] == 'demo' or
                                      st.session_state.config_params['data_path'] == 'synthetic_data')

                # Run pipeline
                status_text.text("Running preprocessing pipeline...")
                progress_bar.progress(30)

                processed_data = self.pipeline.run_full_pipeline(
                    use_synthetic=use_synthetic_flag,
                    save_intermediate=save_intermediate,
                    create_reports=create_reports
                )

                # Update progress
                if processed_data is not None:
                    status_text.text("Getting data for modelling...")
                    progress_bar.progress(80)

                    modeling_data = self.pipeline.get_final_data_for_modelling()

                    # Save to session state
                    st.session_state.processed_data = processed_data
                    st.session_state.modeling_data = modeling_data
                    st.session_state.pipeline_completed = True
                    st.session_state.plots_path = os.path.join(config.results_dir, 'plots')

                    # Collect information about available plots
                    self.collect_available_plots()

                    # Completion
                    status_text.text("Completing...")
                    progress_bar.progress(100)

                    st.success("โœ… Pipeline completed successfully!")

                    # Show results
                    col1, col2, col3 = st.columns(3)

                    with col1:
                        if hasattr(self.pipeline, 'results') and 'data_loading' in self.pipeline.results:
                            st.metric("Original Data",
f"{self.pipeline.results['data_loading']['shape'][0]:,} rows") + else: + st.metric("Original Data", "Information unavailable") + + with col2: + st.metric("Processed Data", f"{processed_data.shape[0]:,} rows") + + with col3: + st.metric("Final Features", f"{processed_data.shape[1]} columns") + + # Button to proceed to results + if st.button("๐Ÿ“Š Go to Results", type="primary", width='stretch'): + st.session_state.current_step = 5 + st.rerun() + + else: + st.error("โŒ Error executing pipeline") + st.error("Check logs for more information") + + except Exception as e: + progress_bar.progress(0) + status_text.text("") + st.error(f"โŒ Error: {str(e)}") + st.exception(e) + + # Back button + if st.button("โฌ…๏ธ Back to Analysis", width='stretch'): + st.session_state.current_step = 3 + st.rerun() + + def collect_available_plots(self): + """Collect information about available plots""" + if not st.session_state.plots_path or not os.path.exists(st.session_state.plots_path): + st.session_state.available_plots = {} + return + + plots_categories = { + 'summary': ['summary_dashboard.png'], + 'missing_values': ['missing_values_analysis.png'], + 'outliers': ['outliers_analysis.png', 'outlier_handling_results.png', 'temporal_outliers.png'], + 'stationarity': ['stationarity_*.png'], + 'data_split': ['data_split.png'], + 'scaling': ['scaling_results.png'], + 'feature_selection': ['feature_selection_*.png'], + 'correlations': ['correlation_matrix.png', 'high_correlations.png', 'target_correlations.png', 'vif_scores.png'] + } + + available_plots = {} + + for category, patterns in plots_categories.items(): + category_plots = [] + + # Search for files for each pattern + for pattern in patterns: + # For general patterns + if '*' in pattern: + search_path = os.path.join(st.session_state.plots_path, pattern) + files = glob.glob(search_path) + + # Also search in subfolders + for root, dirs, filenames in os.walk(st.session_state.plots_path): + for filename in filenames: + if 
pattern.replace('*', '') in filename and filename.endswith('.png'): + full_path = os.path.join(root, filename) + if full_path not in files: + files.append(full_path) + else: + # For specific file names + file_path = os.path.join(st.session_state.plots_path, pattern) + + # Check in main folder + if os.path.exists(file_path): + files = [file_path] + else: + # Check in subfolders + files = [] + for root, dirs, filenames in os.walk(st.session_state.plots_path): + for filename in filenames: + if filename == pattern: + files.append(os.path.join(root, filename)) + + for file in files: + if os.path.exists(file): + # Get relative path for display + rel_path = os.path.relpath(file, st.session_state.plots_path) + category_plots.append({ + 'path': file, + 'name': os.path.basename(file), + 'rel_path': rel_path, + 'size': os.path.getsize(file) + }) + + if category_plots: + available_plots[category] = category_plots + + # Also add all found PNG files in general folder + all_png_files = [] + for root, dirs, filenames in os.walk(st.session_state.plots_path): + for filename in filenames: + if filename.endswith('.png'): + file_path = os.path.join(root, filename) + # Check if this file already added + already_added = False + for category_plots in available_plots.values(): + for plot in category_plots: + if plot['path'] == file_path: + already_added = True + break + + if not already_added: + rel_path = os.path.relpath(file_path, st.session_state.plots_path) + all_png_files.append({ + 'path': file_path, + 'name': filename, + 'rel_path': rel_path, + 'size': os.path.getsize(file_path) + }) + + if all_png_files: + available_plots['other'] = all_png_files + + st.session_state.available_plots = available_plots + + def render_step_5_results(self): + """Step 5: Results""" + st.header("๐Ÿ“Š Pipeline Results") + + if not st.session_state.pipeline_completed or st.session_state.processed_data is None: + st.warning("Pipeline not yet run or not completed successfully") + + # Suggest using quick test 
            st.markdown("---")
            st.subheader("๐ŸŽฎ Quick Start")

            col1, col2 = st.columns(2)
            with col1:
                if st.button("๐Ÿš€ Run Quick Test", type="primary", width='stretch'):
                    st.session_state.quick_test_mode = True
                    st.session_state.current_step = 1
                    st.rerun()

            with col2:
                if st.button("Load Data", width='stretch'):
                    st.session_state.current_step = 1
                    st.rerun()

            return

        data = st.session_state.processed_data
        modeling_data = st.session_state.modeling_data

        # Results tabs
        tab1, tab2, tab3, tab4 = st.tabs([
            "๐Ÿ“ˆ Data Overview",
            "๐Ÿ“Š Feature Analysis",
            "๐Ÿ“‰ Validation",
            "๐Ÿ’พ Export"
        ])

        with tab1:
            st.subheader("Processed Data")

            # Basic information
            info_col1, info_col2, info_col3, info_col4 = st.columns(4)

            with info_col1:
                st.metric("Total Records", f"{data.shape[0]:,}")
            with info_col2:
                st.metric("Total Features", data.shape[1])
            with info_col3:
                numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
                st.metric("Numeric Features", len(numeric_cols))
            with info_col4:
                missing_total = data.isnull().sum().sum()
                st.metric("Missing Values", missing_total)

            # Data preview
            st.subheader("Data Preview")
            st.dataframe(data.head(100), width='stretch')

            # Statistics
            st.subheader("Processed Data Statistics")
            st.dataframe(data.describe().round(4), width='stretch')

        with tab2:
            st.subheader("Feature Analysis")

            if modeling_data and 'feature_names' in modeling_data:
                features = modeling_data['feature_names']

                # Feature list
                st.write(f"**Selected Features:** {len(features)}")

                # Display features as cards, 4 per row
                cols_per_row = 4
                for i in range(0, len(features), cols_per_row):
                    cols = st.columns(cols_per_row)
                    for j in range(cols_per_row):
                        idx = i + j
                        if idx < len(features):
                            with cols[j]:
                                st.info(features[idx])

                # Feature importance (if available)
                if (self.pipeline is not None and
                        hasattr(self.pipeline, 'feature_selector') and
                        self.pipeline.feature_selector is not None):

                    # Check for feature_importances_
                    if hasattr(self.pipeline.feature_selector, 'feature_importances_'):
                        importances = self.pipeline.feature_selector.feature_importances_

                        if importances is not None and len(importances) > 0:
                            # Truncate whichever list is longer so both columns
                            # have equal length before building the frame.
                            importance_df = pd.DataFrame({
                                'Feature': features[:len(importances)] if len(features) >= len(importances) else features,
                                'Importance': importances[:len(features)] if len(importances) >= len(features) else importances
                            }).sort_values('Importance', ascending=False)

                            st.subheader("Feature Importance")

                            fig = px.bar(
                                importance_df.head(20),
                                x='Importance',
                                y='Feature',
                                orientation='h',
                                title="Top-20 Features by Importance",
                                color='Importance',
                                color_continuous_scale='Viridis'
                            )
                            st.plotly_chart(fig, width='stretch')

            # Correlation matrix (limited for performance)
            if data.shape[1] <= 50:  # Performance limit
                st.subheader("Correlation Matrix (first 20 features)")

                # Select only numeric columns and limit quantity
                numeric_data = data.select_dtypes(include=[np.number])
                if len(numeric_data.columns) > 20:
                    numeric_data = numeric_data.iloc[:, :20]

                if not numeric_data.empty and len(numeric_data.columns) > 1:
                    corr_matrix = numeric_data.corr()

                    fig = go.Figure(data=go.Heatmap(
                        z=corr_matrix.values,
                        x=corr_matrix.columns,
                        y=corr_matrix.columns,
                        colorscale='RdBu',
                        zmin=-1,
                        zmax=1,
                        text=corr_matrix.round(2).values,
                        texttemplate='%{text}',
                        textfont={"size": 10}
                    ))

                    fig.update_layout(
                        title="Correlation Matrix",
                        width=800,
                        height=800
                    )

                    st.plotly_chart(fig, width='stretch')
                else:
                    st.info("Insufficient data for correlation matrix")

        with tab3:
            st.subheader("Validation Results")

            # Improved validation result availability check: the report may
            # live under several different pipeline attributes/keys.
            validation_available = False
            validation_data = None

            if self.pipeline is not None:
                # Check for results in pipeline
                if hasattr(self.pipeline, 'results'):
                    # Look for validation results under different keys
                    validation_keys = ['final_validation', 'validation_results', 'validation', 'validation_checks']
                    for key in validation_keys:
                        if key in self.pipeline.results:
                            validation_data = self.pipeline.results[key]
                            validation_available = True
                            break

                # If not found in results, check other attributes
                if not validation_available and hasattr(self.pipeline, 'validation_report'):
                    validation_data = self.pipeline.validation_report
                    validation_available = True

                # Or check processing results
                if not validation_available and hasattr(self.pipeline, 'get_validation_summary'):
                    try:
                        validation_data = self.pipeline.get_validation_summary()
                        validation_available = True
                    except:
                        # NOTE(review): bare except silently swallows any
                        # failure from get_validation_summary — deliberate
                        # best-effort fallback, but consider narrowing.
                        pass

            # If validation results available
            if validation_available and validation_data:
                st.success("โœ… Validation results available")

                # Check validation data format
                if isinstance(validation_data, dict):
                    # Display as dictionary
                    col1, col2 = st.columns(2)

                    with col1:
                        # Status
                        status = validation_data.get('status', 'UNKNOWN')
                        if status == 'PASS':
                            st.success(f"Status: {status}")
                        elif status == 'WARNING':
                            st.warning(f"Status: {status}")
                        else:
                            st.error(f"Status: {status}")

                        # Overall score
                        score = validation_data.get('overall_score', validation_data.get('score', 0))
                        if score:
                            st.metric("Overall Score", f"{score}/100")

                    with col2:
                        # Check counters: checks may sit under 'checks',
                        # 'basic_checks', or be the dict itself.
                        if 'checks' in validation_data:
                            checks = validation_data['checks']
                        elif 'basic_checks' in validation_data:
                            checks = validation_data['basic_checks']
                        else:
                            checks = validation_data

                        if isinstance(checks, dict):
                            passed = sum(1 for check in checks.values()
                                         if isinstance(check, dict) and check.get('passed', False))
                            total = len(checks)
                            st.metric("Checks Passed", f"{passed}/{total}")

                    # Check details
                    st.subheader("Check Details")

                    # Determine where checks are located
                    checks_to_display = None
                    if 'checks' in validation_data:
                        checks_to_display = validation_data['checks']
                    elif 'basic_checks' in validation_data:
                        checks_to_display = validation_data['basic_checks']
                    elif any(isinstance(v, dict) and 'passed' in v for v in validation_data.values()):
                        checks_to_display = validation_data

                    if checks_to_display and isinstance(checks_to_display, dict):
                        for check_name, check_info in checks_to_display.items():
                            if isinstance(check_info, dict):
                                col1, col2, col3 = st.columns([3, 1, 3])

                                with col1:
                                    # Check description
                                    description = check_info.get('description', check_name)
                                    st.write(f"**{description}**")

                                with col2:
                                    # Status
                                    if check_info.get('passed', False):
                                        st.success("โœ…")
                                    else:
                                        st.error("โŒ")

                                with col3:
                                    # Message
                                    if 'message' in check_info:
                                        st.caption(check_info['message'])
                            else:
                                # Simple format
                                st.write(f"**{check_name}**: {check_info}")
                    else:
                        # Display all validation data
                        st.json(validation_data)
                else:
                    # If not dictionary, display as is
                    st.write("Validation results:")
                    st.write(validation_data)
            else:
                # If no validation results, show pipeline information
                st.info("Validation results in report format not available, but pipeline execution statistics presented below")

                # Pipeline stage statistics
                st.subheader("Pipeline Execution Statistics")

                # Create stage table
                # NOTE(review): most stage statuses below are hard-coded
                # "Completed" and are not derived from actual pipeline state.
                stages = [
                    ("Data Loading", "โœ… Successful" if data is not None else "โŒ Error"),
                    ("Missing Value Processing", "โœ… Completed"),
                    ("Outlier Processing", "โœ… Completed"),
                    ("Feature Engineering", "โœ… Completed"),
                    ("Scaling", "โœ… Completed"),
                    ("Feature Selection", "โœ… Completed"),
                    ("Data Split", "โœ… Completed" if modeling_data else "โŒ Not completed")
                ]

                for stage_name, status in stages:
                    col1, col2 = st.columns([3, 1])
                    with col1:
                        st.write(f"**{stage_name}**")
                    with col2:
                        if "โœ…" in status:
                            st.success(status)
                        else:
                            st.error(status)

                # If pipeline exists, show available metrics
                if self.pipeline is not None:
                    # Check for various metrics
                    st.subheader("Data Quality Metrics")

                    col1, col2, col3 = st.columns(3)

                    with col1:
                        # Data quality
                        if data is not None:
                            missing_pct = (data.isnull().sum().sum() / (data.shape[0] * data.shape[1])) * 100
                            st.metric("Missing Values", f"{missing_pct:.2f}%")

                    with col2:
                        # Feature information
                        if data is not None:
                            numeric_cols = len(data.select_dtypes(include=[np.number]).columns)
                            st.metric("Numeric Features", numeric_cols)

                    with col3:
                        # Split information
                        if modeling_data and 'X_train' in modeling_data:
                            train_size = len(modeling_data['X_train'])
                            total_size = train_size
                            if 'X_test' in modeling_data:
                                total_size += len(modeling_data['X_test'])
                            if 'X_val' in modeling_data:
                                total_size += len(modeling_data['X_val'])

                            if total_size > 0:
                                train_pct = (train_size / total_size) * 100
                                st.metric("Training Set", f"{train_pct:.1f}%")

        with tab4:
            st.subheader("Data Export")

            # Export formats
            export_format = st.radio(
                "Export Format",
                options=['CSV', 'Parquet', 'Excel'],
                horizontal=True
            )

            # Export buttons
            if data is not None:
                # Export processed data
                st.write("**Processed Data**")

                if export_format == 'CSV':
                    csv = data.to_csv(index=True)
                    st.download_button(
                        label="๐Ÿ“ฅ Download CSV",
                        data=csv,
                        file_name="streamlit_processed_data.csv",
                        mime="text/csv",
                        width='stretch'
                    )

                elif export_format == 'Parquet':
                    # For Parquet need to save to an in-memory buffer
                    import io
                    buffer = io.BytesIO()
                    data.to_parquet(buffer)
                    buffer.seek(0)

                    st.download_button(
                        label="๐Ÿ“ฅ Download Parquet",
                        data=buffer,
                        file_name="streamlit_processed_data.parquet",
                        mime="application/octet-stream",
                        width='stretch'
                    )

                elif export_format == 'Excel':
                    import io
                    buffer = io.BytesIO()
                    # openpyxl must be installed for Excel export
                    with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                        data.to_excel(writer, sheet_name='Processed_Data')

                    buffer.seek(0)

                    st.download_button(
                        label="๐Ÿ“ฅ Download Excel",
                        data=buffer,
                        file_name="streamlit_processed_data.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + width='stretch' + ) + + # Export modeling data + if modeling_data: + st.write("**Modeling Data**") + + col1, col2, col3 = st.columns(3) + + with col1: + if 'X_train' in modeling_data and modeling_data['X_train'] is not None: + train_df = pd.concat([ + modeling_data['X_train'], + modeling_data['y_train'].rename('target') + ], axis=1) if 'y_train' in modeling_data else modeling_data['X_train'] + + st.download_button( + label="๐Ÿ“ฅ Training Set", + data=train_df.to_csv(), + file_name="train_data.csv", + mime="text/csv", + width='stretch' + ) + + with col2: + if 'X_val' in modeling_data and modeling_data['X_val'] is not None: + val_df = pd.concat([ + modeling_data['X_val'], + modeling_data['y_val'].rename('target') + ], axis=1) if 'y_val' in modeling_data else modeling_data['X_val'] + + st.download_button( + label="๐Ÿ“ฅ Validation Set", + data=val_df.to_csv(), + file_name="validation_data.csv", + mime="text/csv", + width='stretch' + ) + + with col3: + if 'X_test' in modeling_data and modeling_data['X_test'] is not None: + test_df = pd.concat([ + modeling_data['X_test'], + modeling_data['y_test'].rename('target') + ], axis=1) if 'y_test' in modeling_data else modeling_data['X_test'] + + st.download_button( + label="๐Ÿ“ฅ Test Set", + data=test_df.to_csv(), + file_name="test_data.csv", + mime="text/csv", + width='stretch' + ) + + # Navigation + st.markdown("---") + col1, col2, col3 = st.columns([1, 1, 1]) + + with col1: + if st.button("โฌ…๏ธ Back to Pipeline", width='stretch'): + st.session_state.current_step = 4 + st.rerun() + + with col3: + if st.button("Go to Visualisations โžก๏ธ", type="primary", width='stretch'): + st.session_state.current_step = 6 + st.rerun() + + def render_step_6_visualisations(self): + """Step 6: Visualisations""" + st.header("๐Ÿ“ˆ Pipeline Visualisations") + + if not st.session_state.pipeline_completed: + st.warning("First run pipeline in Step 4") + + # Suggest quick 
test + st.markdown("---") + st.subheader("๐ŸŽฎ Quick Test") + + col1, col2 = st.columns(2) + with col1: + if st.button("๐Ÿš€ Run Quick Test", type="primary", width='stretch'): + st.session_state.quick_test_mode = True + st.session_state.current_step = 1 + st.rerun() + + with col2: + if st.button("Run Pipeline", width='stretch'): + st.session_state.current_step = 4 + st.rerun() + + return + + # Check for plots + if not st.session_state.available_plots: + st.warning("Plots not found. Ensure pipeline was run with visualisation option enabled.") + + # Try to collect plots again + if st.button("Try to Find Plots", width='stretch'): + self.collect_available_plots() + st.rerun() + + return + + # Plot statistics + total_plots = sum(len(plots) for plots in st.session_state.available_plots.values()) + st.success(f"โœ… Found {total_plots} plots") + + # Plot category tabs + categories = list(st.session_state.available_plots.keys()) + + if 'summary' in categories: + categories.remove('summary') + categories.insert(0, 'summary') + + tabs = st.tabs([cat.capitalize().replace('_', ' ') for cat in categories]) + + for i, category in enumerate(categories): + with tabs[i]: + self.display_category_plots(category) + + # All plots in one gallery + st.markdown("---") + st.subheader("๐Ÿ–ผ๏ธ All Plots Gallery") + + # Collect all plots + all_plots = [] + for category, plots in st.session_state.available_plots.items(): + for plot in plots: + all_plots.append((category, plot)) + + # Display plots in grid + cols_per_row = 3 + for i in range(0, len(all_plots), cols_per_row): + cols = st.columns(cols_per_row) + for j in range(cols_per_row): + idx = i + j + if idx < len(all_plots): + category, plot_info = all_plots[idx] + with cols[j]: + self.display_plot_card(plot_info, category) + + def display_category_plots(self, category): + """Display plots in category""" + plots = st.session_state.available_plots.get(category, []) + + if not plots: + st.info(f"No plots in category '{category}'") + return + 
+ st.subheader(f"{category.capitalize().replace('_', ' ')} ({len(plots)} plots)") + + # Sort plots by name + plots_sorted = sorted(plots, key=lambda x: x['name']) + + # Display plots in accordions for convenience + for plot_info in plots_sorted: + with st.expander(f"๐Ÿ“Š {plot_info['name'].replace('_', ' ').replace('.png', '')}", expanded=True): + self.display_plot_image(plot_info) + + def display_plot_card(self, plot_info, category): + """Display plot card""" + try: + # Load image + image = Image.open(plot_info['path']) + + # Create safe key for state + safe_key = plot_info['path'].replace('/', '_').replace('\\', '_').replace('.', '_') + + # Initialise state for this plot if not exists + if f"show_{safe_key}" not in st.session_state: + st.session_state[f"show_{safe_key}"] = False + + # Create card + with st.container(): + st.markdown(f"**{plot_info['name'].replace('_', ' ').replace('.png', '')}**") + st.image(image, width='stretch', caption=plot_info['rel_path']) + + # File information + size_kb = plot_info['size'] / 1024 + st.caption(f"Size: {size_kb:.1f} KB | Category: {category}") + + # Zoom control buttons + col1, col2 = st.columns(2) + + with col1: + # Zoom button + if st.button("๐Ÿ” Zoom", key=f"zoom_{safe_key}", width='stretch'): + st.session_state[f"show_{safe_key}"] = True + # Don't use st.rerun() here + + with col2: + # Hide zoomed image button (if shown) + if st.session_state[f"show_{safe_key}"]: + if st.button("โœ• Hide", key=f"hide_{safe_key}", width='stretch'): + st.session_state[f"show_{safe_key}"] = False + # Don't use st.rerun() here + + # If zoom button clicked, show zoomed image + if st.session_state[f"show_{safe_key}"]: + st.markdown("---") + st.subheader(f"๐Ÿ” {plot_info['name'].replace('_', ' ').replace('.png', '')}") + st.image(image, width='stretch') + + except Exception as e: + st.error(f"Error loading plot: {str(e)}") + st.code(f"Path: {plot_info['path']}") + + + def display_plot_image(self, plot_info): + """Display plot image""" + try: 
+ # Load image + image = Image.open(plot_info['path']) + + # Display with information + col1, col2 = st.columns([3, 1]) + + with col1: + st.image(image, width='stretch') + + with col2: + # File information + st.metric("Size", f"{plot_info['size'] / 1024:.1f} KB") + st.metric("Resolution", f"{image.width}ร—{image.height}") + + # File format + st.write(f"**Format:** {image.format}") + + # Download button + with open(plot_info['path'], 'rb') as file: + btn = st.download_button( + label="๐Ÿ“ฅ Download", + data=file, + file_name=plot_info['name'], + mime="image/png", + width='stretch' + ) + + except Exception as e: + st.error(f"Error loading plot: {str(e)}") + st.code(f"Path: {plot_info['path']}") + + def render_step_7_modeling(self): + """Step 7: Modelling Preparation""" + st.header("๐Ÿค– Modelling Preparation") + + if not st.session_state.pipeline_completed or st.session_state.modeling_data is None: + st.warning("First run pipeline in Step 4") + + # Suggest quick test + st.markdown("---") + st.subheader("๐ŸŽฎ Quick Test") + + col1, col2 = st.columns(2) + with col1: + if st.button("๐Ÿš€ Run Quick Test", type="primary", width='stretch'): + st.session_state.quick_test_mode = True + st.session_state.current_step = 1 + st.rerun() + + with col2: + if st.button("Run Pipeline", width='stretch'): + st.session_state.current_step = 4 + st.rerun() + + return + + modeling_data = st.session_state.modeling_data + + # Basic information + col1, col2, col3, col4 = st.columns(4) + + with col1: + if 'X_train' in modeling_data and modeling_data['X_train'] is not None: + st.metric("Training Set", f"{modeling_data['X_train'].shape[0]:,} records") + with col2: + if 'X_val' in modeling_data and modeling_data['X_val'] is not None: + st.metric("Validation Set", f"{modeling_data['X_val'].shape[0]:,} records") + with col3: + if 'X_test' in modeling_data and modeling_data['X_test'] is not None: + st.metric("Test Set", f"{modeling_data['X_test'].shape[0]:,} records") + with col4: + if 
'feature_names' in modeling_data and modeling_data['feature_names'] is not None: + st.metric("Number of Features", len(modeling_data['feature_names'])) + + # Tabs + tab1, tab2, tab3 = st.tabs([ + "๐Ÿ“ Data Structure", + "๐Ÿ“Š Target Variable Distribution", + "๐Ÿ”— ML Integration" + ]) + + with tab1: + st.subheader("Modeling Data Structure") + + # Information table + data_info = [] + + if 'X_train' in modeling_data and modeling_data['X_train'] is not None: + data_info.append({ + 'Dataset': 'Training', + 'Samples': modeling_data['X_train'].shape[0], + 'Features': modeling_data['X_train'].shape[1], + 'Target Variable': 'Yes' if 'y_train' in modeling_data and modeling_data['y_train'] is not None else 'No' + }) + + if 'X_val' in modeling_data and modeling_data['X_val'] is not None: + data_info.append({ + 'Dataset': 'Validation', + 'Samples': modeling_data['X_val'].shape[0], + 'Features': modeling_data['X_val'].shape[1], + 'Target Variable': 'Yes' if 'y_val' in modeling_data and modeling_data['y_val'] is not None else 'No' + }) + + if 'X_test' in modeling_data and modeling_data['X_test'] is not None: + data_info.append({ + 'Dataset': 'Test', + 'Samples': modeling_data['X_test'].shape[0], + 'Features': modeling_data['X_test'].shape[1], + 'Target Variable': 'Yes' if 'y_test' in modeling_data and modeling_data['y_test'] is not None else 'No' + }) + + if data_info: + st.table(pd.DataFrame(data_info)) + else: + st.info("Modeling data not available") + + # Data sample + st.subheader("Training Data Sample") + + if ('X_train' in modeling_data and modeling_data['X_train'] is not None and + 'y_train' in modeling_data and modeling_data['y_train'] is not None): + sample_data = pd.concat([ + modeling_data['X_train'].head(10), + modeling_data['y_train'].head(10).rename('target') + ], axis=1) + + st.dataframe(sample_data, width='stretch') + + with tab2: + st.subheader("Target Variable Distribution") + + if 'y_train' in modeling_data and modeling_data['y_train'] is not None: + # Target 
variable histogram + fig = px.histogram( + x=modeling_data['y_train'], + nbins=50, + title="Target Variable Distribution (Training Set)", + labels={'x': 'Target Variable', 'y': 'Frequency'}, + color_discrete_sequence=['#00CC96'] + ) + + st.plotly_chart(fig, width='stretch') + + # Statistics + col1, col2, col3, col4 = st.columns(4) + + with col1: + st.metric("Mean", f"{modeling_data['y_train'].mean():.2f}") + with col2: + st.metric("Standard Deviation", f"{modeling_data['y_train'].std():.2f}") + with col3: + st.metric("Minimum", f"{modeling_data['y_train'].min():.2f}") + with col4: + st.metric("Maximum", f"{modeling_data['y_train'].max():.2f}") + else: + st.info("Target variable not available") + + with tab3: + st.subheader("Machine Learning Library Integration") + + st.info(""" + Your data is ready for use with any Python ML libraries. + Below are code examples for various libraries. + """) + + # Library selection + ml_library = st.selectbox( + "Select ML Library", + options=["Scikit-learn", "XGBoost", "LightGBM", "CatBoost", "PyTorch", "TensorFlow"] + ) + + # Code generation + code_placeholder = st.empty() + + if ml_library == "Scikit-learn": + code = """# Example usage with Scikit-learn +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import mean_squared_error, r2_score +import numpy as np + +# Use prepared data +X_train = modeling_data['X_train'] +y_train = modeling_data['y_train'] +X_val = modeling_data['X_val'] +y_val = modeling_data['y_val'] + +# Create and train model +model = RandomForestRegressor( + n_estimators=100, + max_depth=10, + random_state=42 +) + +model.fit(X_train, y_train) + +# Predictions and evaluation +y_pred = model.predict(X_val) + +print(f"RMSE: {np.sqrt(mean_squared_error(y_val, y_pred)):.4f}") +print(f"Rยฒ Score: {r2_score(y_val, y_pred):.4f}") +print(f"Feature Importance: {model.feature_importances_}")""" + + elif ml_library == "XGBoost": + code = """# Example usage with XGBoost +import xgboost as xgb +from 
sklearn.metrics import mean_squared_error +import numpy as np + +# Prepare data in DMatrix format +dtrain = xgb.DMatrix(modeling_data['X_train'], label=modeling_data['y_train']) +dval = xgb.DMatrix(modeling_data['X_val'], label=modeling_data['y_val']) + +# Model parameters +params = { + 'objective': 'reg:squarederror', + 'max_depth': 6, + 'learning_rate': 0.1, + 'subsample': 0.8, + 'colsample_bytree': 0.8, + 'seed': 42 +} + +# Train model +model = xgb.train( + params, + dtrain, + num_boost_round=100, + evals=[(dval, 'validation')], + early_stopping_rounds=10, + verbose_eval=False +) + +# Predictions +y_pred = model.predict(dval) + +print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}") +print(f"Number of Trees: {model.best_ntree_limit}")""" + + elif ml_library == "LightGBM": + code = """# Example usage with LightGBM +import lightgbm as lgb +from sklearn.metrics import mean_squared_error +import numpy as np + +# Prepare data +train_data = lgb.Dataset( + modeling_data['X_train'], + label=modeling_data['y_train'] +) + +val_data = lgb.Dataset( + modeling_data['X_val'], + label=modeling_data['y_val'], + reference=train_data +) + +# Model parameters +params = { + 'objective': 'regression', + 'metric': 'rmse', + 'num_leaves': 31, + 'learning_rate': 0.05, + 'feature_fraction': 0.9, + 'bagging_fraction': 0.8, + 'bagging_freq': 5, + 'verbose': 0 +} + +# Train model +model = lgb.train( + params, + train_data, + valid_sets=[val_data], + num_boost_round=100, + callbacks=[lgb.early_stopping(10)] +) + +# Predictions +y_pred = model.predict(modeling_data['X_val']) + +print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}") +print(f"Best Iteration: {model.best_iteration}")""" + + else: + code = f"""# Template for {ml_library} +# Your data available in modeling_data variable + +X_train = modeling_data['X_train'] +y_train = modeling_data['y_train'] +X_val = modeling_data['X_val'] +y_val = modeling_data['y_val'] +X_test = 
modeling_data['X_test'] +y_test = modeling_data['y_test'] + +# Code for {ml_library}... +print(f"Data sizes:") +print(f" X_train: {{X_train.shape}}") +print(f" y_train: {{y_train.shape}}") +print(f" X_val: {{X_val.shape}}") +print(f" X_test: {{X_test.shape}}")""" + + # Display code + code_placeholder.code(code, language='python') + + # Copy code button + try: + import pyperclip + if st.button("๐Ÿ“‹ Copy Code", width='stretch'): + try: + pyperclip.copy(code) + st.success("Code copied to clipboard!") + except: + st.warning("Failed to copy code. Copy manually.") + except: + st.warning("To copy code, install pyperclip library: pip install pyperclip") + + # Final information + st.markdown("---") + st.success(""" + ๐ŸŽ‰ Congratulations! You have successfully prepared data for machine learning. + + **Next Steps:** + 1. Use code above for integration with chosen ML library + 2. Experiment with various models + 3. Optimise hyperparameters + 4. Evaluate results on test set + """) + + # Navigation + col1, col2 = st.columns([1, 1]) + + with col1: + if st.button("โฌ…๏ธ Back to Visualisations", width='stretch'): + st.session_state.current_step = 6 + st.rerun() + + with col2: + if st.button("๐Ÿ”„ Run New Pipeline", type="primary", width='stretch'): + # Reset state + st.session_state.pipeline_completed = False + st.session_state.processed_data = None + st.session_state.modeling_data = None + st.session_state.current_step = 1 + st.session_state.uploaded_file = None + st.session_state.plots_path = None + st.session_state.available_plots = {} + st.session_state.synthetic_data_generated = False + st.session_state.auto_pipeline_ready = False + st.session_state.quick_test_mode = False + st.rerun() + + def render_footer(self): + """Application footer""" + st.markdown("---") + + col1, col2, col3 = st.columns(3) + + with col1: + st.markdown("**TimeFlowPro** v1.1.0") + st.caption("Added synthetic data generation") + + with col2: + st.markdown("๐Ÿ“ง Contacts: cool.araby@gmail.com") + + with 
col3: + st.markdown("ยฉ 2026 All Rights Reserved") + + def run(self): + """Run application""" + # Header + st.title("๐Ÿ“Š TimeFlow Pro - Data Analysis and Preprocessing") + st.markdown("---") + + # Sidebar + self.create_sidebar() + + # Main content depending on step + if st.session_state.current_step == 1: + self.render_step_1_data_loading() + elif st.session_state.current_step == 2: + self.render_step_2_configuration() + elif st.session_state.current_step == 3: + self.render_step_3_data_analysis() + elif st.session_state.current_step == 4: + self.render_step_4_pipeline_execution() + elif st.session_state.current_step == 5: + self.render_step_5_results() + elif st.session_state.current_step == 6: + self.render_step_6_visualisations() + elif st.session_state.current_step == 7: + self.render_step_7_modeling() + + # Footer + self.render_footer() + +# ============================================ +# APPLICATION LAUNCH +# ============================================ +if __name__ == "__main__": + app = StreamlitApp() + app.run() \ No newline at end of file