# ============================================
# TimeFlow Pro - Data Analysis and Preprocessing
# ============================================
"""Streamlit front-end for the TimeFlow Pro preprocessing pipeline.

Provides a multi-step wizard (load -> configure -> analyse -> run -> results)
around ``EnhancedDataPreprocessingPipeline``, plus a synthetic-data generator
for quick end-to-end testing without a real dataset.
"""
import sys
import os

# Make the project root importable before project-local imports below.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import streamlit as st
import pandas as pd
import numpy as np
# NOTE(review): os and sys are imported a second time here (already imported
# above); harmless but redundant — consider deduplicating.
import os
import sys
import glob
import re
from datetime import datetime, timedelta
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from PIL import Image
import matplotlib.pyplot as plt
import warnings
from pipeline.main_pipeline import EnhancedDataPreprocessingPipeline

warnings.filterwarnings('ignore')

# Add project path
# NOTE(review): this appends the parent-of-parent directory, while the insert
# at the top added the parent directory — presumably both are needed for the
# package layout; verify against the repository structure.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from config.config import Config
from data_loader.data_loader import DataLoader
from visualization.visualization_manager import VisualisationManager

# ============================================
# PAGE CONFIGURATION
# ============================================
st.set_page_config(
    page_title="TimeFlow Pro - Data Analysis and Preprocessing",
    page_icon="๐Ÿ“Š",
    layout="wide",
    initial_sidebar_state="expanded"
)


# ============================================
# STATE MANAGEMENT CLASS
# ============================================
class StreamlitApp:
    """Main Streamlit application class.

    Owns the wizard navigation, session-state initialisation, synthetic-data
    generation, and the glue between the UI and the preprocessing pipeline.
    All cross-rerun state lives in ``st.session_state``; instance attributes
    (``config``, ``pipeline``, ``data``) are per-rerun scratch space.
    """

    def __init__(self):
        # Session state must exist before any page renders.
        self.init_session_state()
        self.config = None    # Config instance built from config_params
        self.pipeline = None  # EnhancedDataPreprocessingPipeline instance
        self.data = None      # unused scratch slot

    def init_session_state(self):
        """Initialise session state.

        Idempotent: each key is created only if missing, so reruns keep
        whatever the user has already done.
        """
        if 'pipeline_completed' not in st.session_state:
            st.session_state.pipeline_completed = False
        if 'processed_data' not in st.session_state:
            st.session_state.processed_data = None
        if 'modeling_data' not in st.session_state:
            st.session_state.modeling_data = None
        if 'current_step' not in st.session_state:
            st.session_state.current_step = 1
        if 'uploaded_file' not in st.session_state:
            st.session_state.uploaded_file = None
        if 'config_params' not in st.session_state:
            st.session_state.config_params = self.get_default_config()
        if 'plots_path' not in st.session_state:
            st.session_state.plots_path = None
        if 'available_plots' not in st.session_state:
            st.session_state.available_plots = {}
        if 'synthetic_data_generated' not in st.session_state:
            st.session_state.synthetic_data_generated = False
        if 'auto_pipeline_ready' not in st.session_state:
            st.session_state.auto_pipeline_ready = False
        if 'quick_test_mode' not in st.session_state:
            st.session_state.quick_test_mode = False

    def get_default_config(self):
        """Get default configuration.

        Returns:
            dict: Default keyword arguments later passed to ``Config(**...)``.
        """
        return {
            'data_path': '',
            'results_dir': 'streamlit_results',
            'target_column': '',
            'start_year': 1970,
            'end_year': 1990,
            'max_lags': 5,
            'seasonal_period': 365,
            'rolling_windows': [7, 30, 90],
            'expanding_windows': [30, 90],
            'test_size': 0.2,
            'validation_size': 0.1,
            'scaling_method': 'robust',
            'feature_selection_method': 'correlation',
            'max_features': 20,
            'missing_threshold': 0.3,
            'outlier_method': 'iqr',
            'enable_validation': True,
            'split_method': 'time_based'
        }

    def create_sidebar(self):
        """Create sidebar: navigation buttons, quick-test launcher, info box."""
        with st.sidebar:
            st.title("๐ŸŽฏ TimeFlowPro")
            st.markdown("---")

            # Navigation — one button per wizard step; the active step is
            # rendered as "primary".
            st.subheader("Navigation")
            steps = {
                1: "๐Ÿ“ Data Loading",
                2: "โš™๏ธ Configuration",
                3: "๐Ÿ” Data Analysis",
                4: "โšก Pipeline Execution",
                5: "๐Ÿ“Š Results",
                6: "๐Ÿ“ˆ Visualisations",
                7: "๐Ÿค– Modelling"
            }
            for step_num, step_name in steps.items():
                if st.button(
                    f"{step_name}",
                    key=f"nav_{step_num}",
                    type="primary" if st.session_state.current_step == step_num else "secondary",
                    width='stretch'
                ):
                    st.session_state.current_step = step_num
                    st.rerun()

            st.markdown("---")

            # Quick start with synthetic data
            st.subheader("โšก Quick Test")
            if st.button("๐Ÿš€ Quick Start with Synthetic Data",
                         type="primary",
                         width='stretch',
                         help="Generate synthetic data and run pipeline immediately"):
                # Step 1's renderer detects quick_test_mode and runs the
                # whole pipeline automatically.
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()

            st.markdown("---")

            # Project information
            st.subheader("๐Ÿ“ˆ About the Project")
            st.info("""
            TimeFlow Pro - Data Analysis and Preprocessing.

            **New Features:**
            - Synthetic data generation for testing
            - Automatic pipeline execution
            - Quick testing without file upload

            **Standard Features:**
            - Missing data analysis and processing
            - Outlier detection
            - Feature engineering
            - Stationarity analysis
            - Data scaling
            - Feature selection
            """)

            # Progress indicator
            if st.session_state.pipeline_completed:
                st.success("โœ… Pipeline completed")
            else:
                st.warning("โš ๏ธ Pipeline not started")

            # Quick test indicator
            if st.session_state.quick_test_mode:
                st.info("โšก Quick test mode active")

    def generate_synthetic_data(self, n_days=1095, include_seasonality=True, include_trend=True,
                                include_noise=True, include_exogenous=True, data_type="complex"):
        """
        Generate synthetic data for testing

        Args:
            n_days (int): Number of days of data
            include_seasonality (bool): Include seasonality
            include_trend (bool): Include trend
            include_noise (bool): Include noise
            include_exogenous (bool): Include exogenous variables
            data_type (str): Data type (simple, medium, complex)

        Returns:
            pd.DataFrame: Generated synthetic data, or None on error.
        """
        try:
            # Base parameters depending on data type.
            # Note that data_type may also cap n_days and override
            # include_exogenous (forced off for "simple").
            if data_type == "simple":
                n_days = min(n_days, 365)  # Limit for simple type
                trend_strength = 0.005
                noise_std = 2
                include_exogenous = False
            elif data_type == "medium":
                n_days = min(n_days, 730)  # Limit for medium type
                trend_strength = 0.01
                noise_std = 5
                include_exogenous = True
            else:  # complex
                n_days = min(n_days, 1095)  # Limit for complex type
                trend_strength = 0.02
                noise_std = 10
                include_exogenous = True

            # Create dates — daily frequency ending roughly at "today".
            start_date = datetime.now() - timedelta(days=n_days)
            dates = pd.date_range(start=start_date, periods=n_days, freq='D')

            # Base trend (linear from 0 to trend_strength * n_days)
            if include_trend:
                trend = np.linspace(0, trend_strength * n_days, n_days)
            else:
                trend = np.zeros(n_days)

            # Seasonality — superposition of several sine waves.
            if include_seasonality:
                # Annual seasonality
                seasonal = 10 * np.sin(2 * np.pi * np.arange(n_days) / 365)
                # Quarterly seasonality
                seasonal += 5 * np.sin(2 * np.pi * np.arange(n_days) / 90)
                # Monthly seasonality
                seasonal += 3 * np.sin(2 * np.pi * np.arange(n_days) / 30)
                # Weekly seasonality
                seasonal += 2 * np.sin(2 * np.pi * np.arange(n_days) / 7)
            else:
                seasonal = np.zeros(n_days)

            # Main target variable (water consumption)
            base_value = 100
            raskhodvoda = base_value + trend + seasonal

            # Add noise
            if include_noise:
                noise = np.random.normal(0, noise_std, n_days)
                raskhodvoda += noise

            # Create DataFrame
            data = pd.DataFrame({
                'date': dates,
                'raskhodvoda': raskhodvoda
            })

            # Add exogenous variables
            if include_exogenous:
                # Temperature (seasonal)
                data['temperature'] = 15 + 10 * np.sin(2 * np.pi * np.arange(n_days) / 365) + np.random.normal(0, 3, n_days)
                # Precipitation (random spikes)
                precipitation = np.random.exponential(2, n_days)
                # Add seasonality to precipitation
                precipitation_seasonality = 5 * np.sin(2 * np.pi * np.arange(n_days) / 365 + np.pi/2)
                data['precipitation'] = np.maximum(0, precipitation + precipitation_seasonality)
                # Pressure
                data['pressure'] = 760 + np.random.normal(0, 5, n_days)
                # Humidity
                data['humidity'] = 60 + 20 * np.sin(2 * np.pi * np.arange(n_days) / 180) + np.random.normal(0, 10, n_days)
                # Electricity consumption (correlated with target variable)
                data['electricity_consumption'] = raskhodvoda * 0.8 + np.random.normal(0, 5, n_days)
                # Day of week (categorical variable)
                data['day_of_week'] = dates.dayofweek
                data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)
                # Holidays (random, ~5% of days)
                holidays = np.random.choice([0, 1], size=n_days, p=[0.95, 0.05])
                data['is_holiday'] = holidays
                # Lag variables (first `lag` rows become NaN via shift)
                for lag in [1, 7, 30]:
                    data[f'raskhodvoda_lag_{lag}'] = data['raskhodvoda'].shift(lag)
                # Moving averages (first window-1 rows become NaN)
                for window in [7, 30]:
                    data[f'raskhodvoda_ma_{window}'] = data['raskhodvoda'].rolling(window=window).mean()

            # Add missing values for realism (5% random missing values)
            # CORRECTION: proper creation of missing value mask
            for col in data.columns:
                if col != 'date':  # Don't add missing values to dates
                    mask = np.random.random(len(data)) < 0.05
                    data.loc[mask, col] = np.nan

            # Add outliers (1% of data)
            # CORRECTION: proper creation of outlier mask
            numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
            for col in numeric_cols:
                outlier_mask = np.random.random(len(data)) < 0.01
                if outlier_mask.any():
                    # Find outlier indices
                    outlier_indices = data.index[outlier_mask]
                    for idx in outlier_indices:
                        if col in data.columns:
                            # Place the outlier 5 standard deviations from the
                            # column mean, in a random direction.
                            mean_val = data[col].mean(skipna=True)
                            std_val = data[col].std(skipna=True)
                            if not np.isnan(mean_val) and not np.isnan(std_val) and std_val > 0:
                                outlier_value = mean_val + 5 * std_val * np.random.choice([-1, 1])
                                data.at[idx, col] = outlier_value

            # Reset index
            data.reset_index(drop=True, inplace=True)
            st.session_state.synthetic_data_generated = True
            return data
        except Exception as e:
            st.error(f"Error generating synthetic data: {str(e)}")
            import traceback
            st.error(f"Error traceback: {traceback.format_exc()}")
            return None

    def quick_test_pipeline(self):
        """Quick pipeline execution with synthetic data.

        Generates a medium-complexity synthetic dataset, writes it to a temp
        CSV, overrides the session configuration with fast settings, runs the
        full preprocessing pipeline, and jumps to the results step on success.
        """
        with st.spinner("๐Ÿš€ Running quick test with synthetic data..."):
            try:
                # Step 1: Generate synthetic data
                st.info("Step 1: Generating synthetic data...")
                synthetic_data = self.generate_synthetic_data(
                    n_days=365,  # Reduced for speed
                    include_seasonality=True,
                    include_trend=True,
                    include_noise=True,
                    include_exogenous=True,
                    data_type="medium"  # Changed to medium for balance between speed and quality
                )
                if synthetic_data is None:
                    st.error("Failed to generate synthetic data")
                    return

                # Save data to temporary file
                temp_file = "temp_synthetic_data.csv"
                synthetic_data.to_csv(temp_file, index=False)

                # Step 2: Configure settings
                st.info("Step 2: Configuring settings...")
                config_params = st.session_state.config_params.copy()
                # NOTE(review): start_year/end_year are hard-coded to
                # 2020-2023, while generate_synthetic_data dates the series
                # relative to datetime.now() — confirm the pipeline does not
                # filter rows by these years.
                config_params.update({
                    'data_path': temp_file,
                    'target_column': 'raskhodvoda',
                    'start_year': 2020,
                    'end_year': 2023,
                    'max_lags': 7,
                    'seasonal_period': 365,
                    'rolling_windows': [7, 30],
                    'expanding_windows': [30],
                    'test_size': 0.2,
                    'validation_size': 0.1,
                    'scaling_method': 'robust',
                    'feature_selection_method': 'correlation',
                    'max_features': 10,  # Reduced for speed
                    'missing_threshold': 0.3,
                    'outlier_method': 'iqr',
                    'enable_validation': True,
                    'split_method': 'time_based'
                })

                # Step 3: Create and run pipeline
                st.info("Step 3: Creating and running pipeline...")
                # Create progress bar
                progress_bar = st.progress(0)
                status_text = st.empty()
                # Update configuration
                st.session_state.config_params = config_params
                st.session_state.uploaded_file = temp_file
                st.session_state.data_preview = synthetic_data
                # Create configuration
                status_text.text("Creating configuration...")
                progress_bar.progress(20)
                config = Config(**config_params)
                # Create pipeline
                status_text.text("Initialising pipeline...")
                progress_bar.progress(40)
                self.pipeline = EnhancedDataPreprocessingPipeline(config)
                # Run pipeline
                status_text.text("Running preprocessing pipeline...")
                progress_bar.progress(60)
                processed_data = self.pipeline.run_full_pipeline(
                    use_synthetic=False,  # Synthetic data already loaded
                    save_intermediate=True,
                    create_reports=True
                )
                # Update progress
                if processed_data is not None:
                    status_text.text("Getting data for modelling...")
                    progress_bar.progress(80)
                    modeling_data = self.pipeline.get_final_data_for_modelling()
                    # Save to session state
                    st.session_state.processed_data = processed_data
                    st.session_state.modeling_data = modeling_data
                    st.session_state.pipeline_completed = True
                    st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                    st.session_state.auto_pipeline_ready = True
                    # Collect information about available plots
                    self.collect_available_plots()
                    # Completion
                    status_text.text("Completing...")
                    progress_bar.progress(100)
                    st.success("โœ… Quick test completed successfully!")
                    # Show results
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Records generated", f"{synthetic_data.shape[0]:,}")
                    with col2:
                        st.metric("Processed data", f"{processed_data.shape[0]:,} rows")
                    with col3:
                        st.metric("Final features", f"{processed_data.shape[1]} columns")
                    # Automatic transition to results
                    st.session_state.current_step = 5
                    st.rerun()
                else:
                    st.error("โŒ Error running pipeline")
                    st.error("Check logs for more information")
            except Exception as e:
                st.error(f"โŒ Error during quick test: {str(e)}")
                import traceback
                st.error(f"Error traceback: {traceback.format_exc()}")

    def render_step_1_data_loading(self):
        """Step 1: Data Loading — file upload (left) or synthetic demo (right)."""
        st.header("๐Ÿ“ Data Loading")

        # Check quick test mode — short-circuits the whole page and runs the
        # automated pipeline instead.
        if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
            st.info("โšก Quick test mode activated. Generating synthetic data and running pipeline...")
            self.quick_test_pipeline()
            return

        col1, col2 = st.columns([2, 1])

        with col1:
            # File upload
            uploaded_file = st.file_uploader(
                "Upload CSV file with data",
                type=['csv', 'xlsx', 'parquet'],
                help="Supported formats: CSV, Excel, Parquet"
            )

            if uploaded_file is not None:
                # Save file temporarily — extension is taken from the
                # uploaded filename so the reader branch below can dispatch.
                file_path = f"temp_data.{uploaded_file.name.split('.')[-1]}"
                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())

                st.session_state.uploaded_file = file_path
                st.session_state.config_params['data_path'] = file_path

                # Load and preview data
                try:
                    if file_path.endswith('.csv'):
                        data = pd.read_csv(file_path)
                    elif file_path.endswith('.xlsx'):
                        data = pd.read_excel(file_path)
                    elif file_path.endswith('.parquet'):
                        data = pd.read_parquet(file_path)
                    else:
                        st.error("Unsupported file format")
                        return

                    st.session_state.data_preview = data

                    # Data preview
                    st.subheader("Data Preview")
                    st.dataframe(data.head(50), width='stretch')

                    # Basic information
                    st.subheader("๐Ÿ“‹ Data Information")
                    info_col1, info_col2, info_col3 = st.columns(3)
                    with info_col1:
                        st.metric("Rows", data.shape[0])
                        st.metric("Columns", data.shape[1])
                    with info_col2:
                        numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
                        st.metric("Numeric columns", len(numeric_cols))
                        categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
                        st.metric("Categorical columns", len(categorical_cols))
                    with info_col3:
                        total_missing = data.isnull().sum().sum()
                        missing_percentage = (total_missing / (data.shape[0] * data.shape[1])) * 100
                        st.metric("Missing values", f"{total_missing:,}")
                        st.metric("Missing percentage", f"{missing_percentage:.2f}%")

                    # Automatic target column selection if not set
                    if 'target_column' not in st.session_state.config_params or not st.session_state.config_params['target_column']:
                        numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
                        if numeric_columns:
                            # Automatically select column with typical name
                            target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'ั€ะฐัั…ะพะด']
                            selected_target = None
                            for col in numeric_columns:
                                if any(keyword in col.lower() for keyword in target_keywords):
                                    selected_target = col
                                    break
                            # If not found by keywords, take last numeric column
                            if not selected_target and numeric_columns:
                                selected_target = numeric_columns[-1]
                            if selected_target:
                                st.session_state.config_params['target_column'] = selected_target
                                st.info(f"Target variable automatically selected: **{selected_target}**")
                                st.info("You can change it in the next step")

                    # Button to proceed to next step
                    if st.button("โžก๏ธ Go to Configuration", type="primary", width='stretch'):
                        st.session_state.current_step = 2
                        st.rerun()

                except Exception as e:
                    st.error(f"Error loading data: {str(e)}")

        with col2:
            # Demo data
            st.subheader("๐ŸŽฎ Demo Mode")
            demo_option = st.radio(
                "Choose demo data:",
                ["Synthetic Data", "Time Series Example"]
            )

            # Synthetic data settings
            with st.expander("โš™๏ธ Synthetic Data Settings", expanded=False):
                data_type = st.selectbox(
                    "Data Type",
                    options=["Simple", "Medium", "Complex"],
                    index=1,
                    help="Simple: 1 year, few features\nMedium: 2 years, main features\nComplex: 3 years, all features"
                )
                n_days = st.slider(
                    "Number of days",
                    min_value=90,
                    max_value=1825,
                    value=1095,
                    step=30,
                    help="Number of days in synthetic data"
                )
                include_trend = st.checkbox("Include trend", value=True)
                include_seasonality = st.checkbox("Include seasonality", value=True)
                include_noise = st.checkbox("Include noise", value=True)
                include_exogenous = st.checkbox("Include additional features", value=True)

            if st.button("Generate and Load Synthetic Data", width='stretch'):
                with st.spinner("Creating synthetic data..."):
                    try:
                        # Data type mapping (UI label -> generator keyword)
                        data_type_map = {
                            "Simple": "simple",
                            "Medium": "medium",
                            "Complex": "complex"
                        }
                        # Generate synthetic data
                        synthetic_data = self.generate_synthetic_data(
                            n_days=n_days,
                            include_seasonality=include_seasonality,
                            include_trend=include_trend,
                            include_noise=include_noise,
                            include_exogenous=include_exogenous,
                            data_type=data_type_map[data_type]
                        )
                        if synthetic_data is not None:
                            st.session_state.data_preview = synthetic_data
                            # Sentinel value: later steps treat the string
                            # "synthetic_data" as "no file on disk".
                            st.session_state.uploaded_file = "synthetic_data"
                            st.session_state.config_params['data_path'] = 'synthetic_data'
                            # Automatically select target variable
                            if 'raskhodvoda' in synthetic_data.columns:
                                st.session_state.config_params['target_column'] = 'raskhodvoda'
                            st.success(f"โœ… Synthetic data created: {synthetic_data.shape[0]} rows, {synthetic_data.shape[1]} columns")

                            # Show preview
                            st.subheader("Synthetic Data Preview")
                            st.dataframe(synthetic_data.head(20), width='stretch')

                            # Statistics
                            st.subheader("๐Ÿ“Š Synthetic Data Statistics")
                            stat_col1, stat_col2 = st.columns(2)
                            with stat_col1:
                                st.metric("Period", f"{synthetic_data.shape[0]} days")
                                # CORRECTION: convert dates to strings for display
                                if 'date' in synthetic_data.columns:
                                    min_date = synthetic_data['date'].min()
                                    max_date = synthetic_data['date'].max()
                                    if isinstance(min_date, (pd.Timestamp, datetime)):
                                        st.text(f"Start: {min_date.strftime('%Y-%m-%d')}")
                                    else:
                                        st.text(f"Start: {str(min_date)}")
                                    if isinstance(max_date, (pd.Timestamp, datetime)):
                                        st.text(f"End: {max_date.strftime('%Y-%m-%d')}")
                                    else:
                                        st.text(f"End: {str(max_date)}")
                            with stat_col2:
                                if 'raskhodvoda' in synthetic_data.columns:
                                    st.metric("Average consumption", f"{synthetic_data['raskhodvoda'].mean():.2f}")
                                    st.metric("Max consumption", f"{synthetic_data['raskhodvoda'].max():.2f}")
                                    st.metric("Min consumption", f"{synthetic_data['raskhodvoda'].min():.2f}")

                            # Quick pipeline execution
                            st.markdown("---")
                            if st.button("๐Ÿš€ Quick Run Pipeline with This Data", type="primary", width='stretch'):
                                st.session_state.quick_test_mode = True
                                st.session_state.auto_pipeline_ready = False
                                st.rerun()

                            # NOTE(review): this unconditional rerun fires
                            # right after the preview above is rendered,
                            # immediately discarding it (and the nested button
                            # never gets a chance to be clicked) — confirm
                            # whether it is intentional or leftover.
                            st.rerun()
                        else:
                            st.error("Failed to generate synthetic data")
                    except Exception as e:
                        st.error(f"Error creating synthetic data: {str(e)}")

            st.markdown("---")

            # Instructions
            st.subheader("๐Ÿ“– Instructions")
            st.markdown("""
            1. Upload CSV file with data **OR**
            2. Generate synthetic data for testing
            3. Check data preview
            4. Target variable will be selected automatically
            5. Go to configuration to specify parameters

            **Data Requirements:**
            - Date in separate column or index
            - Clean column names
            - Time series with regular intervals
            """)

    def render_step_2_configuration(self):
        """Step 2: Pipeline Configuration — edits ``st.session_state.config_params`` in place."""
        st.header("โš™๏ธ Pipeline Configuration")

        # Automatic configuration for synthetic data
        if st.session_state.uploaded_file == "synthetic_data" or st.session_state.config_params['data_path'] == 'synthetic_data':
            st.info("โšก Synthetic data detected. Optimised configuration applied.")
            # Automatic parameter setup for synthetic data
            if st.button("Apply Recommended Settings for Synthetic Data", width='stretch'):
                st.session_state.config_params.update({
                    'target_column': 'raskhodvoda',
                    'max_lags': 7,
                    'seasonal_period': 365,
                    'rolling_windows': [7, 30, 90],
                    'expanding_windows': [30, 90],
                    'test_size': 0.2,
                    'validation_size': 0.1,
                    'scaling_method': 'robust',
                    'feature_selection_method': 'correlation',
                    'max_features': 15,
                    'missing_threshold': 0.3,
                    'outlier_method': 'iqr',
                    'enable_validation': True
                })
                st.success("Settings applied!")
                st.rerun()

        # Configuration sections
        tab1, tab2, tab3, tab4 = st.tabs([
            "๐Ÿ“Š Basic Parameters",
            "๐Ÿ”ง Data Processing",
            "๐ŸŽฏ Features and Selection",
            "๐Ÿ“ˆ Temporal Parameters"
        ])

        with tab1:
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Basic Parameters")
                st.session_state.config_params['results_dir'] = st.text_input(
                    "Results Directory",
                    value=st.session_state.config_params['results_dir']
                )
                # CORRECTION: replace text_input with selectbox for target variable selection
                if hasattr(st.session_state, 'data_preview') and st.session_state.data_preview is not None:
                    # Get all data columns
                    all_columns = st.session_state.data_preview.columns.tolist()
                    # If target variable already set and present in data, use it
                    current_target = st.session_state.config_params.get('target_column', '')
                    default_index = 0
                    if current_target in all_columns:
                        default_index = all_columns.index(current_target)
                    elif len(all_columns) > 0:
                        # Try to find suitable default column
                        numeric_columns = st.session_state.data_preview.select_dtypes(include=[np.number]).columns.tolist()
                        if numeric_columns:
                            # Look for columns with typical target variable names
                            target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'ั€ะฐัั…ะพะด']
                            for i, col in enumerate(all_columns):
                                if any(keyword in col.lower() for keyword in target_keywords):
                                    default_index = i
                                    break
                            # If not found by keywords, take first numeric column
                            if default_index == 0 and numeric_columns[0] in all_columns:
                                default_index = all_columns.index(numeric_columns[0])
                    st.session_state.config_params['target_column'] = st.selectbox(
                        "Select Target Variable",
                        options=all_columns,
                        index=default_index,
                        help="Select column to be predicted"
                    )
                else:
                    # If data not loaded, keep text field
                    st.session_state.config_params['target_column'] = st.text_input(
                        "Target Variable",
                        value=st.session_state.config_params.get('target_column', ''),
                        help="Enter target column name"
                    )
                st.session_state.config_params['enable_validation'] = st.checkbox(
                    "Enable Data Validation",
                    value=st.session_state.config_params['enable_validation']
                )
            with col2:
                st.subheader("Data Split")
                # Sliders work in whole percent; stored value is a fraction.
                st.session_state.config_params['test_size'] = st.slider(
                    "Test Set Size (%)",
                    min_value=5,
                    max_value=40,
                    value=int(st.session_state.config_params['test_size'] * 100),
                    step=5,
                    format="%d%%"
                ) / 100
                st.session_state.config_params['validation_size'] = st.slider(
                    "Validation Set Size (%)",
                    min_value=5,
                    max_value=30,
                    value=int(st.session_state.config_params['validation_size'] * 100),
                    step=5,
                    format="%d%%"
                ) / 100
                split_methods = ['time_based', 'random']
                st.session_state.config_params['split_method'] = st.selectbox(
                    "Split Method",
                    options=split_methods,
                    index=split_methods.index(st.session_state.config_params['split_method'])
                )

        with tab2:
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Missing Value Processing")
                st.session_state.config_params['missing_threshold'] = st.slider(
                    "Missing Value Column Removal Threshold",
                    min_value=0.0,
                    max_value=0.5,
                    value=st.session_state.config_params['missing_threshold'],
                    step=0.05,
                    format="%.2f"
                )
                st.subheader("Outlier Processing")
                outlier_methods = ['iqr', 'zscore', 'isolation_forest']
                st.session_state.config_params['outlier_method'] = st.selectbox(
                    "Outlier Detection Method",
                    options=outlier_methods,
                    index=outlier_methods.index(st.session_state.config_params['outlier_method'])
                )
            with col2:
                st.subheader("Data Scaling")
                scaling_methods = ['robust', 'standard', 'minmax', 'none']
                st.session_state.config_params['scaling_method'] = st.selectbox(
                    "Scaling Method",
                    options=scaling_methods,
                    index=scaling_methods.index(st.session_state.config_params['scaling_method'])
                )
                if st.session_state.config_params['scaling_method'] == 'none':
                    st.info("โš ๏ธ Data will not be scaled")

        with tab3:
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Feature Engineering")
                st.session_state.config_params['max_lags'] = st.slider(
                    "Maximum Number of Lags",
                    min_value=1,
                    max_value=20,
                    value=st.session_state.config_params['max_lags'],
                    step=1
                )
                rolling_windows_input = st.text_input(
                    "Windows for Rolling Statistics (comma-separated)",
                    value=', '.join(map(str, st.session_state.config_params['rolling_windows']))
                )
                if rolling_windows_input:
                    # Non-numeric tokens are silently dropped.
                    st.session_state.config_params['rolling_windows'] = [
                        int(x.strip()) for x in rolling_windows_input.split(',') if x.strip().isdigit()
                    ]
            with col2:
                st.subheader("Feature Selection")
                feature_methods = ['correlation', 'variance', 'mutual_info', 'rf', 'none']
                st.session_state.config_params['feature_selection_method'] = st.selectbox(
                    "Feature Selection Method",
                    options=feature_methods,
                    index=feature_methods.index(st.session_state.config_params['feature_selection_method'])
                )
                st.session_state.config_params['max_features'] = st.slider(
                    "Maximum Number of Features",
                    min_value=5,
                    max_value=100,
                    value=st.session_state.config_params['max_features'],
                    step=5
                )

        with tab4:
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Temporal Parameters")
                # If there is data for preview, show date range
                if hasattr(st.session_state, 'data_preview'):
                    if 'date' in st.session_state.data_preview.columns:
                        date_col = 'date'
                    elif isinstance(st.session_state.data_preview.index, pd.DatetimeIndex):
                        dates = st.session_state.data_preview.index
                    else:
                        # Try to find date column
                        date_cols = [col for col in st.session_state.data_preview.columns
                                     if 'date' in col.lower() or 'time' in col.lower()]
                        date_col = date_cols[0] if date_cols else None
                    # NOTE(review): in the DatetimeIndex branch above,
                    # date_col is never assigned, so `if date_col:` below
                    # raises NameError for data with a DatetimeIndex and no
                    # 'date' column — needs a fix (e.g. date_col = None there).
                    if date_col:
                        if date_col in st.session_state.data_preview.columns:
                            dates = pd.to_datetime(st.session_state.data_preview[date_col])
                        else:
                            dates = st.session_state.data_preview.index
                        if len(dates) > 0:
                            min_date = dates.min()
                            max_date = dates.max()
                            col1_date, col2_date = st.columns(2)
                            with col1_date:
                                st.session_state.config_params['start_year'] = st.number_input(
                                    "Start Year",
                                    min_value=1900,
                                    max_value=2100,
                                    value=min_date.year,
                                    step=1
                                )
                            with col2_date:
                                st.session_state.config_params['end_year'] = st.number_input(
                                    "End Year",
                                    min_value=1900,
                                    max_value=2100,
                                    value=max_date.year,
                                    step=1
                                )
            with col2:
                st.subheader("Seasonality")
                st.session_state.config_params['seasonal_period'] = st.selectbox(
                    "Seasonal Period",
                    options=[7, 30, 90, 365, 12, 24],
                    index=[7, 30, 90, 365, 12, 24].index(
                        st.session_state.config_params['seasonal_period']
                    ) if st.session_state.config_params['seasonal_period'] in [7, 30, 90, 365, 12, 24] else 0
                )
                expanding_windows_input = st.text_input(
                    "Windows for Expanding Statistics (comma-separated)",
                    value=', '.join(map(str, st.session_state.config_params['expanding_windows']))
                )
                if expanding_windows_input:
                    st.session_state.config_params['expanding_windows'] = [
                        int(x.strip()) for x in expanding_windows_input.split(',') if x.strip().isdigit()
                    ]

        # Navigation buttons
        col1, col2, col3 = st.columns([1, 1, 1])
        with col1:
            if st.button("โฌ…๏ธ Back to Loading", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        with col3:
            if st.button("Go to Analysis โžก๏ธ", type="primary", width='stretch'):
                st.session_state.current_step = 3
                st.rerun()

    def render_step_3_data_analysis(self):
        """Step 3: Data Analysis — exploratory views over the preview DataFrame."""
        st.header("๐Ÿ” Data Analysis")

        if not hasattr(st.session_state, 'data_preview') or st.session_state.data_preview is None:
            st.warning("First load data in Step 1")
            if st.button("Return to Data Loading"):
                st.session_state.current_step = 1
                st.rerun()
            return

        data = st.session_state.data_preview

        # Analysis tabs
        tab1, tab2, tab3, tab4 = st.tabs([
            "๐Ÿ“ˆ Statistics",
            "๐Ÿ” Distributions",
            "๐Ÿ“… Temporal Analysis",
            "โ“ Missing Values and Outliers"
        ])

        with tab1:
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Basic Statistics")
                st.dataframe(data.describe().round(2), width='stretch')
            with col2:
                st.subheader("Data Types")
                dtype_info = pd.DataFrame({
                    'Column': data.columns,
                    'Type': data.dtypes.values,
                    'Unique Values': [data[col].nunique() for col in data.columns]
                })
                st.dataframe(dtype_info, width='stretch')

        with tab2:
            # Select column for visualisation.
            # numeric_cols is reused by tab3 and tab4 below (same function scope).
            numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
            if numeric_cols:
                selected_col = st.selectbox(
                    "Select Column for Analysis",
                    options=numeric_cols
                )
                col1, col2 = st.columns(2)
                with col1:
                    # Histogram
                    fig = px.histogram(
                        data,
                        x=selected_col,
                        title=f"Distribution of {selected_col}",
                        nbins=50,
                        color_discrete_sequence=['#636EFA']
                    )
                    st.plotly_chart(fig, width='stretch')
                with col2:
                    # Box plot
                    fig = go.Figure()
                    fig.add_trace(go.Box(
                        y=data[selected_col],
                        name=selected_col,
                        boxpoints='outliers',
                        marker_color='#EF553B'
                    ))
                    fig.update_layout(
                        title=f"Box plot {selected_col}",
                        yaxis_title=selected_col
                    )
                    st.plotly_chart(fig, width='stretch')
            else:
                st.warning("No numeric columns for distribution analysis")

        with tab3:
            # Time series analysis
            date_cols = [col for col in data.columns if 'date' in col.lower()]
            if date_cols or isinstance(data.index, pd.DatetimeIndex):
                if date_cols:
                    date_col = date_cols[0]
                    dates = pd.to_datetime(data[date_col])
                else:
                    dates = data.index
                    date_col = 'index'
                # Check for numeric columns
                if len(numeric_cols) > 0:
                    # Select column for time series
                    ts_col = st.selectbox(
                        "Select Column for Time Series",
                        options=numeric_cols
                    )
                    # Time series
                    fig = go.Figure()
                    fig.add_trace(go.Scatter(
                        x=dates,
                        y=data[ts_col],
                        mode='lines',
                        name=ts_col,
                        line=dict(color='#636EFA', width=2)
                    ))
                    fig.update_layout(
                        title=f"Time Series: {ts_col}",
                        xaxis_title="Date",
                        yaxis_title=ts_col,
                        hovermode='x unified'
                    )
                    st.plotly_chart(fig, width='stretch')
                    # Seasonality (if sufficient data)
                    if len(dates) > 30:
                        # Monthly trend
                        # NOTE(review): the guard checks dates.month (a
                        # DatetimeIndex attribute) but the body uses
                        # dates.dt.month (a Series accessor) — for a
                        # DatetimeIndex the guard passes and the body raises;
                        # for a Series the guard fails and the chart is
                        # skipped. Likely needs unifying on one type.
                        if hasattr(dates, 'month'):
                            monthly_data = data.groupby(dates.dt.month)[ts_col].mean()
                            fig2 = px.bar(
                                x=monthly_data.index,
                                y=monthly_data.values,
                                title=f"Monthly Seasonality: {ts_col}",
                                labels={'x': 'Month', 'y': 'Average Value'}
                            )
                            st.plotly_chart(fig2, width='stretch')
                else:
                    st.warning("No numeric columns for temporal analysis")
            else:
                st.info("For temporal analysis, date column or DatetimeIndex required")

        with tab4:
            col1, col2 = st.columns(2)
            with col1:
                # Missing value analysis
                st.subheader("Missing Values")
                missing_data = data.isnull().sum()
                missing_percentage = (missing_data / len(data)) * 100
                missing_df = pd.DataFrame({
                    'Column': missing_data.index,
                    'Missing Count': missing_data.values,
                    'Missing Percentage': missing_percentage.values
                }).sort_values('Missing Count', ascending=False)
                st.dataframe(missing_df, width='stretch')
                # Missing values visualisation
                if missing_data.sum() > 0:
                    fig = px.bar(
                        missing_df,
                        x='Column',
                        y='Missing Percentage',
                        title="Missing Percentage by Column",
                        color='Missing Percentage',
                        color_continuous_scale='Reds'
                    )
                    st.plotly_chart(fig, width='stretch')
            with col2:
                # Quick outlier analysis (1.5*IQR rule)
                st.subheader("Quick Outlier Analysis")
                if len(numeric_cols) > 0:
                    outlier_summary = []
                    for col in numeric_cols[:5]:  # Limit to 5 columns for speed
                        q1 = data[col].quantile(0.25)
                        q3 = data[col].quantile(0.75)
                        iqr = q3 - q1
                        lower_bound = q1 - 1.5 * iqr
                        upper_bound = q3 + 1.5 * iqr
                        outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
                        outlier_pct = (len(outliers) / len(data)) * 100
                        outlier_summary.append({
                            'Column': col,
                            'Outliers': len(outliers),
                            'Percentage': f"{outlier_pct:.2f}%"
                        })
                    outlier_df = pd.DataFrame(outlier_summary)
                    st.dataframe(outlier_df, width='stretch')
                else:
                    st.warning("No numeric columns for outlier analysis")

        # Navigation buttons
        col1, col2, col3 = st.columns([1, 1, 1])
        with col1:
            if st.button("โฌ…๏ธ Back to Configuration", width='stretch'):
                st.session_state.current_step = 2
                st.rerun()
        with col3:
            if st.button("Run Pipeline โžก๏ธ", type="primary", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()

    def render_step_4_pipeline_execution(self):
        """Step 4: Pipeline Execution — validates prerequisites, then runs the pipeline."""
        st.header("โšก Pipeline Execution")

        # Readiness check
        # NOTE(review): ready_to_run is set but never read afterwards — the
        # `issues` list alone gates execution below.
        ready_to_run = True
        issues = []
        if not st.session_state.uploaded_file and st.session_state.config_params['data_path'] != 'demo' and st.session_state.config_params['data_path'] != 'synthetic_data':
            issues.append("Data not loaded")
            ready_to_run = False
        if not st.session_state.config_params['target_column']:
            issues.append("Target variable not selected")
            ready_to_run = False

        # Automatic synthetic data generation if quick test enabled
        if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
            st.info("โšก Quick test mode activated. Generating synthetic data...")
            self.quick_test_pipeline()
            return

        # Display warnings
        if issues:
            st.error("โš ๏ธ Fix before running:")
            for issue in issues:
                st.write(f"- {issue}")
            # Suggest using synthetic data
            st.markdown("---")
            st.subheader("๐ŸŽฎ Quick Solution")
            col1, col2 = st.columns(2)
            with col1:
                if st.button("Generate Synthetic Data", width='stretch'):
                    st.session_state.current_step = 1
                    st.rerun()
            with col2:
                if st.button("To Data Loading", width='stretch'):
                    st.session_state.current_step = 1
                    st.rerun()
            col3, col4 = st.columns(2)
            with col3:
                if st.button("To Configuration", width='stretch'):
                    st.session_state.current_step = 2
                    st.rerun()
            return

        # Display configuration
        st.subheader("Execution Configuration")
        config_col1, config_col2 = st.columns(2)
        with config_col1:
            st.metric("Target Variable", st.session_state.config_params['target_column'])
            st.metric("Test Set", f"{st.session_state.config_params['test_size']*100:.0f}%")
            st.metric("Scaling Method", st.session_state.config_params['scaling_method'])
        with config_col2:
            st.metric("Max Lags", st.session_state.config_params['max_lags'])
            st.metric("Feature Selection Method", st.session_state.config_params['feature_selection_method'])
            st.metric("Validation Enabled", "Yes" if st.session_state.config_params['enable_validation'] else "No")

        # Execution options
        st.subheader("Execution Options")
        col1, col2 = st.columns(2)
        with col1:
            use_synthetic = st.checkbox(
                "Use Synthetic Data",
                value=(st.session_state.config_params['data_path'] == 'demo' or st.session_state.config_params['data_path'] == 'synthetic_data'),
                disabled=(st.session_state.config_params['data_path'] == 'demo' or st.session_state.config_params['data_path'] == 'synthetic_data')
            )
            save_intermediate = st.checkbox(
                "Save Intermediate Results",
                value=True
            )
        with col2:
            create_reports = st.checkbox(
                "Create Reports",
                value=True
            )
            # NOTE(review): create_visualisations is collected from the UI
            # but not passed to run_full_pipeline below — confirm whether the
            # pipeline controls plotting elsewhere.
            create_visualisations = st.checkbox(
                "Create Visualisations",
                value=True,
                help="Create data analysis plots"
            )

        # Run button
        if st.button("๐Ÿš€ Run Preprocessing Pipeline", type="primary", width='stretch'):
            # Create progress bar
            progress_bar = st.progress(0)
            status_text = st.empty()
            try:
                # Create configuration
                status_text.text("Creating configuration...")
                progress_bar.progress(10)
                config = Config(**st.session_state.config_params)
                # Create pipeline
                status_text.text("Initialising pipeline...")
                progress_bar.progress(20)
                self.pipeline = EnhancedDataPreprocessingPipeline(config)
                # Determine whether to use synthetic data
                use_synthetic_flag = (use_synthetic or
                                      st.session_state.config_params['data_path'] == 'demo' or
                                      st.session_state.config_params['data_path'] == 'synthetic_data')
                # Run pipeline
                status_text.text("Running preprocessing pipeline...")
                progress_bar.progress(30)
                processed_data = self.pipeline.run_full_pipeline(
                    use_synthetic=use_synthetic_flag,
                    save_intermediate=save_intermediate,
                    create_reports=create_reports
                )
                # Update progress
                if processed_data is not None:
                    status_text.text("Getting data for modelling...")
                    progress_bar.progress(80)
                    modeling_data = self.pipeline.get_final_data_for_modelling()
                    # Save to session state
                    st.session_state.processed_data = processed_data
                    st.session_state.modeling_data = modeling_data
                    st.session_state.pipeline_completed = True
                    st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                    # Collect information about available plots
                    self.collect_available_plots()
                    # Completion
                    status_text.text("Completing...")
                    progress_bar.progress(100)
                    st.success("โœ… Pipeline completed successfully!")
                    # Show results
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        if hasattr(self.pipeline, 'results') and 'data_loading' in self.pipeline.results:
                            st.metric("Original Data", f"{self.pipeline.results['data_loading']['shape'][0]:,} rows")
                        else:
                            st.metric("Original Data", "Information unavailable")
                    with col2:
                        st.metric("Processed Data", f"{processed_data.shape[0]:,} rows")
                    with col3:
                        st.metric("Final Features", f"{processed_data.shape[1]} columns")
                    # Button to proceed to results
                    # NOTE(review): a button nested inside another button's
                    # branch is unreachable across Streamlit reruns (the outer
                    # branch is False on the next run) — users likely use the
                    # sidebar navigation instead.
                    if st.button("๐Ÿ“Š Go to Results", type="primary", width='stretch'):
                        st.session_state.current_step = 5
                        st.rerun()
                else:
                    st.error("โŒ Error executing pipeline")
                    st.error("Check logs for more information")
            except Exception as e:
                progress_bar.progress(0)
                status_text.text("")
                st.error(f"โŒ Error: {str(e)}")
                st.exception(e)

        # Back button
        if st.button("โฌ…๏ธ Back to Analysis", width='stretch'):
            st.session_state.current_step = 3
            st.rerun()

    def collect_available_plots(self):
        """Collect information about available plots.

        Scans ``st.session_state.plots_path`` for known plot files grouped by
        category. (Definition continues beyond this chunk.)
        """
        if not st.session_state.plots_path or not os.path.exists(st.session_state.plots_path):
            st.session_state.available_plots = {}
            return
        # Expected plot files per category; '*' entries are glob patterns.
        plots_categories = {
            'summary': ['summary_dashboard.png'],
            'missing_values': ['missing_values_analysis.png'],
            'outliers': ['outliers_analysis.png', 'outlier_handling_results.png', 'temporal_outliers.png'],
            'stationarity': ['stationarity_*.png'],
            'data_split': ['data_split.png'],
            'scaling': ['scaling_results.png'],
            'feature_selection': ['feature_selection_*.png'],
            'correlations': ['correlation_matrix.png', 'high_correlations.png', 'target_correlations.png', 'vif_scores.png']
        }
available_plots = {} for category, patterns in plots_categories.items(): category_plots = [] # Search for files for each pattern for pattern in patterns: # For general patterns if '*' in pattern: search_path = os.path.join(st.session_state.plots_path, pattern) files = glob.glob(search_path) # Also search in subfolders for root, dirs, filenames in os.walk(st.session_state.plots_path): for filename in filenames: if pattern.replace('*', '') in filename and filename.endswith('.png'): full_path = os.path.join(root, filename) if full_path not in files: files.append(full_path) else: # For specific file names file_path = os.path.join(st.session_state.plots_path, pattern) # Check in main folder if os.path.exists(file_path): files = [file_path] else: # Check in subfolders files = [] for root, dirs, filenames in os.walk(st.session_state.plots_path): for filename in filenames: if filename == pattern: files.append(os.path.join(root, filename)) for file in files: if os.path.exists(file): # Get relative path for display rel_path = os.path.relpath(file, st.session_state.plots_path) category_plots.append({ 'path': file, 'name': os.path.basename(file), 'rel_path': rel_path, 'size': os.path.getsize(file) }) if category_plots: available_plots[category] = category_plots # Also add all found PNG files in general folder all_png_files = [] for root, dirs, filenames in os.walk(st.session_state.plots_path): for filename in filenames: if filename.endswith('.png'): file_path = os.path.join(root, filename) # Check if this file already added already_added = False for category_plots in available_plots.values(): for plot in category_plots: if plot['path'] == file_path: already_added = True break if not already_added: rel_path = os.path.relpath(file_path, st.session_state.plots_path) all_png_files.append({ 'path': file_path, 'name': filename, 'rel_path': rel_path, 'size': os.path.getsize(file_path) }) if all_png_files: available_plots['other'] = all_png_files st.session_state.available_plots = 
    def render_step_5_results(self) -> None:
        """Step 5: present the pipeline results.

        Renders four tabs over ``st.session_state.processed_data`` and
        ``st.session_state.modeling_data``: a data overview, feature
        analysis, validation results, and export/download options. Shows a
        quick-start prompt instead when the pipeline has not completed.
        """
        st.header("📊 Pipeline Results")
        # Guard: nothing to show until the pipeline has completed.
        if not st.session_state.pipeline_completed or st.session_state.processed_data is None:
            st.warning("Pipeline not yet run or not completed successfully")
            # Suggest using quick test
            st.markdown("---")
            st.subheader("🎮 Quick Start")
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                    st.session_state.quick_test_mode = True
                    st.session_state.current_step = 1
                    st.rerun()
            with col2:
                if st.button("Load Data", width='stretch'):
                    st.session_state.current_step = 1
                    st.rerun()
            return

        data = st.session_state.processed_data
        modeling_data = st.session_state.modeling_data

        # Results tabs
        tab1, tab2, tab3, tab4 = st.tabs([
            "📈 Data Overview",
            "📊 Feature Analysis",
            "📉 Validation",
            "💾 Export"
        ])

        with tab1:
            st.subheader("Processed Data")
            # Basic information
            info_col1, info_col2, info_col3, info_col4 = st.columns(4)
            with info_col1:
                st.metric("Total Records", f"{data.shape[0]:,}")
            with info_col2:
                st.metric("Total Features", data.shape[1])
            with info_col3:
                numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
                st.metric("Numeric Features", len(numeric_cols))
            with info_col4:
                missing_total = data.isnull().sum().sum()
                st.metric("Missing Values", missing_total)
            # Data preview (first 100 rows only, for render speed)
            st.subheader("Data Preview")
            st.dataframe(data.head(100), width='stretch')
            # Statistics
            st.subheader("Processed Data Statistics")
            st.dataframe(data.describe().round(4), width='stretch')

        with tab2:
            st.subheader("Feature Analysis")
            if modeling_data and 'feature_names' in modeling_data:
                features = modeling_data['feature_names']
                # Feature list
                st.write(f"**Selected Features:** {len(features)}")
                # Display features as cards, 4 per row
                cols_per_row = 4
                for i in range(0, len(features), cols_per_row):
                    cols = st.columns(cols_per_row)
                    for j in range(cols_per_row):
                        idx = i + j
                        if idx < len(features):
                            with cols[j]:
                                st.info(features[idx])
                # Feature importance (if available)
                if (self.pipeline is not None and
                        hasattr(self.pipeline, 'feature_selector') and
                        self.pipeline.feature_selector is not None):
                    # Check for feature_importances_
                    if hasattr(self.pipeline.feature_selector, 'feature_importances_'):
                        importances = self.pipeline.feature_selector.feature_importances_
                        if importances is not None and len(importances) > 0:
                            # Truncate whichever side is longer so the two
                            # columns line up even if lengths disagree.
                            importance_df = pd.DataFrame({
                                'Feature': features[:len(importances)] if len(features) >= len(importances) else features,
                                'Importance': importances[:len(features)] if len(importances) >= len(features) else importances
                            }).sort_values('Importance', ascending=False)
                            st.subheader("Feature Importance")
                            fig = px.bar(
                                importance_df.head(20),
                                x='Importance',
                                y='Feature',
                                orientation='h',
                                title="Top-20 Features by Importance",
                                color='Importance',
                                color_continuous_scale='Viridis'
                            )
                            st.plotly_chart(fig, width='stretch')
            # Correlation matrix (limited for performance)
            if data.shape[1] <= 50:  # Performance limit
                st.subheader("Correlation Matrix (first 20 features)")
                # Select only numeric columns and limit quantity
                numeric_data = data.select_dtypes(include=[np.number])
                if len(numeric_data.columns) > 20:
                    numeric_data = numeric_data.iloc[:, :20]
                if not numeric_data.empty and len(numeric_data.columns) > 1:
                    corr_matrix = numeric_data.corr()
                    fig = go.Figure(data=go.Heatmap(
                        z=corr_matrix.values,
                        x=corr_matrix.columns,
                        y=corr_matrix.columns,
                        colorscale='RdBu',
                        zmin=-1, zmax=1,
                        text=corr_matrix.round(2).values,
                        texttemplate='%{text}',
                        textfont={"size": 10}
                    ))
                    fig.update_layout(
                        title="Correlation Matrix",
                        width=800,
                        height=800
                    )
                    st.plotly_chart(fig, width='stretch')
                else:
                    st.info("Insufficient data for correlation matrix")

        with tab3:
            st.subheader("Validation Results")
            # Improved validation result availability check: the pipeline may
            # expose validation output under several different names/keys.
            validation_available = False
            validation_data = None
            if self.pipeline is not None:
                # Check for results in pipeline
                if hasattr(self.pipeline, 'results'):
                    # Look for validation results under different keys
                    validation_keys = ['final_validation', 'validation_results',
                                       'validation', 'validation_checks']
                    for key in validation_keys:
                        if key in self.pipeline.results:
                            validation_data = self.pipeline.results[key]
                            validation_available = True
                            break
                # If not found in results, check other attributes
                if not validation_available and hasattr(self.pipeline, 'validation_report'):
                    validation_data = self.pipeline.validation_report
                    validation_available = True
                # Or check processing results
                if not validation_available and hasattr(self.pipeline, 'get_validation_summary'):
                    try:
                        validation_data = self.pipeline.get_validation_summary()
                        validation_available = True
                    except:  # NOTE(review): bare except swallows all errors here
                        pass
            # If validation results available
            if validation_available and validation_data:
                st.success("✅ Validation results available")
                # Check validation data format
                if isinstance(validation_data, dict):
                    # Display as dictionary
                    col1, col2 = st.columns(2)
                    with col1:
                        # Status
                        status = validation_data.get('status', 'UNKNOWN')
                        if status == 'PASS':
                            st.success(f"Status: {status}")
                        elif status == 'WARNING':
                            st.warning(f"Status: {status}")
                        else:
                            st.error(f"Status: {status}")
                        # Overall score
                        score = validation_data.get('overall_score', validation_data.get('score', 0))
                        if score:
                            st.metric("Overall Score", f"{score}/100")
                    with col2:
                        # Check counters
                        if 'checks' in validation_data:
                            checks = validation_data['checks']
                        elif 'basic_checks' in validation_data:
                            checks = validation_data['basic_checks']
                        else:
                            checks = validation_data
                        if isinstance(checks, dict):
                            passed = sum(1 for check in checks.values()
                                         if isinstance(check, dict) and check.get('passed', False))
                            total = len(checks)
                            st.metric("Checks Passed", f"{passed}/{total}")
                    # Check details
                    st.subheader("Check Details")
                    # Determine where checks are located
                    checks_to_display = None
                    if 'checks' in validation_data:
                        checks_to_display = validation_data['checks']
                    elif 'basic_checks' in validation_data:
                        checks_to_display = validation_data['basic_checks']
                    elif any(isinstance(v, dict) and 'passed' in v for v in validation_data.values()):
                        checks_to_display = validation_data
                    if checks_to_display and isinstance(checks_to_display, dict):
                        for check_name, check_info in checks_to_display.items():
                            if isinstance(check_info, dict):
                                col1, col2, col3 = st.columns([3, 1, 3])
                                with col1:
                                    # Check description
                                    description = check_info.get('description', check_name)
                                    st.write(f"**{description}**")
                                with col2:
                                    # Status
                                    if check_info.get('passed', False):
                                        st.success("✅")
                                    else:
                                        st.error("❌")
                                with col3:
                                    # Message
                                    if 'message' in check_info:
                                        st.caption(check_info['message'])
                            else:
                                # Simple format
                                st.write(f"**{check_name}**: {check_info}")
                    else:
                        # Display all validation data
                        st.json(validation_data)
                else:
                    # If not dictionary, display as is
                    st.write("Validation results:")
                    st.write(validation_data)
            else:
                # If no validation results, show pipeline information
                st.info("Validation results in report format not available, but pipeline execution statistics presented below")
                # Pipeline stage statistics
                st.subheader("Pipeline Execution Statistics")
                # Create stage table
                stages = [
                    ("Data Loading", "✅ Successful" if data is not None else "❌ Error"),
                    ("Missing Value Processing", "✅ Completed"),
                    ("Outlier Processing", "✅ Completed"),
                    ("Feature Engineering", "✅ Completed"),
                    ("Scaling", "✅ Completed"),
                    ("Feature Selection", "✅ Completed"),
                    ("Data Split", "✅ Completed" if modeling_data else "❌ Not completed")
                ]
                for stage_name, status in stages:
                    col1, col2 = st.columns([3, 1])
                    with col1:
                        st.write(f"**{stage_name}**")
                    with col2:
                        if "✅" in status:
                            st.success(status)
                        else:
                            st.error(status)
                # If pipeline exists, show available metrics
                if self.pipeline is not None:
                    # Check for various metrics
                    st.subheader("Data Quality Metrics")
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        # Data quality
                        if data is not None:
                            missing_pct = (data.isnull().sum().sum() / (data.shape[0] * data.shape[1])) * 100
                            st.metric("Missing Values", f"{missing_pct:.2f}%")
                    with col2:
                        # Feature information
                        if data is not None:
                            numeric_cols = len(data.select_dtypes(include=[np.number]).columns)
                            st.metric("Numeric Features", numeric_cols)
                    with col3:
                        # Split information: training share of the total split
                        if modeling_data and 'X_train' in modeling_data:
                            train_size = len(modeling_data['X_train'])
                            total_size = train_size
                            if 'X_test' in modeling_data:
                                total_size += len(modeling_data['X_test'])
                            if 'X_val' in modeling_data:
                                total_size += len(modeling_data['X_val'])
                            if total_size > 0:
                                train_pct = (train_size / total_size) * 100
                                st.metric("Training Set", f"{train_pct:.1f}%")

        with tab4:
            st.subheader("Data Export")
            # Export formats
            export_format = st.radio(
                "Export Format",
                options=['CSV', 'Parquet', 'Excel'],
                horizontal=True
            )
            # Export buttons
            if data is not None:
                # Export processed data
                st.write("**Processed Data**")
                if export_format == 'CSV':
                    csv = data.to_csv(index=True)
                    st.download_button(
                        label="📥 Download CSV",
                        data=csv,
                        file_name="streamlit_processed_data.csv",
                        mime="text/csv",
                        width='stretch'
                    )
                elif export_format == 'Parquet':
                    # For Parquet need to serialise to an in-memory buffer
                    import io
                    buffer = io.BytesIO()
                    data.to_parquet(buffer)
                    buffer.seek(0)
                    st.download_button(
                        label="📥 Download Parquet",
                        data=buffer,
                        file_name="streamlit_processed_data.parquet",
                        mime="application/octet-stream",
                        width='stretch'
                    )
                elif export_format == 'Excel':
                    import io
                    buffer = io.BytesIO()
                    with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                        data.to_excel(writer, sheet_name='Processed_Data')
                    buffer.seek(0)
                    st.download_button(
                        label="📥 Download Excel",
                        data=buffer,
                        file_name="streamlit_processed_data.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        width='stretch'
                    )
            # Export modeling data (train/val/test CSVs with target appended)
            if modeling_data:
                st.write("**Modeling Data**")
                col1, col2, col3 = st.columns(3)
                with col1:
                    if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
                        train_df = pd.concat([
                            modeling_data['X_train'],
                            modeling_data['y_train'].rename('target')
                        ], axis=1) if 'y_train' in modeling_data else modeling_data['X_train']
                        st.download_button(
                            label="📥 Training Set",
                            data=train_df.to_csv(),
                            file_name="train_data.csv",
                            mime="text/csv",
                            width='stretch'
                        )
                with col2:
                    if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
                        val_df = pd.concat([
                            modeling_data['X_val'],
                            modeling_data['y_val'].rename('target')
                        ], axis=1) if 'y_val' in modeling_data else modeling_data['X_val']
                        st.download_button(
                            label="📥 Validation Set",
                            data=val_df.to_csv(),
                            file_name="validation_data.csv",
                            mime="text/csv",
                            width='stretch'
                        )
                with col3:
                    if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
                        test_df = pd.concat([
                            modeling_data['X_test'],
                            modeling_data['y_test'].rename('target')
                        ], axis=1) if 'y_test' in modeling_data else modeling_data['X_test']
                        st.download_button(
                            label="📥 Test Set",
                            data=test_df.to_csv(),
                            file_name="test_data.csv",
                            mime="text/csv",
                            width='stretch'
                        )

        # Navigation
        st.markdown("---")
        col1, col2, col3 = st.columns([1, 1, 1])
        with col1:
            if st.button("⬅️ Back to Pipeline", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()
        with col3:
            if st.button("Go to Visualisations ➡️", type="primary", width='stretch'):
                st.session_state.current_step = 6
                st.rerun()
st.download_button( label="๐Ÿ“ฅ Training Set", data=train_df.to_csv(), file_name="train_data.csv", mime="text/csv", width='stretch' ) with col2: if 'X_val' in modeling_data and modeling_data['X_val'] is not None: val_df = pd.concat([ modeling_data['X_val'], modeling_data['y_val'].rename('target') ], axis=1) if 'y_val' in modeling_data else modeling_data['X_val'] st.download_button( label="๐Ÿ“ฅ Validation Set", data=val_df.to_csv(), file_name="validation_data.csv", mime="text/csv", width='stretch' ) with col3: if 'X_test' in modeling_data and modeling_data['X_test'] is not None: test_df = pd.concat([ modeling_data['X_test'], modeling_data['y_test'].rename('target') ], axis=1) if 'y_test' in modeling_data else modeling_data['X_test'] st.download_button( label="๐Ÿ“ฅ Test Set", data=test_df.to_csv(), file_name="test_data.csv", mime="text/csv", width='stretch' ) # Navigation st.markdown("---") col1, col2, col3 = st.columns([1, 1, 1]) with col1: if st.button("โฌ…๏ธ Back to Pipeline", width='stretch'): st.session_state.current_step = 4 st.rerun() with col3: if st.button("Go to Visualisations โžก๏ธ", type="primary", width='stretch'): st.session_state.current_step = 6 st.rerun() def render_step_6_visualisations(self): """Step 6: Visualisations""" st.header("๐Ÿ“ˆ Pipeline Visualisations") if not st.session_state.pipeline_completed: st.warning("First run pipeline in Step 4") # Suggest quick test st.markdown("---") st.subheader("๐ŸŽฎ Quick Test") col1, col2 = st.columns(2) with col1: if st.button("๐Ÿš€ Run Quick Test", type="primary", width='stretch'): st.session_state.quick_test_mode = True st.session_state.current_step = 1 st.rerun() with col2: if st.button("Run Pipeline", width='stretch'): st.session_state.current_step = 4 st.rerun() return # Check for plots if not st.session_state.available_plots: st.warning("Plots not found. 
    def display_category_plots(self, category) -> None:
        """Render every plot of one category, each inside its own expander.

        Args:
            category: Key into ``st.session_state.available_plots``.
        """
        plots = st.session_state.available_plots.get(category, [])
        if not plots:
            st.info(f"No plots in category '{category}'")
            return
        st.subheader(f"{category.capitalize().replace('_', ' ')} ({len(plots)} plots)")
        # Sort plots by name for a stable display order
        plots_sorted = sorted(plots, key=lambda x: x['name'])
        # Display plots in accordions for convenience
        for plot_info in plots_sorted:
            with st.expander(f"📊 {plot_info['name'].replace('_', ' ').replace('.png', '')}", expanded=True):
                self.display_plot_image(plot_info)

    def display_plot_card(self, plot_info, category) -> None:
        """Render a compact plot card with a session-state-backed zoom toggle.

        Args:
            plot_info: Dict with 'path', 'name', 'rel_path' and 'size' keys
                (as produced by collect_available_plots).
            category: Category label shown in the card caption.
        """
        try:
            # Load image
            image = Image.open(plot_info['path'])
            # Derive a widget/session key that is unique per file path
            safe_key = plot_info['path'].replace('/', '_').replace('\\', '_').replace('.', '_')
            # Initialise zoom-toggle state for this plot if not present yet
            if f"show_{safe_key}" not in st.session_state:
                st.session_state[f"show_{safe_key}"] = False
            # Create card
            with st.container():
                st.markdown(f"**{plot_info['name'].replace('_', ' ').replace('.png', '')}**")
                st.image(image, width='stretch', caption=plot_info['rel_path'])
                # File information
                size_kb = plot_info['size'] / 1024
                st.caption(f"Size: {size_kb:.1f} KB | Category: {category}")
                # Zoom control buttons
                col1, col2 = st.columns(2)
                with col1:
                    # Zoom button — only flips the flag; the widget click
                    # already causes a rerun, so no explicit st.rerun() here
                    if st.button("🔍 Zoom", key=f"zoom_{safe_key}", width='stretch'):
                        st.session_state[f"show_{safe_key}"] = True
                        # Don't use st.rerun() here
                with col2:
                    # Hide zoomed image button (only rendered while zoomed)
                    if st.session_state[f"show_{safe_key}"]:
                        if st.button("✕ Hide", key=f"hide_{safe_key}", width='stretch'):
                            st.session_state[f"show_{safe_key}"] = False
                            # Don't use st.rerun() here
            # If zoom flag is set, show the enlarged image below the card
            if st.session_state[f"show_{safe_key}"]:
                st.markdown("---")
                st.subheader(f"🔍 {plot_info['name'].replace('_', ' ').replace('.png', '')}")
                st.image(image, width='stretch')
        except Exception as e:
            st.error(f"Error loading plot: {str(e)}")
            st.code(f"Path: {plot_info['path']}")

    def display_plot_image(self, plot_info) -> None:
        """Render a full-width plot image with a metadata/download sidebar.

        Args:
            plot_info: Dict with 'path', 'name' and 'size' keys.
        """
        try:
            # Load image
            image = Image.open(plot_info['path'])
            # Display with information
            col1, col2 = st.columns([3, 1])
            with col1:
                st.image(image, width='stretch')
            with col2:
                # File information
                st.metric("Size", f"{plot_info['size'] / 1024:.1f} KB")
                st.metric("Resolution", f"{image.width}×{image.height}")
                # File format
                st.write(f"**Format:** {image.format}")
                # Download button
                with open(plot_info['path'], 'rb') as file:
                    btn = st.download_button(
                        label="📥 Download",
                        data=file,
                        file_name=plot_info['name'],
                        mime="image/png",
                        width='stretch'
                    )
        except Exception as e:
            st.error(f"Error loading plot: {str(e)}")
            st.code(f"Path: {plot_info['path']}")
    def render_step_7_modeling(self) -> None:
        """Step 7: Modelling Preparation.

        Summarises the train/validation/test split held in
        ``st.session_state.modeling_data``, plots the target distribution,
        and offers copy-paste integration snippets for common ML libraries.
        """
        st.header("🤖 Modelling Preparation")
        # Guard: modelling data only exists after a successful pipeline run.
        if not st.session_state.pipeline_completed or st.session_state.modeling_data is None:
            st.warning("First run pipeline in Step 4")
            # Suggest quick test
            st.markdown("---")
            st.subheader("🎮 Quick Test")
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                    st.session_state.quick_test_mode = True
                    st.session_state.current_step = 1
                    st.rerun()
            with col2:
                if st.button("Run Pipeline", width='stretch'):
                    st.session_state.current_step = 4
                    st.rerun()
            return

        modeling_data = st.session_state.modeling_data

        # Basic information: one metric per split plus the feature count
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
                st.metric("Training Set", f"{modeling_data['X_train'].shape[0]:,} records")
        with col2:
            if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
                st.metric("Validation Set", f"{modeling_data['X_val'].shape[0]:,} records")
        with col3:
            if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
                st.metric("Test Set", f"{modeling_data['X_test'].shape[0]:,} records")
        with col4:
            if 'feature_names' in modeling_data and modeling_data['feature_names'] is not None:
                st.metric("Number of Features", len(modeling_data['feature_names']))

        # Tabs
        tab1, tab2, tab3 = st.tabs([
            "📐 Data Structure",
            "📊 Target Variable Distribution",
            "🔗 ML Integration"
        ])

        with tab1:
            st.subheader("Modeling Data Structure")
            # Information table: one row per available split
            data_info = []
            if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
                data_info.append({
                    'Dataset': 'Training',
                    'Samples': modeling_data['X_train'].shape[0],
                    'Features': modeling_data['X_train'].shape[1],
                    'Target Variable': 'Yes' if 'y_train' in modeling_data and modeling_data['y_train'] is not None else 'No'
                })
            if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
                data_info.append({
                    'Dataset': 'Validation',
                    'Samples': modeling_data['X_val'].shape[0],
                    'Features': modeling_data['X_val'].shape[1],
                    'Target Variable': 'Yes' if 'y_val' in modeling_data and modeling_data['y_val'] is not None else 'No'
                })
            if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
                data_info.append({
                    'Dataset': 'Test',
                    'Samples': modeling_data['X_test'].shape[0],
                    'Features': modeling_data['X_test'].shape[1],
                    'Target Variable': 'Yes' if 'y_test' in modeling_data and modeling_data['y_test'] is not None else 'No'
                })
            if data_info:
                st.table(pd.DataFrame(data_info))
            else:
                st.info("Modeling data not available")
            # Data sample: first 10 training rows with target appended
            st.subheader("Training Data Sample")
            if ('X_train' in modeling_data and modeling_data['X_train'] is not None and
                    'y_train' in modeling_data and modeling_data['y_train'] is not None):
                sample_data = pd.concat([
                    modeling_data['X_train'].head(10),
                    modeling_data['y_train'].head(10).rename('target')
                ], axis=1)
                st.dataframe(sample_data, width='stretch')

        with tab2:
            st.subheader("Target Variable Distribution")
            if 'y_train' in modeling_data and modeling_data['y_train'] is not None:
                # Target variable histogram
                fig = px.histogram(
                    x=modeling_data['y_train'],
                    nbins=50,
                    title="Target Variable Distribution (Training Set)",
                    labels={'x': 'Target Variable', 'y': 'Frequency'},
                    color_discrete_sequence=['#00CC96']
                )
                st.plotly_chart(fig, width='stretch')
                # Summary statistics of the training target
                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    st.metric("Mean", f"{modeling_data['y_train'].mean():.2f}")
                with col2:
                    st.metric("Standard Deviation", f"{modeling_data['y_train'].std():.2f}")
                with col3:
                    st.metric("Minimum", f"{modeling_data['y_train'].min():.2f}")
                with col4:
                    st.metric("Maximum", f"{modeling_data['y_train'].max():.2f}")
            else:
                st.info("Target variable not available")

        with tab3:
            st.subheader("Machine Learning Library Integration")
            st.info("""
            Your data is ready for use with any Python ML libraries.
            Below are code examples for various libraries.
            """)
            # Library selection
            ml_library = st.selectbox(
                "Select ML Library",
                options=["Scikit-learn", "XGBoost", "LightGBM", "CatBoost", "PyTorch", "TensorFlow"]
            )
            # Code generation: pick the snippet matching the selected library
            code_placeholder = st.empty()
            if ml_library == "Scikit-learn":
                code = """# Example usage with Scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Use prepared data
X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']

# Create and train model
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_val)
print(f"RMSE: {np.sqrt(mean_squared_error(y_val, y_pred)):.4f}")
print(f"R² Score: {r2_score(y_val, y_pred):.4f}")
print(f"Feature Importance: {model.feature_importances_}")"""
            elif ml_library == "XGBoost":
                code = """# Example usage with XGBoost
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Prepare data in DMatrix format
dtrain = xgb.DMatrix(modeling_data['X_train'], label=modeling_data['y_train'])
dval = xgb.DMatrix(modeling_data['X_val'], label=modeling_data['y_val'])

# Model parameters
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Train model
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dval, 'validation')],
    early_stopping_rounds=10,
    verbose_eval=False
)

# Predictions
y_pred = model.predict(dval)
print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Number of Trees: {model.best_ntree_limit}")"""
            elif ml_library == "LightGBM":
                code = """# Example usage with LightGBM
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Prepare data
train_data = lgb.Dataset(
    modeling_data['X_train'],
    label=modeling_data['y_train']
)
val_data = lgb.Dataset(
    modeling_data['X_val'],
    label=modeling_data['y_val'],
    reference=train_data
)

# Model parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Train model
model = lgb.train(
    params,
    train_data,
    valid_sets=[val_data],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(10)]
)

# Predictions
y_pred = model.predict(modeling_data['X_val'])
print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Best Iteration: {model.best_iteration}")"""
            else:
                # Generic template for libraries without a dedicated snippet
                code = f"""# Template for {ml_library}
# Your data available in modeling_data variable
X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']
X_test = modeling_data['X_test']
y_test = modeling_data['y_test']

# Code for {ml_library}...
print(f"Data sizes:")
print(f"  X_train: {{X_train.shape}}")
print(f"  y_train: {{y_train.shape}}")
print(f"  X_val: {{X_val.shape}}")
print(f"  X_test: {{X_test.shape}}")"""
            # Display code
            code_placeholder.code(code, language='python')
            # Copy code button (optional dependency: pyperclip)
            try:
                import pyperclip
                if st.button("📋 Copy Code", width='stretch'):
                    try:
                        pyperclip.copy(code)
                        st.success("Code copied to clipboard!")
                    except:  # clipboard backend may be missing at runtime
                        st.warning("Failed to copy code. Copy manually.")
            except:  # pyperclip not installed
                st.warning("To copy code, install pyperclip library: pip install pyperclip")
            # Final information
            st.markdown("---")
            st.success("""
            🎉 Congratulations! You have successfully prepared data for machine learning.

            **Next Steps:**
            1. Use code above for integration with chosen ML library
            2. Experiment with various models
            3. Optimise hyperparameters
            4. Evaluate results on test set
            """)

        # Navigation
        col1, col2 = st.columns([1, 1])
        with col1:
            if st.button("⬅️ Back to Visualisations", width='stretch'):
                st.session_state.current_step = 6
                st.rerun()
        with col2:
            if st.button("🔄 Run New Pipeline", type="primary", width='stretch'):
                # Reset all pipeline-related state back to a fresh session
                st.session_state.pipeline_completed = False
                st.session_state.processed_data = None
                st.session_state.modeling_data = None
                st.session_state.current_step = 1
                st.session_state.uploaded_file = None
                st.session_state.plots_path = None
                st.session_state.available_plots = {}
                st.session_state.synthetic_data_generated = False
                st.session_state.auto_pipeline_ready = False
                st.session_state.quick_test_mode = False
                st.rerun()
Evaluate results on test set """) # Navigation col1, col2 = st.columns([1, 1]) with col1: if st.button("โฌ…๏ธ Back to Visualisations", width='stretch'): st.session_state.current_step = 6 st.rerun() with col2: if st.button("๐Ÿ”„ Run New Pipeline", type="primary", width='stretch'): # Reset state st.session_state.pipeline_completed = False st.session_state.processed_data = None st.session_state.modeling_data = None st.session_state.current_step = 1 st.session_state.uploaded_file = None st.session_state.plots_path = None st.session_state.available_plots = {} st.session_state.synthetic_data_generated = False st.session_state.auto_pipeline_ready = False st.session_state.quick_test_mode = False st.rerun() def render_footer(self): """Application footer""" st.markdown("---") col1, col2, col3 = st.columns(3) with col1: st.markdown("**TimeFlowPro** v1.1.0") st.caption("Added synthetic data generation") with col2: st.markdown("๐Ÿ“ง Contacts: cool.araby@gmail.com") with col3: st.markdown("ยฉ 2026 All Rights Reserved") def run(self): """Run application""" # Header st.title("๐Ÿ“Š TimeFlow Pro - Data Analysis and Preprocessing") st.markdown("---") # Sidebar self.create_sidebar() # Main content depending on step if st.session_state.current_step == 1: self.render_step_1_data_loading() elif st.session_state.current_step == 2: self.render_step_2_configuration() elif st.session_state.current_step == 3: self.render_step_3_data_analysis() elif st.session_state.current_step == 4: self.render_step_4_pipeline_execution() elif st.session_state.current_step == 5: self.render_step_5_results() elif st.session_state.current_step == 6: self.render_step_6_visualisations() elif st.session_state.current_step == 7: self.render_step_7_modeling() # Footer self.render_footer() # ============================================ # APPLICATION LAUNCH # ============================================ if __name__ == "__main__": app = StreamlitApp() app.run()