Spaces:
Running
Running
| # ============================================ | |
| # TimeFlow Pro - Data Analysis and Preprocessing | |
| # ============================================ | |
| import sys | |
| import os | |
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import os | |
| import sys | |
| import glob | |
| import re | |
| from datetime import datetime, timedelta | |
| import plotly.graph_objects as go | |
| import plotly.express as px | |
| from plotly.subplots import make_subplots | |
| from PIL import Image | |
| import matplotlib.pyplot as plt | |
| import warnings | |
| from pipeline.main_pipeline import EnhancedDataPreprocessingPipeline | |
| warnings.filterwarnings('ignore') | |
| # Add project path | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from config.config import Config | |
| from data_loader.data_loader import DataLoader | |
| from visualization.visualization_manager import VisualisationManager | |
| # ============================================ | |
| # PAGE CONFIGURATION | |
| # ============================================ | |
# Page-level Streamlit settings: browser tab title and icon, wide layout,
# sidebar expanded on first load. Collected in one mapping and unpacked.
_page_settings = {
    "page_title": "TimeFlow Pro - Data Analysis and Preprocessing",
    "page_icon": "๐",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)
| # ============================================ | |
| # STATE MANAGEMENT CLASS | |
| # ============================================ | |
| class StreamlitApp: | |
| """Main Streamlit application class""" | |
def __init__(self):
    """Seed the Streamlit session state, then clear the per-instance handles."""
    self.init_session_state()
    # Nothing is loaded yet: config, pipeline and data all start out empty.
    self.config = self.pipeline = self.data = None
def init_session_state(self):
    """Populate ``st.session_state`` with defaults for any key not yet present.

    Keys that already exist are left untouched, so values survive reruns.
    """
    # (key, zero-argument factory) pairs, in the order they are checked.
    # Factories keep creation lazy: the default config is only built, and a
    # new plots dict only allocated, when the key is actually missing.
    default_factories = (
        ('pipeline_completed', lambda: False),
        ('processed_data', lambda: None),
        ('modeling_data', lambda: None),
        ('current_step', lambda: 1),
        ('uploaded_file', lambda: None),
        ('config_params', self.get_default_config),
        ('plots_path', lambda: None),
        ('available_plots', dict),
        ('synthetic_data_generated', lambda: False),
        ('auto_pipeline_ready', lambda: False),
        ('quick_test_mode', lambda: False),
    )
    for state_key, make_default in default_factories:
        if state_key not in st.session_state:
            st.session_state[state_key] = make_default()
def get_default_config(self):
    """Build and return a new dictionary of default pipeline parameters.

    A fresh dictionary (with fresh list values) is created on every call.
    """
    # Input / output locations and the column to predict
    defaults = dict(data_path='', results_dir='streamlit_results', target_column='')
    # Year range
    defaults.update(start_year=1970, end_year=1990)
    # Feature-engineering windows
    defaults.update(
        max_lags=5,
        seasonal_period=365,
        rolling_windows=[7, 30, 90],
        expanding_windows=[30, 90],
    )
    # Hold-out fractions
    defaults.update(test_size=0.2, validation_size=0.1)
    # Scaling, selection and cleaning
    defaults.update(
        scaling_method='robust',
        feature_selection_method='correlation',
        max_features=20,
        missing_threshold=0.3,
        outlier_method='iqr',
    )
    # Validation switch and split strategy
    defaults.update(enable_validation=True, split_method='time_based')
    return defaults
def create_sidebar(self):
    """Render the sidebar: step navigation, quick-test launcher, project notes, status.

    Reads ``current_step``, ``pipeline_completed`` and ``quick_test_mode`` from
    ``st.session_state``. A navigation button stores its step number in
    ``current_step`` and reruns the script. The quick-start button switches on
    ``quick_test_mode``, clears ``auto_pipeline_ready``, jumps to step 1 and
    reruns.
    """
    steps = {
        1: "๐ Data Loading",
        2: "โ๏ธ Configuration",
        3: "๐ Data Analysis",
        4: "โก Pipeline Execution",
        5: "๐ Results",
        6: "๐ Visualisations",
        7: "๐ค Modelling"
    }
    with st.sidebar:
        st.title("๐ฏ TimeFlowPro")
        st.markdown("---")

        # Navigation: the button of the active step is drawn as "primary"
        st.subheader("Navigation")
        for step_num, step_name in steps.items():
            is_active = st.session_state.current_step == step_num
            if st.button(
                step_name,
                key=f"nav_{step_num}",
                type="primary" if is_active else "secondary",
                width='stretch'
            ):
                st.session_state.current_step = step_num
                st.rerun()
        st.markdown("---")

        # Quick start with synthetic data
        st.subheader("โก Quick Test")
        if st.button("๐ Quick Start with Synthetic Data",
                     type="primary",
                     width='stretch',
                     help="Generate synthetic data and run pipeline immediately"):
            st.session_state.quick_test_mode = True
            # Step 1 only launches the quick test while auto_pipeline_ready
            # is False, and a successful run sets it to True. Clear it here
            # so the button keeps working after the first run.
            st.session_state.auto_pipeline_ready = False
            st.session_state.current_step = 1
            st.rerun()
        st.markdown("---")

        # Project information
        st.subheader("๐ About the Project")
        st.info("""
TimeFlow Pro - Data Analysis and Preprocessing.
**New Features:**
- Synthetic data generation for testing
- Automatic pipeline execution
- Quick testing without file upload
**Standard Features:**
- Missing data analysis and processing
- Outlier detection
- Feature engineering
- Stationarity analysis
- Data scaling
- Feature selection
""")

        # Progress indicator
        if st.session_state.pipeline_completed:
            st.success("โ Pipeline completed")
        else:
            st.warning("โ ๏ธ Pipeline not started")

        # Quick test indicator
        if st.session_state.quick_test_mode:
            st.info("โก Quick test mode active")
def generate_synthetic_data(self, n_days=1095, include_seasonality=True, include_trend=True,
                            include_noise=True, include_exogenous=True, data_type="complex"):
    """
    Generate a synthetic daily time series for testing.

    The target column ``raskhodvoda`` is ``100 + trend + seasonality + noise``.
    About 5% of the values in every non-date column are then blanked out, and
    about 1% of the values in each continuous numeric column are replaced by
    outliers five standard deviations from the column mean.

    Args:
        n_days (int): Requested number of days; capped at 365 / 730 / 1095
            for the simple / medium / complex presets.
        include_seasonality (bool): Add yearly, quarterly, monthly and
            weekly sine components.
        include_trend (bool): Add a linear trend.
        include_noise (bool): Add Gaussian noise to the target.
        include_exogenous (bool): Add weather, calendar, lag and
            moving-average columns. Always off for the "simple" preset.
        data_type (str): "simple", "medium" or anything else ("complex").

    Returns:
        pd.DataFrame | None: The generated frame, or ``None`` when generation
        failed (the error is reported through ``st.error``).

    Side effects:
        Sets ``st.session_state.synthetic_data_generated = True`` on success.
    """
    # Preset -> (day cap, trend slope per day, noise standard deviation)
    presets = {
        "simple": (365, 0.005, 2),
        "medium": (730, 0.01, 5),
    }
    complex_preset = (1095, 0.02, 10)
    # Calendar flags are categorical; an "outlier" such as 1.3 in a 0/1 column
    # is not a meaningful value, so they are left out of outlier injection.
    calendar_columns = {'day_of_week', 'is_weekend', 'is_holiday'}

    try:
        day_cap, trend_strength, noise_std = presets.get(data_type, complex_preset)
        n_days = min(n_days, day_cap)
        if data_type == "simple":
            # The simple preset never carries extra features. The other
            # presets honour the caller's include_exogenous choice.
            include_exogenous = False

        # Daily timestamps at midnight, ending the day before today
        midnight_today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        dates = pd.date_range(start=midnight_today - timedelta(days=n_days),
                              periods=n_days, freq='D')
        day_index = np.arange(n_days)

        # Base trend
        if include_trend:
            trend = np.linspace(0, trend_strength * n_days, n_days)
        else:
            trend = np.zeros(n_days)

        # Seasonality: (amplitude, period in days) for annual, quarterly,
        # monthly and weekly cycles
        seasonal = np.zeros(n_days)
        if include_seasonality:
            for amplitude, period in ((10, 365), (5, 90), (3, 30), (2, 7)):
                seasonal = seasonal + amplitude * np.sin(2 * np.pi * day_index / period)

        # Main target variable (water consumption)
        base_value = 100
        raskhodvoda = base_value + trend + seasonal
        if include_noise:
            raskhodvoda = raskhodvoda + np.random.normal(0, noise_std, n_days)

        data = pd.DataFrame({
            'date': dates,
            'raskhodvoda': raskhodvoda
        })

        if include_exogenous:
            annual_phase = 2 * np.pi * day_index / 365
            # Temperature (seasonal)
            data['temperature'] = 15 + 10 * np.sin(annual_phase) + np.random.normal(0, 3, n_days)
            # Precipitation: random spikes plus a seasonal component, floored at 0
            precipitation = np.random.exponential(2, n_days)
            precipitation_seasonality = 5 * np.sin(annual_phase + np.pi / 2)
            data['precipitation'] = np.maximum(0, precipitation + precipitation_seasonality)
            # Pressure
            data['pressure'] = 760 + np.random.normal(0, 5, n_days)
            # Humidity
            data['humidity'] = (60 + 20 * np.sin(2 * np.pi * day_index / 180)
                                + np.random.normal(0, 10, n_days))
            # Electricity consumption (correlated with target variable)
            data['electricity_consumption'] = raskhodvoda * 0.8 + np.random.normal(0, 5, n_days)
            # Calendar features
            data['day_of_week'] = dates.dayofweek
            data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)
            data['is_holiday'] = np.random.choice([0, 1], size=n_days, p=[0.95, 0.05])
            # Lag variables
            for lag in [1, 7, 30]:
                data[f'raskhodvoda_lag_{lag}'] = data['raskhodvoda'].shift(lag)
            # Moving averages
            for window in [7, 30]:
                data[f'raskhodvoda_ma_{window}'] = data['raskhodvoda'].rolling(window=window).mean()

        # Missing values for realism: ~5% of every column except the dates
        for col in data.columns:
            if col != 'date':
                missing_mask = np.random.random(len(data)) < 0.05
                data.loc[missing_mask, col] = np.nan

        # Outliers: ~1% of each continuous numeric column. Mean and std are
        # taken once per column, before any outlier is written, so injected
        # values do not shift the statistics used for later ones.
        for col in data.select_dtypes(include=[np.number]).columns:
            if col in calendar_columns:
                continue
            mean_val = data[col].mean(skipna=True)
            std_val = data[col].std(skipna=True)
            if pd.isna(mean_val) or pd.isna(std_val) or std_val <= 0:
                continue
            outlier_mask = np.random.random(len(data)) < 0.01
            outlier_count = int(outlier_mask.sum())
            if outlier_count:
                signs = np.random.choice([-1, 1], size=outlier_count)
                data.loc[outlier_mask, col] = mean_val + 5 * std_val * signs

        data.reset_index(drop=True, inplace=True)
        st.session_state.synthetic_data_generated = True
        return data
    except Exception as e:
        # UI boundary: report the failure in the page instead of raising
        st.error(f"Error generating synthetic data: {str(e)}")
        import traceback
        st.error(f"Error traceback: {traceback.format_exc()}")
        return None
def quick_test_pipeline(self):
    """Quick pipeline execution with synthetic data.

    Generates a 365-day "medium" synthetic dataset, writes it to
    ``temp_synthetic_data.csv``, builds a ``Config`` from the session's
    parameters plus quick-test overrides, and runs
    ``EnhancedDataPreprocessingPipeline.run_full_pipeline``.

    On success the results are stored in ``st.session_state``,
    ``current_step`` is set to 5 and the script is rerun. On any failure an
    error is shown and ``quick_test_mode`` is switched off. Step 1 launches
    this method whenever ``quick_test_mode`` is on and ``auto_pipeline_ready``
    is off, so leaving the flag on would re-run the failing pipeline on every
    rerun.
    """
    succeeded = False
    with st.spinner("๐ Running quick test with synthetic data..."):
        try:
            # Step 1: Generate synthetic data
            st.info("Step 1: Generating synthetic data...")
            synthetic_data = self.generate_synthetic_data(
                n_days=365,  # Reduced for speed
                include_seasonality=True,
                include_trend=True,
                include_noise=True,
                include_exogenous=True,
                data_type="medium"  # Balance between speed and quality
            )
            if synthetic_data is None:
                st.error("Failed to generate synthetic data")
                st.session_state.quick_test_mode = False
                return

            # Save data to temporary file
            temp_file = "temp_synthetic_data.csv"
            synthetic_data.to_csv(temp_file, index=False)

            # Step 2: Configure settings
            st.info("Step 2: Configuring settings...")
            # The synthetic dates are generated relative to the current date,
            # so the year range is read from the data rather than fixed.
            date_values = pd.to_datetime(synthetic_data['date'])
            config_params = st.session_state.config_params.copy()
            config_params.update({
                'data_path': temp_file,
                'target_column': 'raskhodvoda',
                'start_year': int(date_values.min().year),
                'end_year': int(date_values.max().year),
                'max_lags': 7,
                'seasonal_period': 365,
                'rolling_windows': [7, 30],
                'expanding_windows': [30],
                'test_size': 0.2,
                'validation_size': 0.1,
                'scaling_method': 'robust',
                'feature_selection_method': 'correlation',
                'max_features': 10,  # Reduced for speed
                'missing_threshold': 0.3,
                'outlier_method': 'iqr',
                'enable_validation': True,
                'split_method': 'time_based'
            })

            # Step 3: Create and run pipeline
            st.info("Step 3: Creating and running pipeline...")
            progress_bar = st.progress(0)
            status_text = st.empty()

            # Publish the configuration and data to the session
            st.session_state.config_params = config_params
            st.session_state.uploaded_file = temp_file
            st.session_state.data_preview = synthetic_data

            status_text.text("Creating configuration...")
            progress_bar.progress(20)
            config = Config(**config_params)

            status_text.text("Initialising pipeline...")
            progress_bar.progress(40)
            self.pipeline = EnhancedDataPreprocessingPipeline(config)

            status_text.text("Running preprocessing pipeline...")
            progress_bar.progress(60)
            processed_data = self.pipeline.run_full_pipeline(
                use_synthetic=False,  # Synthetic data already loaded
                save_intermediate=True,
                create_reports=True
            )

            if processed_data is not None:
                status_text.text("Getting data for modelling...")
                progress_bar.progress(80)
                modeling_data = self.pipeline.get_final_data_for_modelling()

                # Save to session state
                st.session_state.processed_data = processed_data
                st.session_state.modeling_data = modeling_data
                st.session_state.pipeline_completed = True
                st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                st.session_state.auto_pipeline_ready = True

                # Collect information about available plots
                self.collect_available_plots()

                status_text.text("Completing...")
                progress_bar.progress(100)
                st.success("โ Quick test completed successfully!")

                # Show results
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Records generated", f"{synthetic_data.shape[0]:,}")
                with col2:
                    st.metric("Processed data", f"{processed_data.shape[0]:,} rows")
                with col3:
                    st.metric("Final features", f"{processed_data.shape[1]} columns")

                # Automatic transition to results
                st.session_state.current_step = 5
                succeeded = True
            else:
                st.error("โ Error running pipeline")
                st.error("Check logs for more information")
        except Exception as e:
            st.error(f"โ Error during quick test: {str(e)}")
            import traceback
            st.error(f"Error traceback: {traceback.format_exc()}")

    if succeeded:
        # Rerun outside the try block so the rerun signal cannot be
        # intercepted by the broad exception handler above.
        st.rerun()
    else:
        st.session_state.quick_test_mode = False
def render_step_1_data_loading(self):
    """Step 1: Data Loading.

    If quick-test mode is on and the automatic pipeline has not run yet, the
    quick test is launched and nothing else is rendered. Otherwise the page
    shows a file-upload panel on the left and a demo / synthetic-data panel
    on the right.
    """
    st.header("๐ Data Loading")

    # Check quick test mode
    if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
        st.info("โก Quick test mode activated. Generating synthetic data and running pipeline...")
        self.quick_test_pipeline()
        return

    upload_col, demo_col = st.columns([2, 1])
    with upload_col:
        uploaded_file = st.file_uploader(
            "Upload CSV file with data",
            type=['csv', 'xlsx', 'parquet'],
            help="Supported formats: CSV, Excel, Parquet"
        )
        if uploaded_file is not None:
            self._step1_handle_upload(uploaded_file)
    with demo_col:
        self._step1_demo_panel()


def _step1_handle_upload(self, uploaded_file):
    """Save an uploaded file, load it, show a preview and auto-pick a target column."""
    readers = {
        'csv': pd.read_csv,
        'xlsx': pd.read_excel,
        'parquet': pd.read_parquet,
    }
    # Lower-case the extension so "DATA.CSV" is handled like "data.csv"
    extension = uploaded_file.name.rsplit('.', 1)[-1].lower()
    reader = readers.get(extension)
    if reader is None:
        st.error("Unsupported file format")
        return

    file_path = f"temp_data.{extension}"
    go_to_config = False
    try:
        # Save file temporarily, then load it
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        data = reader(file_path)

        # Register the file only once it has been read successfully
        st.session_state.uploaded_file = file_path
        st.session_state.config_params['data_path'] = file_path
        st.session_state.data_preview = data

        # Data preview
        st.subheader("Data Preview")
        st.dataframe(data.head(50), width='stretch')

        # Basic information
        st.subheader("๐ Data Information")
        numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
        total_missing = data.isnull().sum().sum()
        total_cells = data.shape[0] * data.shape[1]
        # An empty frame has no cells; report 0% instead of dividing by zero
        missing_percentage = (total_missing / total_cells) * 100 if total_cells else 0.0

        info_col1, info_col2, info_col3 = st.columns(3)
        with info_col1:
            st.metric("Rows", data.shape[0])
            st.metric("Columns", data.shape[1])
        with info_col2:
            st.metric("Numeric columns", len(numeric_columns))
            st.metric("Categorical columns", len(categorical_cols))
        with info_col3:
            st.metric("Missing values", f"{total_missing:,}")
            st.metric("Missing percentage", f"{missing_percentage:.2f}%")

        # Automatic target column selection if not set
        if not st.session_state.config_params.get('target_column') and numeric_columns:
            # 'расход' is the Cyrillic spelling matching the transliterated
            # 'raskhod'; the previous literal was a mis-decoded form of it.
            target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'расход']
            # str() guards against non-string column labels
            selected_target = next(
                (col for col in numeric_columns
                 if any(keyword in str(col).lower() for keyword in target_keywords)),
                None,
            )
            # If not found by keywords, take last numeric column
            if selected_target is None:
                selected_target = numeric_columns[-1]
            st.session_state.config_params['target_column'] = selected_target
            st.info(f"Target variable automatically selected: **{selected_target}**")
            st.info("You can change it in the next step")

        # Button to proceed to next step
        go_to_config = st.button("โก๏ธ Go to Configuration", type="primary", width='stretch')
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")

    if go_to_config:
        # Rerun outside the try block so the broad handler cannot intercept it
        st.session_state.current_step = 2
        st.rerun()


def _step1_demo_panel(self):
    """Render the demo panel: synthetic-data settings, generator button, summary, instructions."""
    st.subheader("๐ฎ Demo Mode")
    # The selected option is not read anywhere in this method; the widget is
    # kept so the page layout is unchanged.
    st.radio(
        "Choose demo data:",
        ["Synthetic Data", "Time Series Example"]
    )

    # Synthetic data settings
    with st.expander("โ๏ธ Synthetic Data Settings", expanded=False):
        data_type = st.selectbox(
            "Data Type",
            options=["Simple", "Medium", "Complex"],
            index=1,
            help="Simple: 1 year, few features\nMedium: 2 years, main features\nComplex: 3 years, all features"
        )
        n_days = st.slider(
            "Number of days",
            min_value=90,
            max_value=1825,
            value=1095,
            step=30,
            help="Number of days in synthetic data"
        )
        include_trend = st.checkbox("Include trend", value=True)
        include_seasonality = st.checkbox("Include seasonality", value=True)
        include_noise = st.checkbox("Include noise", value=True)
        include_exogenous = st.checkbox("Include additional features", value=True)

    if st.button("Generate and Load Synthetic Data", width='stretch'):
        with st.spinner("Creating synthetic data..."):
            try:
                synthetic_data = self.generate_synthetic_data(
                    n_days=n_days,
                    include_seasonality=include_seasonality,
                    include_trend=include_trend,
                    include_noise=include_noise,
                    include_exogenous=include_exogenous,
                    data_type=data_type.lower()  # "Simple" -> "simple", etc.
                )
            except Exception as e:
                st.error(f"Error creating synthetic data: {str(e)}")
            else:
                if synthetic_data is None:
                    st.error("Failed to generate synthetic data")
                else:
                    st.session_state.data_preview = synthetic_data
                    st.session_state.uploaded_file = "synthetic_data"
                    st.session_state.config_params['data_path'] = 'synthetic_data'
                    # Automatically select target variable
                    if 'raskhodvoda' in synthetic_data.columns:
                        st.session_state.config_params['target_column'] = 'raskhodvoda'
                    st.success(f"โ Synthetic data created: {synthetic_data.shape[0]} rows, {synthetic_data.shape[1]} columns")

    # The summary is drawn from session state, outside the button handler.
    # A button reports True only on the run in which it was clicked, so
    # anything rendered inside the handler (including a nested button)
    # disappears on the next rerun.
    preview = st.session_state.get('data_preview')
    if st.session_state.uploaded_file == "synthetic_data" and preview is not None:
        self._step1_synthetic_summary(preview)

    st.markdown("---")
    # Instructions
    st.subheader("๐ Instructions")
    st.markdown("""
1. Upload CSV file with data **OR**
2. Generate synthetic data for testing
3. Check data preview
4. Target variable will be selected automatically
5. Go to configuration to specify parameters
**Data Requirements:**
- Date in separate column or index
- Clean column names
- Time series with regular intervals
""")


def _step1_synthetic_summary(self, synthetic_data):
    """Show a preview and statistics of the synthetic data plus the quick-run button."""

    def as_day(value):
        """Format a timestamp as YYYY-MM-DD; fall back to str() for anything else."""
        if isinstance(value, (pd.Timestamp, datetime)):
            return value.strftime('%Y-%m-%d')
        return str(value)

    st.subheader("Synthetic Data Preview")
    st.dataframe(synthetic_data.head(20), width='stretch')

    st.subheader("๐ Synthetic Data Statistics")
    stat_col1, stat_col2 = st.columns(2)
    with stat_col1:
        st.metric("Period", f"{synthetic_data.shape[0]} days")
        if 'date' in synthetic_data.columns:
            st.text(f"Start: {as_day(synthetic_data['date'].min())}")
            st.text(f"End: {as_day(synthetic_data['date'].max())}")
    with stat_col2:
        if 'raskhodvoda' in synthetic_data.columns:
            st.metric("Average consumption", f"{synthetic_data['raskhodvoda'].mean():.2f}")
            st.metric("Max consumption", f"{synthetic_data['raskhodvoda'].max():.2f}")
            st.metric("Min consumption", f"{synthetic_data['raskhodvoda'].min():.2f}")

    # Quick pipeline execution
    st.markdown("---")
    # NOTE(review): quick_test_pipeline() generates its own 365-day dataset,
    # so the run does not use the exact frame previewed above.
    if st.button("๐ Quick Run Pipeline with This Data", type="primary", width='stretch'):
        st.session_state.quick_test_mode = True
        st.session_state.auto_pipeline_ready = False
        st.rerun()
def render_step_2_configuration(self):
    """Step 2: Pipeline Configuration.

    Renders four tabs of widgets (basic parameters, data processing, features
    and selection, temporal parameters) and writes each widget's value back
    into ``st.session_state.config_params``. When synthetic data is loaded, a
    button applies a recommended parameter set. Navigation buttons at the
    bottom move to step 1 or step 3.
    """

    def option_index(options, value, fallback=0):
        """Position of ``value`` in ``options``; ``fallback`` when it is not listed."""
        return options.index(value) if value in options else fallback

    def clamp(value, low, high):
        """Limit ``value`` to the closed range [low, high]."""
        return max(low, min(high, value))

    def parse_windows(text, current):
        """Parse comma-separated positive integers; keep ``current`` if none are valid."""
        tokens = (part.strip() for part in text.split(','))
        parsed = [int(tok) for tok in tokens if tok.isdecimal() and int(tok) > 0]
        return parsed or current

    def percent_slider(label, param_key, low, high):
        """Slider over whole percents that stores the result as a fraction."""
        # round() avoids float artefacts such as 0.29 * 100 == 28.999...;
        # clamp() keeps the default inside the slider's bounds.
        start = clamp(int(round(params[param_key] * 100)), low, high)
        params[param_key] = st.slider(
            label, min_value=low, max_value=high, value=start, step=5, format="%d%%"
        ) / 100

    st.header("โ๏ธ Pipeline Configuration")
    params = st.session_state.config_params
    # data_preview is only present in the session once data has been loaded
    preview = st.session_state.get('data_preview')

    # Automatic configuration for synthetic data
    if st.session_state.uploaded_file == "synthetic_data" or params['data_path'] == 'synthetic_data':
        st.info("โก Synthetic data detected. Optimised configuration applied.")
        if st.button("Apply Recommended Settings for Synthetic Data", width='stretch'):
            params.update({
                'target_column': 'raskhodvoda',
                'max_lags': 7,
                'seasonal_period': 365,
                'rolling_windows': [7, 30, 90],
                'expanding_windows': [30, 90],
                'test_size': 0.2,
                'validation_size': 0.1,
                'scaling_method': 'robust',
                'feature_selection_method': 'correlation',
                'max_features': 15,
                'missing_threshold': 0.3,
                'outlier_method': 'iqr',
                'enable_validation': True
            })
            st.success("Settings applied!")
            st.rerun()

    # Configuration sections
    tab1, tab2, tab3, tab4 = st.tabs([
        "๐ Basic Parameters",
        "๐ง Data Processing",
        "๐ฏ Features and Selection",
        "๐ Temporal Parameters"
    ])

    with tab1:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Basic Parameters")
            params['results_dir'] = st.text_input(
                "Results Directory",
                value=params['results_dir']
            )
            if preview is not None:
                all_columns = preview.columns.tolist()
                current_target = params.get('target_column', '')
                default_index = 0
                if current_target in all_columns:
                    # Target already set and present in the data: keep it
                    default_index = all_columns.index(current_target)
                else:
                    numeric_columns = preview.select_dtypes(include=[np.number]).columns.tolist()
                    if numeric_columns:
                        # 'расход' is the Cyrillic spelling matching the
                        # transliterated 'raskhod'; the previous literal was
                        # a mis-decoded form of it.
                        target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'расход']
                        keyword_hit = next(
                            (i for i, col in enumerate(all_columns)
                             if any(keyword in str(col).lower() for keyword in target_keywords)),
                            None,
                        )
                        # Fall back to the first numeric column only when no
                        # keyword matched; a match at index 0 is a real match.
                        if keyword_hit is not None:
                            default_index = keyword_hit
                        else:
                            default_index = all_columns.index(numeric_columns[0])
                params['target_column'] = st.selectbox(
                    "Select Target Variable",
                    options=all_columns,
                    index=default_index,
                    help="Select column to be predicted"
                )
            else:
                # If data not loaded, keep text field
                params['target_column'] = st.text_input(
                    "Target Variable",
                    value=params.get('target_column', ''),
                    help="Enter target column name"
                )
            params['enable_validation'] = st.checkbox(
                "Enable Data Validation",
                value=params['enable_validation']
            )
        with col2:
            st.subheader("Data Split")
            percent_slider("Test Set Size (%)", 'test_size', 5, 40)
            percent_slider("Validation Set Size (%)", 'validation_size', 5, 30)
            split_methods = ['time_based', 'random']
            params['split_method'] = st.selectbox(
                "Split Method",
                options=split_methods,
                index=option_index(split_methods, params['split_method'])
            )

    with tab2:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Missing Value Processing")
            params['missing_threshold'] = st.slider(
                "Missing Value Column Removal Threshold",
                min_value=0.0,
                max_value=0.5,
                value=clamp(float(params['missing_threshold']), 0.0, 0.5),
                step=0.05,
                format="%.2f"
            )
            st.subheader("Outlier Processing")
            outlier_methods = ['iqr', 'zscore', 'isolation_forest']
            params['outlier_method'] = st.selectbox(
                "Outlier Detection Method",
                options=outlier_methods,
                index=option_index(outlier_methods, params['outlier_method'])
            )
        with col2:
            st.subheader("Data Scaling")
            scaling_methods = ['robust', 'standard', 'minmax', 'none']
            params['scaling_method'] = st.selectbox(
                "Scaling Method",
                options=scaling_methods,
                index=option_index(scaling_methods, params['scaling_method'])
            )
            if params['scaling_method'] == 'none':
                st.info("โ ๏ธ Data will not be scaled")

    with tab3:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Feature Engineering")
            params['max_lags'] = st.slider(
                "Maximum Number of Lags",
                min_value=1,
                max_value=20,
                value=clamp(int(params['max_lags']), 1, 20),
                step=1
            )
            rolling_windows_input = st.text_input(
                "Windows for Rolling Statistics (comma-separated)",
                value=', '.join(map(str, params['rolling_windows']))
            )
            if rolling_windows_input:
                params['rolling_windows'] = parse_windows(
                    rolling_windows_input, params['rolling_windows']
                )
        with col2:
            st.subheader("Feature Selection")
            feature_methods = ['correlation', 'variance', 'mutual_info', 'rf', 'none']
            params['feature_selection_method'] = st.selectbox(
                "Feature Selection Method",
                options=feature_methods,
                index=option_index(feature_methods, params['feature_selection_method'])
            )
            params['max_features'] = st.slider(
                "Maximum Number of Features",
                min_value=5,
                max_value=100,
                value=clamp(int(params['max_features']), 5, 100),
                step=5
            )

    with tab4:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Temporal Parameters")
            # Locate the dates: a 'date' column first, then a DatetimeIndex,
            # then the first column whose name mentions 'date' or 'time'.
            dates = None
            if preview is not None:
                if 'date' in preview.columns:
                    dates = pd.to_datetime(preview['date'], errors='coerce')
                elif isinstance(preview.index, pd.DatetimeIndex):
                    dates = preview.index.to_series()
                else:
                    date_cols = [col for col in preview.columns
                                 if 'date' in str(col).lower() or 'time' in str(col).lower()]
                    if date_cols:
                        dates = pd.to_datetime(preview[date_cols[0]], errors='coerce')
            if dates is not None:
                # Unparseable entries become NaT and are ignored
                dates = dates.dropna()
            if dates is not None and len(dates) > 0:
                min_date = dates.min()
                max_date = dates.max()
                col1_date, col2_date = st.columns(2)
                with col1_date:
                    params['start_year'] = st.number_input(
                        "Start Year",
                        min_value=1900,
                        max_value=2100,
                        value=clamp(int(min_date.year), 1900, 2100),
                        step=1
                    )
                with col2_date:
                    params['end_year'] = st.number_input(
                        "End Year",
                        min_value=1900,
                        max_value=2100,
                        value=clamp(int(max_date.year), 1900, 2100),
                        step=1
                    )
        with col2:
            st.subheader("Seasonality")
            seasonal_options = [7, 30, 90, 365, 12, 24]
            params['seasonal_period'] = st.selectbox(
                "Seasonal Period",
                options=seasonal_options,
                index=option_index(seasonal_options, params['seasonal_period'])
            )
            expanding_windows_input = st.text_input(
                "Windows for Expanding Statistics (comma-separated)",
                value=', '.join(map(str, params['expanding_windows']))
            )
            if expanding_windows_input:
                params['expanding_windows'] = parse_windows(
                    expanding_windows_input, params['expanding_windows']
                )

    # Navigation buttons
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("โฌ ๏ธ Back to Loading", width='stretch'):
            st.session_state.current_step = 1
            st.rerun()
    with col3:
        if st.button("Go to Analysis โก๏ธ", type="primary", width='stretch'):
            st.session_state.current_step = 3
            st.rerun()
def render_step_3_data_analysis(self):
    """Step 3: Data Analysis.

    Renders exploratory views of ``st.session_state.data_preview`` in four
    tabs: descriptive statistics and dtypes, per-column distributions, a
    time-series view with a monthly-average chart, and missing-value / IQR
    outlier summaries.  When no preview data is stored, the user is offered
    a button back to Step 1 and nothing else is rendered.
    """
    st.header("๐ Data Analysis")
    if st.session_state.get('data_preview') is None:
        st.warning("First load data in Step 1")
        if st.button("Return to Data Loading"):
            st.session_state.current_step = 1
            st.rerun()
        return
    data = st.session_state.data_preview
    # Shared by the distribution, temporal and outlier tabs, so it is
    # computed once up front rather than inside one particular tab.
    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    # Analysis tabs
    tab1, tab2, tab3, tab4 = st.tabs([
        "๐ Statistics",
        "๐ Distributions",
        "๐ Temporal Analysis",
        "โ Missing Values and Outliers"
    ])
    with tab1:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Basic Statistics")
            st.dataframe(data.describe().round(2), width='stretch')
        with col2:
            st.subheader("Data Types")
            dtype_info = pd.DataFrame({
                'Column': data.columns,
                'Type': data.dtypes.values,
                'Unique Values': [data[col].nunique() for col in data.columns]
            })
            st.dataframe(dtype_info, width='stretch')
    with tab2:
        if numeric_cols:
            selected_col = st.selectbox(
                "Select Column for Analysis",
                options=numeric_cols
            )
            col1, col2 = st.columns(2)
            with col1:
                # Histogram
                fig = px.histogram(
                    data,
                    x=selected_col,
                    title=f"Distribution of {selected_col}",
                    nbins=50,
                    color_discrete_sequence=['#636EFA']
                )
                st.plotly_chart(fig, width='stretch')
            with col2:
                # Box plot
                fig = go.Figure()
                fig.add_trace(go.Box(
                    y=data[selected_col],
                    name=selected_col,
                    boxpoints='outliers',
                    marker_color='#EF553B'
                ))
                fig.update_layout(
                    title=f"Box plot {selected_col}",
                    yaxis_title=selected_col
                )
                st.plotly_chart(fig, width='stretch')
        else:
            st.warning("No numeric columns for distribution analysis")
    with tab3:
        # A column counts as a date candidate when its name contains "date";
        # str() keeps non-string column labels from raising on .lower().
        date_cols = [col for col in data.columns if 'date' in str(col).lower()]
        if date_cols:
            # errors='coerce' turns unparseable values into NaT instead of
            # aborting the whole page on a column that merely looks date-like.
            dates = pd.to_datetime(data[date_cols[0]], errors='coerce')
            # A Series exposes calendar fields through the .dt accessor ...
            month_numbers = dates.dt.month.to_numpy()
        elif isinstance(data.index, pd.DatetimeIndex):
            dates = data.index
            # ... whereas a DatetimeIndex exposes them directly.
            month_numbers = dates.month.to_numpy()
        else:
            dates = None
            month_numbers = None
        if dates is None:
            st.info("For temporal analysis, date column or DatetimeIndex required")
        elif not numeric_cols:
            st.warning("No numeric columns for temporal analysis")
        else:
            # Select column for time series
            ts_col = st.selectbox(
                "Select Column for Time Series",
                options=numeric_cols
            )
            fig = go.Figure()
            fig.add_trace(go.Scatter(
                x=dates,
                y=data[ts_col],
                mode='lines',
                name=ts_col,
                line=dict(color='#636EFA', width=2)
            ))
            fig.update_layout(
                title=f"Time Series: {ts_col}",
                xaxis_title="Date",
                yaxis_title=ts_col,
                hovermode='x unified'
            )
            st.plotly_chart(fig, width='stretch')
            # Monthly averages are only shown when there are more than 30 rows.
            if len(dates) > 30:
                # Grouping by a positional array of month numbers works for
                # both the date-column and the DatetimeIndex case.
                monthly_data = data.groupby(month_numbers)[ts_col].mean()
                fig2 = px.bar(
                    x=monthly_data.index,
                    y=monthly_data.values,
                    title=f"Monthly Seasonality: {ts_col}",
                    labels={'x': 'Month', 'y': 'Average Value'}
                )
                st.plotly_chart(fig2, width='stretch')
    with tab4:
        col1, col2 = st.columns(2)
        with col1:
            # Missing value analysis
            st.subheader("Missing Values")
            missing_data = data.isnull().sum()
            missing_percentage = (missing_data / len(data)) * 100
            missing_df = pd.DataFrame({
                'Column': missing_data.index,
                'Missing Count': missing_data.values,
                'Missing Percentage': missing_percentage.values
            }).sort_values('Missing Count', ascending=False)
            st.dataframe(missing_df, width='stretch')
            # Chart only when at least one value is missing
            if missing_data.sum() > 0:
                fig = px.bar(
                    missing_df,
                    x='Column',
                    y='Missing Percentage',
                    title="Missing Percentage by Column",
                    color='Missing Percentage',
                    color_continuous_scale='Reds'
                )
                st.plotly_chart(fig, width='stretch')
        with col2:
            # Quick outlier analysis using the 1.5 * IQR rule
            st.subheader("Quick Outlier Analysis")
            if numeric_cols:
                total_rows = len(data)
                outlier_summary = []
                for col in numeric_cols[:5]:  # Limit to 5 columns for speed
                    q1 = data[col].quantile(0.25)
                    q3 = data[col].quantile(0.75)
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
                    outlier_mask = (data[col] < lower_bound) | (data[col] > upper_bound)
                    outlier_count = int(outlier_mask.sum())
                    # Guard the percentage against an empty preview frame.
                    outlier_pct = (outlier_count / total_rows) * 100 if total_rows else 0.0
                    outlier_summary.append({
                        'Column': col,
                        'Outliers': outlier_count,
                        'Percentage': f"{outlier_pct:.2f}%"
                    })
                outlier_df = pd.DataFrame(outlier_summary)
                st.dataframe(outlier_df, width='stretch')
            else:
                st.warning("No numeric columns for outlier analysis")
    # Navigation buttons
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("โฌ ๏ธ Back to Configuration", width='stretch'):
            st.session_state.current_step = 2
            st.rerun()
    with col3:
        if st.button("Run Pipeline โก๏ธ", type="primary", width='stretch'):
            st.session_state.current_step = 4
            st.rerun()
def render_step_4_pipeline_execution(self):
    """Step 4: Pipeline Execution.

    Checks that data and a target column are configured, optionally hands
    over to the quick-test flow, shows the active configuration and the
    execution options, and runs the preprocessing pipeline when the run
    button is pressed.  On success the processed data, modelling data and
    plots path are stored in ``st.session_state``.
    """
    st.header("โก Pipeline Execution")
    params = st.session_state.config_params
    # 'demo' and 'synthetic_data' are the two data_path values that do not
    # require an uploaded file.
    uses_builtin_data = params['data_path'] in ('demo', 'synthetic_data')
    # Readiness check
    issues = []
    if not st.session_state.uploaded_file and not uses_builtin_data:
        issues.append("Data not loaded")
    if not params['target_column']:
        issues.append("Target variable not selected")
    # Automatic synthetic data generation if quick test enabled
    if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
        st.info("โก Quick test mode activated. Generating synthetic data...")
        self.quick_test_pipeline()
        return
    # Display warnings
    if issues:
        st.error("โ ๏ธ Fix before running:")
        for issue in issues:
            st.write(f"- {issue}")
        # Suggest using synthetic data
        st.markdown("---")
        st.subheader("๐ฎ Quick Solution")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("Generate Synthetic Data", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("To Data Loading", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        col3, col4 = st.columns(2)
        with col3:
            if st.button("To Configuration", width='stretch'):
                st.session_state.current_step = 2
                st.rerun()
        return
    # Display configuration
    st.subheader("Execution Configuration")
    config_col1, config_col2 = st.columns(2)
    with config_col1:
        st.metric("Target Variable", params['target_column'])
        st.metric("Test Set", f"{params['test_size']*100:.0f}%")
        st.metric("Scaling Method", params['scaling_method'])
    with config_col2:
        st.metric("Max Lags", params['max_lags'])
        st.metric("Feature Selection Method", params['feature_selection_method'])
        st.metric("Validation Enabled", "Yes" if params['enable_validation'] else "No")
    # Execution options
    st.subheader("Execution Options")
    col1, col2 = st.columns(2)
    with col1:
        # Pre-ticked and locked when the configured data source is already
        # one of the built-in synthetic ones.
        use_synthetic = st.checkbox(
            "Use Synthetic Data",
            value=uses_builtin_data,
            disabled=uses_builtin_data
        )
        save_intermediate = st.checkbox(
            "Save Intermediate Results",
            value=True
        )
    with col2:
        create_reports = st.checkbox(
            "Create Reports",
            value=True
        )
        # NOTE(review): this option is rendered but not forwarded to
        # run_full_pipeline below — confirm whether the pipeline exposes a
        # matching switch.
        create_visualisations = st.checkbox(
            "Create Visualisations",
            value=True,
            help="Create data analysis plots"
        )

    def _open_results():
        # Runs as a button callback, i.e. before the rerun that the click
        # triggers, so the step change is not lost even though the button
        # lives inside the run-button branch.
        st.session_state.current_step = 5

    # Run button
    if st.button("๐ Run Preprocessing Pipeline", type="primary", width='stretch'):
        # Create progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()
        try:
            # Create configuration
            status_text.text("Creating configuration...")
            progress_bar.progress(10)
            config = Config(**st.session_state.config_params)
            # Create pipeline
            status_text.text("Initialising pipeline...")
            progress_bar.progress(20)
            self.pipeline = EnhancedDataPreprocessingPipeline(config)
            # Synthetic data is used when requested or when the data source
            # is one of the built-in ones.
            use_synthetic_flag = use_synthetic or uses_builtin_data
            # Run pipeline
            status_text.text("Running preprocessing pipeline...")
            progress_bar.progress(30)
            processed_data = self.pipeline.run_full_pipeline(
                use_synthetic=use_synthetic_flag,
                save_intermediate=save_intermediate,
                create_reports=create_reports
            )
            if processed_data is not None:
                status_text.text("Getting data for modelling...")
                progress_bar.progress(80)
                modeling_data = self.pipeline.get_final_data_for_modelling()
                # Save to session state
                st.session_state.processed_data = processed_data
                st.session_state.modeling_data = modeling_data
                st.session_state.pipeline_completed = True
                st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                # Collect information about available plots
                self.collect_available_plots()
                # Completion
                status_text.text("Completing...")
                progress_bar.progress(100)
                st.success("โ Pipeline completed successfully!")
                # Show results
                col1, col2, col3 = st.columns(3)
                with col1:
                    if hasattr(self.pipeline, 'results') and 'data_loading' in self.pipeline.results:
                        st.metric("Original Data", f"{self.pipeline.results['data_loading']['shape'][0]:,} rows")
                    else:
                        st.metric("Original Data", "Information unavailable")
                with col2:
                    st.metric("Processed Data", f"{processed_data.shape[0]:,} rows")
                with col3:
                    st.metric("Final Features", f"{processed_data.shape[1]} columns")
                # A plain `if st.button(...)` here would never fire: on the
                # rerun caused by the click the outer run button is False
                # again and this branch is skipped.  A callback avoids that.
                st.button(
                    "๐ Go to Results",
                    type="primary",
                    width='stretch',
                    on_click=_open_results
                )
            else:
                st.error("โ Error executing pipeline")
                st.error("Check logs for more information")
        except Exception as e:
            # UI boundary: report any pipeline failure on the page instead
            # of letting it abort the script.
            progress_bar.progress(0)
            status_text.text("")
            st.error(f"โ Error: {str(e)}")
            st.exception(e)
    # Back button
    if st.button("โฌ ๏ธ Back to Analysis", width='stretch'):
        st.session_state.current_step = 3
        st.rerun()
def collect_available_plots(self):
    """Collect information about available plots.

    Walks ``st.session_state.plots_path`` (including subfolders), sorts the
    PNG files found into named categories by file-name pattern, and stores
    the result in ``st.session_state.available_plots`` as
    ``{category: [{'path', 'name', 'rel_path', 'size'}, ...]}``.  PNG files
    matching no category are listed under ``'other'``.  When the plots
    folder is unset or missing, an empty dict is stored.
    """
    import fnmatch

    plots_root = st.session_state.plots_path
    if not plots_root or not os.path.exists(plots_root):
        st.session_state.available_plots = {}
        return
    plots_categories = {
        'summary': ['summary_dashboard.png'],
        'missing_values': ['missing_values_analysis.png'],
        'outliers': ['outliers_analysis.png', 'outlier_handling_results.png', 'temporal_outliers.png'],
        'stationarity': ['stationarity_*.png'],
        'data_split': ['data_split.png'],
        'scaling': ['scaling_results.png'],
        'feature_selection': ['feature_selection_*.png'],
        'correlations': ['correlation_matrix.png', 'high_correlations.png', 'target_correlations.png', 'vif_scores.png']
    }

    def describe(path):
        # Metadata record stored for each plot file.
        return {
            'path': path,
            'name': os.path.basename(path),
            'rel_path': os.path.relpath(path, plots_root),
            'size': os.path.getsize(path)
        }

    # Walk the tree once; every later lookup works on this list.
    png_files = []
    for root, dirs, filenames in os.walk(plots_root):
        dirs.sort()  # deterministic traversal order
        for filename in sorted(filenames):
            if filename.endswith('.png'):
                png_files.append(os.path.join(root, filename))

    available_plots = {}
    categorised = set()
    for category, patterns in plots_categories.items():
        category_paths = []
        for pattern in patterns:
            if '*' in pattern:
                # Proper wildcard match on the file name, at any depth.
                # (Stripping the '*' and doing a substring test would look
                # for e.g. 'stationarity_.png' and never match real files.)
                matches = [
                    path for path in png_files
                    if fnmatch.fnmatch(os.path.basename(path), pattern)
                ]
            else:
                # Exact name: prefer the file in the main folder, otherwise
                # take every same-named file found in subfolders.
                top_level = os.path.join(plots_root, pattern)
                if os.path.exists(top_level):
                    matches = [top_level]
                else:
                    matches = [
                        path for path in png_files
                        if os.path.basename(path) == pattern
                    ]
            for path in matches:
                if path not in category_paths and os.path.exists(path):
                    category_paths.append(path)
        if category_paths:
            available_plots[category] = [describe(path) for path in category_paths]
            categorised.update(category_paths)

    # Everything that did not land in a named category goes to 'other'.
    uncategorised = [
        describe(path) for path in png_files
        if path not in categorised and os.path.exists(path)
    ]
    if uncategorised:
        available_plots['other'] = uncategorised
    st.session_state.available_plots = available_plots
def render_step_5_results(self):
    """Step 5: Results.

    Shows the outcome of a completed pipeline run in four tabs (data
    overview, feature analysis, validation, export) followed by navigation
    buttons.  When no completed run is stored in the session state, a
    quick-start prompt is shown instead.
    """
    st.header("๐ Pipeline Results")
    if not st.session_state.pipeline_completed or st.session_state.processed_data is None:
        st.warning("Pipeline not yet run or not completed successfully")
        # Suggest using quick test
        st.markdown("---")
        st.subheader("๐ฎ Quick Start")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("๐ Run Quick Test", type="primary", width='stretch'):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("Load Data", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        return
    data = st.session_state.processed_data
    modeling_data = st.session_state.modeling_data
    # Results tabs
    tab1, tab2, tab3, tab4 = st.tabs([
        "๐ Data Overview",
        "๐ Feature Analysis",
        "๐ Validation",
        "๐พ Export"
    ])
    with tab1:
        self._render_results_overview(data)
    with tab2:
        self._render_results_features(data, modeling_data)
    with tab3:
        self._render_results_validation(data, modeling_data)
    with tab4:
        self._render_results_export(data, modeling_data)
    # Navigation
    st.markdown("---")
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("โฌ ๏ธ Back to Pipeline", width='stretch'):
            st.session_state.current_step = 4
            st.rerun()
    with col3:
        if st.button("Go to Visualisations โก๏ธ", type="primary", width='stretch'):
            st.session_state.current_step = 6
            st.rerun()

def _render_results_overview(self, data):
    """Render headline counts, a 100-row preview and describe() statistics."""
    st.subheader("Processed Data")
    # Basic information
    info_col1, info_col2, info_col3, info_col4 = st.columns(4)
    with info_col1:
        st.metric("Total Records", f"{data.shape[0]:,}")
    with info_col2:
        st.metric("Total Features", data.shape[1])
    with info_col3:
        numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
        st.metric("Numeric Features", len(numeric_cols))
    with info_col4:
        missing_total = data.isnull().sum().sum()
        st.metric("Missing Values", missing_total)
    # Data preview
    st.subheader("Data Preview")
    st.dataframe(data.head(100), width='stretch')
    # Statistics
    st.subheader("Processed Data Statistics")
    st.dataframe(data.describe().round(4), width='stretch')

def _render_results_features(self, data, modeling_data):
    """Render the selected-feature cards, feature importance and correlations."""
    st.subheader("Feature Analysis")
    if modeling_data and 'feature_names' in modeling_data:
        features = modeling_data['feature_names']
        st.write(f"**Selected Features:** {len(features)}")
        # Display features as cards, four per row
        cols_per_row = 4
        for start in range(0, len(features), cols_per_row):
            row = st.columns(cols_per_row)
            for column, feature in zip(row, features[start:start + cols_per_row]):
                with column:
                    st.info(feature)
        # Feature importance, only when the pipeline's selector exposes it
        selector = getattr(self.pipeline, 'feature_selector', None)
        importances = getattr(selector, 'feature_importances_', None)
        if importances is not None and len(importances) > 0:
            # Names and scores may differ in length; pair up the common prefix.
            paired = min(len(features), len(importances))
            importance_df = pd.DataFrame({
                'Feature': list(features[:paired]),
                'Importance': list(importances[:paired])
            }).sort_values('Importance', ascending=False)
            st.subheader("Feature Importance")
            fig = px.bar(
                importance_df.head(20),
                x='Importance',
                y='Feature',
                orientation='h',
                title="Top-20 Features by Importance",
                color='Importance',
                color_continuous_scale='Viridis'
            )
            st.plotly_chart(fig, width='stretch')
    # Correlation matrix (limited for performance)
    if data.shape[1] <= 50:
        st.subheader("Correlation Matrix (first 20 features)")
        # Select only numeric columns and limit quantity
        numeric_data = data.select_dtypes(include=[np.number])
        if len(numeric_data.columns) > 20:
            numeric_data = numeric_data.iloc[:, :20]
        if not numeric_data.empty and len(numeric_data.columns) > 1:
            corr_matrix = numeric_data.corr()
            fig = go.Figure(data=go.Heatmap(
                z=corr_matrix.values,
                x=corr_matrix.columns,
                y=corr_matrix.columns,
                colorscale='RdBu',
                zmin=-1,
                zmax=1,
                text=corr_matrix.round(2).values,
                texttemplate='%{text}',
                textfont={"size": 10}
            ))
            fig.update_layout(
                title="Correlation Matrix",
                width=800,
                height=800
            )
            st.plotly_chart(fig, width='stretch')
        else:
            st.info("Insufficient data for correlation matrix")

def _find_validation_data(self):
    """Return the first non-empty validation payload on the pipeline, else None."""

    def is_empty(value):
        # len() handles containers (and frames, whose truth value is
        # ambiguous); plain scalars fall back to ordinary truthiness.
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            return not value

    pipeline = self.pipeline
    if pipeline is None:
        return None
    # Look for validation results under different keys
    results = getattr(pipeline, 'results', None)
    if results is not None:
        for key in ('final_validation', 'validation_results', 'validation', 'validation_checks'):
            if key in results and not is_empty(results[key]):
                return results[key]
    # If not found in results, check other attributes
    report = getattr(pipeline, 'validation_report', None)
    if not is_empty(report):
        return report
    if hasattr(pipeline, 'get_validation_summary'):
        try:
            summary = pipeline.get_validation_summary()
        except Exception as exc:
            # Best-effort source: report the failure instead of hiding it,
            # then fall back to the execution statistics.
            st.caption(f"Validation summary unavailable: {exc}")
        else:
            if not is_empty(summary):
                return summary
    return None

def _render_results_validation(self, data, modeling_data):
    """Render validation results, or execution statistics when none exist."""
    st.subheader("Validation Results")
    validation_data = self._find_validation_data()
    if validation_data is None:
        self._render_pipeline_statistics(data, modeling_data)
        return
    st.success("โ Validation results available")
    if not isinstance(validation_data, dict):
        # If not dictionary, display as is
        st.write("Validation results:")
        st.write(validation_data)
        return
    # Locate the checks once so the counter and the detail list agree.
    if 'checks' in validation_data:
        checks = validation_data['checks']
    elif 'basic_checks' in validation_data:
        checks = validation_data['basic_checks']
    else:
        # Flat layout: only entries shaped like a check are counted, so
        # keys such as 'status' or 'score' do not inflate the total.
        checks = {
            name: info for name, info in validation_data.items()
            if isinstance(info, dict) and 'passed' in info
        } or None
    col1, col2 = st.columns(2)
    with col1:
        # Status
        status = validation_data.get('status', 'UNKNOWN')
        if status == 'PASS':
            st.success(f"Status: {status}")
        elif status == 'WARNING':
            st.warning(f"Status: {status}")
        else:
            st.error(f"Status: {status}")
        # Overall score; a genuine score of 0 is still shown
        score = validation_data.get('overall_score', validation_data.get('score'))
        if score is not None:
            st.metric("Overall Score", f"{score}/100")
    with col2:
        # Check counters
        if isinstance(checks, dict):
            passed = sum(
                1 for check in checks.values()
                if isinstance(check, dict) and check.get('passed', False)
            )
            st.metric("Checks Passed", f"{passed}/{len(checks)}")
    # Check details
    st.subheader("Check Details")
    if checks and isinstance(checks, dict):
        for check_name, check_info in checks.items():
            if isinstance(check_info, dict):
                col1, col2, col3 = st.columns([3, 1, 3])
                with col1:
                    # Check description
                    description = check_info.get('description', check_name)
                    st.write(f"**{description}**")
                with col2:
                    # Status
                    if check_info.get('passed', False):
                        st.success("โ ")
                    else:
                        st.error("โ")
                with col3:
                    # Message
                    if 'message' in check_info:
                        st.caption(check_info['message'])
            else:
                # Simple format
                st.write(f"**{check_name}**: {check_info}")
    else:
        # Display all validation data
        st.json(validation_data)

def _render_pipeline_statistics(self, data, modeling_data):
    """Render the stage table and data-quality metrics shown without validation data."""
    st.info("Validation results in report format not available, but pipeline execution statistics presented below")
    st.subheader("Pipeline Execution Statistics")
    data_loaded = data is not None
    split_done = bool(modeling_data)
    # Each stage carries an explicit success flag, so the colour of the
    # status box does not depend on inspecting the label text.
    # NOTE(review): the middle stages are reported as completed
    # unconditionally — no per-stage status is consulted here.
    stages = [
        ("Data Loading", data_loaded, "โ Successful" if data_loaded else "โ Error"),
        ("Missing Value Processing", True, "โ Completed"),
        ("Outlier Processing", True, "โ Completed"),
        ("Feature Engineering", True, "โ Completed"),
        ("Scaling", True, "โ Completed"),
        ("Feature Selection", True, "โ Completed"),
        ("Data Split", split_done, "โ Completed" if split_done else "โ Not completed")
    ]
    for stage_name, succeeded, status in stages:
        col1, col2 = st.columns([3, 1])
        with col1:
            st.write(f"**{stage_name}**")
        with col2:
            if succeeded:
                st.success(status)
            else:
                st.error(status)
    # If pipeline exists, show available metrics
    if self.pipeline is None:
        return
    st.subheader("Data Quality Metrics")
    col1, col2, col3 = st.columns(3)
    with col1:
        if data is not None:
            total_cells = data.shape[0] * data.shape[1]
            # Guard against an empty frame
            missing_pct = (data.isnull().sum().sum() / total_cells) * 100 if total_cells else 0.0
            st.metric("Missing Values", f"{missing_pct:.2f}%")
    with col2:
        if data is not None:
            numeric_count = len(data.select_dtypes(include=[np.number]).columns)
            st.metric("Numeric Features", numeric_count)
    with col3:
        # Share of the training split among all available splits
        if modeling_data and modeling_data.get('X_train') is not None:
            train_size = len(modeling_data['X_train'])
            total_size = train_size
            for split_key in ('X_test', 'X_val'):
                if modeling_data.get(split_key) is not None:
                    total_size += len(modeling_data[split_key])
            if total_size > 0:
                train_pct = (train_size / total_size) * 100
                st.metric("Training Set", f"{train_pct:.1f}%")

def _render_results_export(self, data, modeling_data):
    """Render download buttons for the processed data and the modelling splits."""
    import io

    st.subheader("Data Export")
    # Export formats
    export_format = st.radio(
        "Export Format",
        options=['CSV', 'Parquet', 'Excel'],
        horizontal=True
    )
    if data is not None:
        # Export processed data
        st.write("**Processed Data**")
        if export_format == 'CSV':
            st.download_button(
                label="๐ฅ Download CSV",
                data=data.to_csv(index=True),
                file_name="streamlit_processed_data.csv",
                mime="text/csv",
                width='stretch'
            )
        elif export_format == 'Parquet':
            buffer = io.BytesIO()
            try:
                data.to_parquet(buffer)
            except (ImportError, ValueError) as exc:
                # Missing parquet engine or unsupported frame layout: show
                # the reason instead of aborting the whole page.
                st.error(f"Parquet export unavailable: {exc}")
            else:
                st.download_button(
                    label="๐ฅ Download Parquet",
                    data=buffer.getvalue(),
                    file_name="streamlit_processed_data.parquet",
                    mime="application/octet-stream",
                    width='stretch'
                )
        elif export_format == 'Excel':
            buffer = io.BytesIO()
            try:
                with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                    data.to_excel(writer, sheet_name='Processed_Data')
            except (ImportError, ValueError) as exc:
                st.error(f"Excel export unavailable: {exc}")
            else:
                st.download_button(
                    label="๐ฅ Download Excel",
                    data=buffer.getvalue(),
                    file_name="streamlit_processed_data.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    width='stretch'
                )
    # Export modeling data: one CSV download per available split
    if modeling_data:
        st.write("**Modeling Data**")
        splits = [
            ('train', "๐ฅ Training Set", "train_data.csv"),
            ('val', "๐ฅ Validation Set", "validation_data.csv"),
            ('test', "๐ฅ Test Set", "test_data.csv")
        ]
        for column, (suffix, label, file_name) in zip(st.columns(3), splits):
            x_key = f'X_{suffix}'
            y_key = f'y_{suffix}'
            with column:
                if x_key not in modeling_data or modeling_data[x_key] is None:
                    continue
                split_df = modeling_data[x_key]
                target = modeling_data[y_key] if y_key in modeling_data else None
                if target is not None:
                    # Append the target as a 'target' column when present
                    split_df = pd.concat([split_df, target.rename('target')], axis=1)
                st.download_button(
                    label=label,
                    data=split_df.to_csv(),
                    file_name=file_name,
                    mime="text/csv",
                    width='stretch'
                )
def render_step_6_visualisations(self):
    """Step 6: Visualisations.

    Shows the plots recorded in ``st.session_state.available_plots``: one
    tab per category (with ``summary`` first when present) followed by a
    three-column gallery of every plot.  Without a completed pipeline run,
    or without any recorded plots, a prompt is shown instead.
    """
    st.header("๐ Pipeline Visualisations")

    # Guard 1: nothing to show before the pipeline has completed.
    if not st.session_state.pipeline_completed:
        st.warning("First run pipeline in Step 4")
        st.markdown("---")
        st.subheader("๐ฎ Quick Test")
        left, right = st.columns(2)
        with left:
            if st.button("๐ Run Quick Test", type="primary", width='stretch'):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
        with right:
            if st.button("Run Pipeline", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()
        return

    # Guard 2: the pipeline ran but no plots were recorded.
    if not st.session_state.available_plots:
        st.warning("Plots not found. Ensure pipeline was run with visualisation option enabled.")
        if st.button("Try to Find Plots", width='stretch'):
            self.collect_available_plots()
            st.rerun()
        return

    plots_by_category = st.session_state.available_plots
    total_plots = sum(map(len, plots_by_category.values()))
    st.success(f"โ Found {total_plots} plots")

    # 'summary' leads; the remaining categories keep their stored order.
    names = list(plots_by_category.keys())
    categories = (
        [name for name in names if name == 'summary']
        + [name for name in names if name != 'summary']
    )
    tab_titles = [name.capitalize().replace('_', ' ') for name in categories]
    for tab, category in zip(st.tabs(tab_titles), categories):
        with tab:
            self.display_category_plots(category)

    # Gallery of every plot, three cards per row.
    st.markdown("---")
    st.subheader("๐ผ๏ธ All Plots Gallery")
    all_plots = [
        (category, plot)
        for category, plots in plots_by_category.items()
        for plot in plots
    ]
    cols_per_row = 3
    for start in range(0, len(all_plots), cols_per_row):
        row_columns = st.columns(cols_per_row)
        row_items = all_plots[start:start + cols_per_row]
        for column, (category, plot_info) in zip(row_columns, row_items):
            with column:
                self.display_plot_card(plot_info, category)
def display_category_plots(self, category):
    """Render every plot of one category, sorted by name, in open expanders.

    Plots are read from ``st.session_state.available_plots``; an info box
    is shown when the category has none.
    """
    category_plots = st.session_state.available_plots.get(category, [])
    if category_plots:
        heading = category.capitalize().replace('_', ' ')
        st.subheader(f"{heading} ({len(category_plots)} plots)")
        # Work on a copy so the stored list keeps its order.
        ordered = list(category_plots)
        ordered.sort(key=lambda entry: entry['name'])
        for plot_info in ordered:
            caption = plot_info['name'].replace('_', ' ').replace('.png', '')
            with st.expander(f"๐ {caption}", expanded=True):
                self.display_plot_image(plot_info)
    else:
        st.info(f"No plots in category '{category}'")
def display_plot_card(self, plot_info, category):
    """Display plot card.

    Renders one gallery card for a plot: title, image, file size and
    category, plus Zoom / Hide buttons that toggle a second copy of the
    image below the card.  The toggle state lives in ``st.session_state``
    under a key derived from the plot's path.

    Args:
        plot_info: dict with the keys ``'path'``, ``'name'``, ``'rel_path'``
            and ``'size'`` (size in bytes) read by this method.
        category: category label shown in the card caption.
    """
    try:
        title = plot_info['name'].replace('_', ' ').replace('.png', '')
        # Widget and state keys are built from the path with separators and
        # dots replaced by underscores.
        safe_key = plot_info['path'].replace('/', '_').replace('\\', '_').replace('.', '_')
        show_key = f"show_{safe_key}"
        # Initialise state for this plot if not exists
        if show_key not in st.session_state:
            st.session_state[show_key] = False
        # The context manager closes the underlying file once the card has
        # been rendered; a bare Image.open() would leave the handle open.
        with Image.open(plot_info['path']) as image:
            with st.container():
                st.markdown(f"**{title}**")
                st.image(image, width='stretch', caption=plot_info['rel_path'])
                # File information
                size_kb = plot_info['size'] / 1024
                st.caption(f"Size: {size_kb:.1f} KB | Category: {category}")
                # Zoom control buttons
                col1, col2 = st.columns(2)
                with col1:
                    # No st.rerun() here: the flag is read again further
                    # down in this same run.
                    if st.button("๐ Zoom", key=f"zoom_{safe_key}", width='stretch'):
                        st.session_state[show_key] = True
                with col2:
                    # The Hide button only exists while the zoomed image is shown
                    if st.session_state[show_key]:
                        if st.button("โ Hide", key=f"hide_{safe_key}", width='stretch'):
                            st.session_state[show_key] = False
                # Zoomed copy of the image
                if st.session_state[show_key]:
                    st.markdown("---")
                    st.subheader(f"๐ {title}")
                    st.image(image, width='stretch')
    except Exception as e:
        # UI boundary: a broken or missing plot must not take down the
        # whole gallery, so any failure is reported inside the card.
        st.error(f"Error loading plot: {str(e)}")
        st.code(f"Path: {plot_info['path']}")
def display_plot_image(self, plot_info):
    """Display plot image.

    Renders the image next to a side panel with its file size, pixel
    resolution, image format and a download button.

    Args:
        plot_info: dict with the keys ``'path'``, ``'name'`` and ``'size'``
            (size in bytes) read by this method.
    """
    try:
        # Path-derived key keeps the download buttons of identically named
        # plots from different folders distinct.
        safe_key = plot_info['path'].replace('/', '_').replace('\\', '_').replace('.', '_')
        # Read the file once for the download button and close it right away.
        with open(plot_info['path'], 'rb') as file:
            payload = file.read()
        # The context manager releases the image's file handle after rendering.
        with Image.open(plot_info['path']) as image:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.image(image, width='stretch')
            with col2:
                # File information
                st.metric("Size", f"{plot_info['size'] / 1024:.1f} KB")
                st.metric("Resolution", f"{image.width}ร{image.height}")
                # File format
                st.write(f"**Format:** {image.format}")
                # Download button
                st.download_button(
                    label="๐ฅ Download",
                    data=payload,
                    file_name=plot_info['name'],
                    mime="image/png",
                    width='stretch',
                    key=f"download_{safe_key}"
                )
    except Exception as e:
        # UI boundary: report the failure in place instead of aborting the
        # surrounding category tab.
        st.error(f"Error loading plot: {str(e)}")
        st.code(f"Path: {plot_info['path']}")
def render_step_7_modeling(self):
    """Step 7: Modelling Preparation.

    Reads ``st.session_state.modeling_data`` and renders split sizes, three
    tabs (data structure, target distribution, ML code examples), a closing
    message and navigation buttons.

    When the pipeline has not completed, or no modelling data is stored, only
    a warning and two shortcuts (quick test -> step 1, pipeline -> step 4)
    are rendered and the method returns early.
    """
    st.header("๐ค Modelling Preparation")
    if not st.session_state.pipeline_completed or st.session_state.modeling_data is None:
        st.warning("First run pipeline in Step 4")
        # Suggest quick test
        st.markdown("---")
        st.subheader("๐ฎ Quick Test")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("๐ Run Quick Test", type="primary", width='stretch'):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("Run Pipeline", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()
        return

    modeling_data = st.session_state.modeling_data

    # Basic information: one metric per available split plus feature count.
    columns = st.columns(4)
    split_metrics = (
        ("Training Set", 'X_train'),
        ("Validation Set", 'X_val'),
        ("Test Set", 'X_test'),
    )
    for column, (label, key) in zip(columns, split_metrics):
        features = self._modeling_entry(modeling_data, key)
        if features is not None:
            column.metric(label, f"{features.shape[0]:,} records")
    feature_names = self._modeling_entry(modeling_data, 'feature_names')
    if feature_names is not None:
        columns[3].metric("Number of Features", len(feature_names))

    # Tabs
    tab1, tab2, tab3 = st.tabs([
        "๐ Data Structure",
        "๐ Target Variable Distribution",
        "๐ ML Integration"
    ])
    with tab1:
        self._render_modeling_structure_tab(modeling_data)
    with tab2:
        self._render_target_distribution_tab(modeling_data)
    with tab3:
        self._render_ml_integration_tab()

    # NOTE(review): the indentation of the source was lost, so it is not
    # certain whether the final message and navigation lived inside the last
    # tab or after the tabs; they are rendered after the tabs here - confirm.

    # Final information
    st.markdown("---")
    st.success("""
๐ Congratulations! You have successfully prepared data for machine learning.
**Next Steps:**
1. Use code above for integration with chosen ML library
2. Experiment with various models
3. Optimise hyperparameters
4. Evaluate results on test set
""")

    # Navigation
    col1, col2 = st.columns([1, 1])
    with col1:
        if st.button("โฌ ๏ธ Back to Visualisations", width='stretch'):
            st.session_state.current_step = 6
            st.rerun()
    with col2:
        if st.button("๐ Run New Pipeline", type="primary", width='stretch'):
            # Reset state (same keys, values and order as before)
            reset_values = {
                'pipeline_completed': False,
                'processed_data': None,
                'modeling_data': None,
                'current_step': 1,
                'uploaded_file': None,
                'plots_path': None,
                'available_plots': {},
                'synthetic_data_generated': False,
                'auto_pipeline_ready': False,
                'quick_test_mode': False,
            }
            for state_key, state_value in reset_values.items():
                st.session_state[state_key] = state_value
            st.rerun()


def _modeling_entry(self, modeling_data, key):
    """Return ``modeling_data[key]``, or None when the key is absent."""
    return modeling_data[key] if key in modeling_data else None


def _render_modeling_structure_tab(self, modeling_data):
    """Render the split overview table and a 10-row training sample."""
    st.subheader("Modeling Data Structure")

    # Information table: one row per split whose feature matrix is present.
    splits = (
        ('Training', 'X_train', 'y_train'),
        ('Validation', 'X_val', 'y_val'),
        ('Test', 'X_test', 'y_test'),
    )
    data_info = []
    for dataset, features_key, target_key in splits:
        features = self._modeling_entry(modeling_data, features_key)
        if features is None:
            continue
        has_target = self._modeling_entry(modeling_data, target_key) is not None
        data_info.append({
            'Dataset': dataset,
            'Samples': features.shape[0],
            'Features': features.shape[1],
            'Target Variable': 'Yes' if has_target else 'No'
        })
    if data_info:
        st.table(pd.DataFrame(data_info))
    else:
        st.info("Modeling data not available")

    # Data sample
    st.subheader("Training Data Sample")
    X_train = self._modeling_entry(modeling_data, 'X_train')
    y_train = self._modeling_entry(modeling_data, 'y_train')
    if X_train is not None and y_train is not None:
        # assumes X_train is a DataFrame and y_train a Series (head/rename
        # are called on them) - TODO confirm against the pipeline output
        sample_data = pd.concat([
            X_train.head(10),
            y_train.head(10).rename('target')
        ], axis=1)
        st.dataframe(sample_data, width='stretch')


def _render_target_distribution_tab(self, modeling_data):
    """Render a histogram and summary statistics of the training target."""
    st.subheader("Target Variable Distribution")
    y_train = self._modeling_entry(modeling_data, 'y_train')
    if y_train is None:
        st.info("Target variable not available")
        return

    # Target variable histogram
    fig = px.histogram(
        x=y_train,
        nbins=50,
        title="Target Variable Distribution (Training Set)",
        labels={'x': 'Target Variable', 'y': 'Frequency'},
        color_discrete_sequence=['#00CC96']
    )
    st.plotly_chart(fig, width='stretch')

    # Statistics
    statistics = (
        ("Mean", y_train.mean()),
        ("Standard Deviation", y_train.std()),
        ("Minimum", y_train.min()),
        ("Maximum", y_train.max()),
    )
    for column, (label, value) in zip(st.columns(4), statistics):
        column.metric(label, f"{value:.2f}")


def _render_ml_integration_tab(self):
    """Render the library selector, the matching code example and copy button."""
    st.subheader("Machine Learning Library Integration")
    st.info("""
Your data is ready for use with any Python ML libraries.
Below are code examples for various libraries.
""")

    # Library selection
    ml_library = st.selectbox(
        "Select ML Library",
        options=["Scikit-learn", "XGBoost", "LightGBM", "CatBoost", "PyTorch", "TensorFlow"]
    )

    # Display code
    code = self._ml_code_example(ml_library)
    st.code(code, language='python')

    # Copy code button. Only the import is guarded by ImportError so that
    # unrelated failures are not misreported as a missing package.
    # NOTE(review): pyperclip writes to the clipboard of the machine running
    # this script, which may not be the viewer's machine on a hosted
    # deployment - confirm this is the intended behaviour.
    try:
        import pyperclip
    except ImportError:
        pyperclip = None
        st.warning("To copy code, install pyperclip library: pip install pyperclip")

    if pyperclip is not None and st.button("๐ Copy Code", width='stretch'):
        try:
            pyperclip.copy(code)
        except Exception:
            # Best effort: e.g. no clipboard mechanism available on the host.
            st.warning("Failed to copy code. Copy manually.")
        else:
            st.success("Code copied to clipboard!")


def _ml_code_example(self, ml_library):
    """Return example Python code for ``ml_library``.

    Dedicated examples exist for "Scikit-learn", "XGBoost" and "LightGBM";
    any other name gets a generic template that mentions the library name.
    """
    examples = {
        "Scikit-learn": """# Example usage with Scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Use prepared data
X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']
# Create and train model
model = RandomForestRegressor(
n_estimators=100,
max_depth=10,
random_state=42
)
model.fit(X_train, y_train)
# Predictions and evaluation
y_pred = model.predict(X_val)
print(f"RMSE: {np.sqrt(mean_squared_error(y_val, y_pred)):.4f}")
print(f"Rยฒ Score: {r2_score(y_val, y_pred):.4f}")
print(f"Feature Importance: {model.feature_importances_}")""",
        # The tree count is derived from best_iteration (0-based) because
        # Booster.best_ntree_limit is not available in newer XGBoost releases.
        "XGBoost": """# Example usage with XGBoost
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
# Prepare data in DMatrix format
dtrain = xgb.DMatrix(modeling_data['X_train'], label=modeling_data['y_train'])
dval = xgb.DMatrix(modeling_data['X_val'], label=modeling_data['y_val'])
# Model parameters
params = {
'objective': 'reg:squarederror',
'max_depth': 6,
'learning_rate': 0.1,
'subsample': 0.8,
'colsample_bytree': 0.8,
'seed': 42
}
# Train model
model = xgb.train(
params,
dtrain,
num_boost_round=100,
evals=[(dval, 'validation')],
early_stopping_rounds=10,
verbose_eval=False
)
# Predictions
y_pred = model.predict(dval)
print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Number of Trees: {model.best_iteration + 1}")""",
        "LightGBM": """# Example usage with LightGBM
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np
# Prepare data
train_data = lgb.Dataset(
modeling_data['X_train'],
label=modeling_data['y_train']
)
val_data = lgb.Dataset(
modeling_data['X_val'],
label=modeling_data['y_val'],
reference=train_data
)
# Model parameters
params = {
'objective': 'regression',
'metric': 'rmse',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
}
# Train model
model = lgb.train(
params,
train_data,
valid_sets=[val_data],
num_boost_round=100,
callbacks=[lgb.early_stopping(10)]
)
# Predictions
y_pred = model.predict(modeling_data['X_val'])
print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Best Iteration: {model.best_iteration}")""",
    }
    if ml_library in examples:
        return examples[ml_library]
    return f"""# Template for {ml_library}
# Your data available in modeling_data variable
X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']
X_test = modeling_data['X_test']
y_test = modeling_data['y_test']
# Code for {ml_library}...
print(f"Data sizes:")
print(f" X_train: {{X_train.shape}}")
print(f" y_train: {{y_train.shape}}")
print(f" X_val: {{X_val.shape}}")
print(f" X_test: {{X_test.shape}}")"""
def render_footer(self):
    """Render the page footer: a divider, then version, contact and copyright columns."""
    st.markdown("---")
    version_col, contact_col, rights_col = st.columns(3)

    # Left: product name/version with a short release note underneath
    version_col.markdown("**TimeFlowPro** v1.1.0")
    version_col.caption("Added synthetic data generation")

    # Middle: contact line
    contact_col.markdown("๐ง Contacts: cool.araby@gmail.com")

    # Right: copyright line
    rights_col.markdown("ยฉ 2026 All Rights Reserved")
def run(self):
    """Render one full page: title, sidebar, the active step's view, footer."""
    st.title("๐ TimeFlow Pro - Data Analysis and Preprocessing")
    st.markdown("---")

    self.create_sidebar()

    # Step number -> name of the method rendering that step. Names are
    # resolved lazily so only the selected renderer is looked up.
    step_renderers = {
        1: "render_step_1_data_loading",
        2: "render_step_2_configuration",
        3: "render_step_3_data_analysis",
        4: "render_step_4_pipeline_execution",
        5: "render_step_5_results",
        6: "render_step_6_visualisations",
        7: "render_step_7_modeling",
    }
    renderer_name = step_renderers.get(st.session_state.current_step)
    if renderer_name is not None:
        getattr(self, renderer_name)()
    # Any other step value renders no main content, only the footer.

    self.render_footer()
| # ============================================ | |
| # APPLICATION LAUNCH | |
| # ============================================ | |
if __name__ == "__main__":
    # Build the application object and render the page for this script run.
    StreamlitApp().run()