# TimeFlowPro / app.py
# Author: ArabovMK — "Update app, run_pipeline." (commit d0a9de5)
# ============================================
# TimeFlow Pro - Data Analysis and Preprocessing
# ============================================
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import streamlit as st
import pandas as pd
import numpy as np
import os
import sys
import glob
import re
from datetime import datetime, timedelta
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from PIL import Image
import matplotlib.pyplot as plt
import warnings
from pipeline.main_pipeline import EnhancedDataPreprocessingPipeline
warnings.filterwarnings('ignore')
# Add project path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config.config import Config
from data_loader.data_loader import DataLoader
from visualization.visualization_manager import VisualisationManager
# ============================================
# PAGE CONFIGURATION
# ============================================
# Configure the Streamlit page; must run before any other st.* call renders.
st.set_page_config(
    page_title="TimeFlow Pro - Data Analysis and Preprocessing",
    page_icon="๐Ÿ“Š",
    layout="wide",
    initial_sidebar_state="expanded"
)
# ============================================
# STATE MANAGEMENT CLASS
# ============================================
class StreamlitApp:
"""Main Streamlit application class"""
def __init__(self):
    """Initialise session state and reset the per-run handles."""
    self.init_session_state()
    # Populated later, once the pipeline is configured and executed.
    for handle in ('config', 'pipeline', 'data'):
        setattr(self, handle, None)
def init_session_state(self):
    """Seed st.session_state with defaults for every key the app uses.

    Existing keys are left untouched so values survive Streamlit reruns.
    """
    defaults = {
        'pipeline_completed': False,
        'processed_data': None,
        'modeling_data': None,
        'current_step': 1,
        'uploaded_file': None,
        'plots_path': None,
        'available_plots': {},
        'synthetic_data_generated': False,
        'auto_pipeline_ready': False,
        'quick_test_mode': False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
    # Built lazily: only compute the default config when it is missing.
    if 'config_params' not in st.session_state:
        st.session_state.config_params = self.get_default_config()
def get_default_config(self):
    """Return the default pipeline configuration as a plain dict."""
    defaults = dict(
        # Data source and output
        data_path='',
        results_dir='streamlit_results',
        target_column='',
        # Time window
        start_year=1970,
        end_year=1990,
        # Feature engineering
        max_lags=5,
        seasonal_period=365,
        rolling_windows=[7, 30, 90],
        expanding_windows=[30, 90],
        # Train / validation / test splitting
        test_size=0.2,
        validation_size=0.1,
        # Preprocessing
        scaling_method='robust',
        feature_selection_method='correlation',
        max_features=20,
        missing_threshold=0.3,
        outlier_method='iqr',
        enable_validation=True,
        split_method='time_based',
    )
    return defaults
def create_sidebar(self):
    """Render the sidebar: step navigation, quick-test launcher and info."""
    with st.sidebar:
        st.title("๐ŸŽฏ TimeFlowPro")
        st.markdown("---")
        # Navigation: one button per wizard step; the active step is primary.
        st.subheader("Navigation")
        steps = [
            (1, "๐Ÿ“ Data Loading"),
            (2, "โš™๏ธ Configuration"),
            (3, "๐Ÿ” Data Analysis"),
            (4, "โšก Pipeline Execution"),
            (5, "๐Ÿ“Š Results"),
            (6, "๐Ÿ“ˆ Visualisations"),
            (7, "๐Ÿค– Modelling"),
        ]
        for idx, label in steps:
            is_active = st.session_state.current_step == idx
            if st.button(
                f"{label}",
                key=f"nav_{idx}",
                type="primary" if is_active else "secondary",
                width='stretch'
            ):
                st.session_state.current_step = idx
                st.rerun()
        st.markdown("---")
        # Quick start with synthetic data
        st.subheader("โšก Quick Test")
        if st.button("๐Ÿš€ Quick Start with Synthetic Data",
                     type="primary",
                     width='stretch',
                     help="Generate synthetic data and run pipeline immediately"):
            st.session_state.quick_test_mode = True
            st.session_state.current_step = 1
            st.rerun()
        st.markdown("---")
        # Project information
        st.subheader("๐Ÿ“ˆ About the Project")
        st.info("""
        TimeFlow Pro - Data Analysis and Preprocessing.
        **New Features:**
        - Synthetic data generation for testing
        - Automatic pipeline execution
        - Quick testing without file upload
        **Standard Features:**
        - Missing data analysis and processing
        - Outlier detection
        - Feature engineering
        - Stationarity analysis
        - Data scaling
        - Feature selection
        """)
        # Progress indicator
        if st.session_state.pipeline_completed:
            st.success("โœ… Pipeline completed")
        else:
            st.warning("โš ๏ธ Pipeline not started")
        # Quick test indicator
        if st.session_state.quick_test_mode:
            st.info("โšก Quick test mode active")
def generate_synthetic_data(self, n_days=1095, include_seasonality=True, include_trend=True,
                            include_noise=True, include_exogenous=True, data_type="complex",
                            random_state=None):
    """
    Generate synthetic daily time-series data for testing.

    Args:
        n_days (int): Number of days of data (capped per ``data_type``)
        include_seasonality (bool): Include seasonality
        include_trend (bool): Include trend
        include_noise (bool): Include noise
        include_exogenous (bool): Include exogenous variables
        data_type (str): Data type (simple, medium, complex)
        random_state (int | None): Optional seed for reproducible output.
            ``None`` (the default) keeps the original non-deterministic
            behaviour, so existing callers are unaffected.

    Returns:
        pd.DataFrame | None: Generated synthetic data, or None on error.
    """
    try:
        # Seed the global RNG only when explicitly requested.
        if random_state is not None:
            np.random.seed(random_state)
        # Base parameters depending on data type
        if data_type == "simple":
            n_days = min(n_days, 365)  # Limit for simple type
            trend_strength = 0.005
            noise_std = 2
            include_exogenous = False
        elif data_type == "medium":
            n_days = min(n_days, 730)  # Limit for medium type
            trend_strength = 0.01
            noise_std = 5
            include_exogenous = True
        else:  # complex
            n_days = min(n_days, 1095)  # Limit for complex type
            trend_strength = 0.02
            noise_std = 10
            include_exogenous = True
        # Create dates (relative to "now", not a fixed calendar year)
        start_date = datetime.now() - timedelta(days=n_days)
        dates = pd.date_range(start=start_date, periods=n_days, freq='D')
        # Base trend
        if include_trend:
            trend = np.linspace(0, trend_strength * n_days, n_days)
        else:
            trend = np.zeros(n_days)
        # Seasonality: annual + quarterly + monthly + weekly sine components
        if include_seasonality:
            seasonal = 10 * np.sin(2 * np.pi * np.arange(n_days) / 365)
            seasonal += 5 * np.sin(2 * np.pi * np.arange(n_days) / 90)
            seasonal += 3 * np.sin(2 * np.pi * np.arange(n_days) / 30)
            seasonal += 2 * np.sin(2 * np.pi * np.arange(n_days) / 7)
        else:
            seasonal = np.zeros(n_days)
        # Main target variable (water consumption)
        base_value = 100
        raskhodvoda = base_value + trend + seasonal
        # Add noise
        if include_noise:
            noise = np.random.normal(0, noise_std, n_days)
            raskhodvoda += noise
        # Create DataFrame
        data = pd.DataFrame({
            'date': dates,
            'raskhodvoda': raskhodvoda
        })
        # Add exogenous variables
        if include_exogenous:
            # Temperature (seasonal)
            data['temperature'] = 15 + 10 * np.sin(2 * np.pi * np.arange(n_days) / 365) + np.random.normal(0, 3, n_days)
            # Precipitation (random spikes plus a seasonal component, clipped at 0)
            precipitation = np.random.exponential(2, n_days)
            precipitation_seasonality = 5 * np.sin(2 * np.pi * np.arange(n_days) / 365 + np.pi/2)
            data['precipitation'] = np.maximum(0, precipitation + precipitation_seasonality)
            # Pressure
            data['pressure'] = 760 + np.random.normal(0, 5, n_days)
            # Humidity
            data['humidity'] = 60 + 20 * np.sin(2 * np.pi * np.arange(n_days) / 180) + np.random.normal(0, 10, n_days)
            # Electricity consumption (correlated with target variable)
            data['electricity_consumption'] = raskhodvoda * 0.8 + np.random.normal(0, 5, n_days)
            # Day of week (categorical variable)
            data['day_of_week'] = dates.dayofweek
            data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)
            # Holidays (random, ~5% of days)
            holidays = np.random.choice([0, 1], size=n_days, p=[0.95, 0.05])
            data['is_holiday'] = holidays
            # Lag variables
            for lag in [1, 7, 30]:
                data[f'raskhodvoda_lag_{lag}'] = data['raskhodvoda'].shift(lag)
            # Moving averages
            for window in [7, 30]:
                data[f'raskhodvoda_ma_{window}'] = data['raskhodvoda'].rolling(window=window).mean()
        # Add missing values for realism (5% random missing values per column)
        for col in data.columns:
            if col != 'date':  # Don't add missing values to dates
                mask = np.random.random(len(data)) < 0.05
                data.loc[mask, col] = np.nan
        # Add outliers (1% of data): shift the value 5 standard deviations
        # away from the column mean. The column mean/std are computed once
        # per column (the original recomputed them for every outlier index,
        # which was O(n) per cell and let injected outliers skew later ones).
        numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
        for col in numeric_cols:
            outlier_mask = np.random.random(len(data)) < 0.01
            if outlier_mask.any():
                mean_val = data[col].mean(skipna=True)
                std_val = data[col].std(skipna=True)
                if not np.isnan(mean_val) and not np.isnan(std_val) and std_val > 0:
                    for idx in data.index[outlier_mask]:
                        data.at[idx, col] = mean_val + 5 * std_val * np.random.choice([-1, 1])
        # Reset index
        data.reset_index(drop=True, inplace=True)
        st.session_state.synthetic_data_generated = True
        return data
    except Exception as e:
        st.error(f"Error generating synthetic data: {str(e)}")
        import traceback
        st.error(f"Error traceback: {traceback.format_exc()}")
        return None
def quick_test_pipeline(self):
    """Quick pipeline execution with synthetic data.

    Generates a medium-complexity synthetic dataset, writes it to a
    temporary CSV, builds a ``Config``/pipeline from the session
    parameters, runs the full preprocessing pipeline, stores the results
    in ``st.session_state`` and jumps to the Results step on success.
    """
    with st.spinner("๐Ÿš€ Running quick test with synthetic data..."):
        try:
            # Step 1: Generate synthetic data
            st.info("Step 1: Generating synthetic data...")
            synthetic_data = self.generate_synthetic_data(
                n_days=365,  # Reduced for speed
                include_seasonality=True,
                include_trend=True,
                include_noise=True,
                include_exogenous=True,
                data_type="medium"  # Changed to medium for balance between speed and quality
            )
            if synthetic_data is None:
                st.error("Failed to generate synthetic data")
                return
            # Save data to temporary file so the pipeline can read it from disk
            temp_file = "temp_synthetic_data.csv"
            synthetic_data.to_csv(temp_file, index=False)
            # Step 2: Configure settings
            st.info("Step 2: Configuring settings...")
            config_params = st.session_state.config_params.copy()
            # NOTE(review): start_year/end_year are hard-coded to 2020-2023,
            # but the synthetic dates are generated relative to "now" — confirm
            # the pipeline does not filter all rows out once dates drift
            # outside that window.
            config_params.update({
                'data_path': temp_file,
                'target_column': 'raskhodvoda',
                'start_year': 2020,
                'end_year': 2023,
                'max_lags': 7,
                'seasonal_period': 365,
                'rolling_windows': [7, 30],
                'expanding_windows': [30],
                'test_size': 0.2,
                'validation_size': 0.1,
                'scaling_method': 'robust',
                'feature_selection_method': 'correlation',
                'max_features': 10,  # Reduced for speed
                'missing_threshold': 0.3,
                'outlier_method': 'iqr',
                'enable_validation': True,
                'split_method': 'time_based'
            })
            # Step 3: Create and run pipeline
            st.info("Step 3: Creating and running pipeline...")
            # Create progress bar
            progress_bar = st.progress(0)
            status_text = st.empty()
            # Update session state so later wizard steps see this data/config
            st.session_state.config_params = config_params
            st.session_state.uploaded_file = temp_file
            st.session_state.data_preview = synthetic_data
            # Create configuration
            status_text.text("Creating configuration...")
            progress_bar.progress(20)
            config = Config(**config_params)
            # Create pipeline
            status_text.text("Initialising pipeline...")
            progress_bar.progress(40)
            self.pipeline = EnhancedDataPreprocessingPipeline(config)
            # Run pipeline
            status_text.text("Running preprocessing pipeline...")
            progress_bar.progress(60)
            processed_data = self.pipeline.run_full_pipeline(
                use_synthetic=False,  # Synthetic data already loaded
                save_intermediate=True,
                create_reports=True
            )
            # Update progress
            if processed_data is not None:
                status_text.text("Getting data for modelling...")
                progress_bar.progress(80)
                modeling_data = self.pipeline.get_final_data_for_modelling()
                # Save to session state
                st.session_state.processed_data = processed_data
                st.session_state.modeling_data = modeling_data
                st.session_state.pipeline_completed = True
                st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                st.session_state.auto_pipeline_ready = True
                # Collect information about available plots
                # (method defined elsewhere in this class)
                self.collect_available_plots()
                # Completion
                status_text.text("Completing...")
                progress_bar.progress(100)
                st.success("โœ… Quick test completed successfully!")
                # Show results
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Records generated", f"{synthetic_data.shape[0]:,}")
                with col2:
                    st.metric("Processed data", f"{processed_data.shape[0]:,} rows")
                with col3:
                    st.metric("Final features", f"{processed_data.shape[1]} columns")
                # Automatic transition to results
                st.session_state.current_step = 5
                st.rerun()
            else:
                st.error("โŒ Error running pipeline")
                st.error("Check logs for more information")
        except Exception as e:
            st.error(f"โŒ Error during quick test: {str(e)}")
            import traceback
            st.error(f"Error traceback: {traceback.format_exc()}")
def render_step_1_data_loading(self):
    """Step 1: Data Loading.

    Left column: upload a CSV/Excel/Parquet file, preview it and
    auto-select a target column. Right column: generate synthetic demo
    data instead. Quick-test mode bypasses both and runs the pipeline.
    """
    st.header("๐Ÿ“ Data Loading")
    # Check quick test mode
    if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
        st.info("โšก Quick test mode activated. Generating synthetic data and running pipeline...")
        self.quick_test_pipeline()
        return
    col1, col2 = st.columns([2, 1])
    with col1:
        # File upload
        uploaded_file = st.file_uploader(
            "Upload CSV file with data",
            type=['csv', 'xlsx', 'parquet'],
            help="Supported formats: CSV, Excel, Parquet"
        )
        if uploaded_file is not None:
            # Save file temporarily (extension taken from the uploaded name)
            file_path = f"temp_data.{uploaded_file.name.split('.')[-1]}"
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            st.session_state.uploaded_file = file_path
            st.session_state.config_params['data_path'] = file_path
            # Load and preview data
            try:
                if file_path.endswith('.csv'):
                    data = pd.read_csv(file_path)
                elif file_path.endswith('.xlsx'):
                    data = pd.read_excel(file_path)
                elif file_path.endswith('.parquet'):
                    data = pd.read_parquet(file_path)
                else:
                    st.error("Unsupported file format")
                    return
                st.session_state.data_preview = data
                # Data preview
                st.subheader("Data Preview")
                st.dataframe(data.head(50), width='stretch')
                # Basic information
                st.subheader("๐Ÿ“‹ Data Information")
                info_col1, info_col2, info_col3 = st.columns(3)
                with info_col1:
                    st.metric("Rows", data.shape[0])
                    st.metric("Columns", data.shape[1])
                with info_col2:
                    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
                    st.metric("Numeric columns", len(numeric_cols))
                    categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
                    st.metric("Categorical columns", len(categorical_cols))
                with info_col3:
                    total_missing = data.isnull().sum().sum()
                    missing_percentage = (total_missing / (data.shape[0] * data.shape[1])) * 100
                    st.metric("Missing values", f"{total_missing:,}")
                    st.metric("Missing percentage", f"{missing_percentage:.2f}%")
                # Automatic target column selection if not set
                if 'target_column' not in st.session_state.config_params or not st.session_state.config_params['target_column']:
                    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
                    if numeric_columns:
                        # Automatically select column with typical name
                        target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'ั€ะฐัั…ะพะด']
                        selected_target = None
                        for col in numeric_columns:
                            if any(keyword in col.lower() for keyword in target_keywords):
                                selected_target = col
                                break
                        # If not found by keywords, take last numeric column
                        if not selected_target and numeric_columns:
                            selected_target = numeric_columns[-1]
                        if selected_target:
                            st.session_state.config_params['target_column'] = selected_target
                            st.info(f"Target variable automatically selected: **{selected_target}**")
                            st.info("You can change it in the next step")
                # Button to proceed to next step
                if st.button("โžก๏ธ Go to Configuration", type="primary", width='stretch'):
                    st.session_state.current_step = 2
                    st.rerun()
            except Exception as e:
                st.error(f"Error loading data: {str(e)}")
    with col2:
        # Demo data
        st.subheader("๐ŸŽฎ Demo Mode")
        demo_option = st.radio(
            "Choose demo data:",
            ["Synthetic Data", "Time Series Example"]
        )
        # Synthetic data settings
        with st.expander("โš™๏ธ Synthetic Data Settings", expanded=False):
            data_type = st.selectbox(
                "Data Type",
                options=["Simple", "Medium", "Complex"],
                index=1,
                help="Simple: 1 year, few features\nMedium: 2 years, main features\nComplex: 3 years, all features"
            )
            n_days = st.slider(
                "Number of days",
                min_value=90,
                max_value=1825,
                value=1095,
                step=30,
                help="Number of days in synthetic data"
            )
            include_trend = st.checkbox("Include trend", value=True)
            include_seasonality = st.checkbox("Include seasonality", value=True)
            include_noise = st.checkbox("Include noise", value=True)
            include_exogenous = st.checkbox("Include additional features", value=True)
        if st.button("Generate and Load Synthetic Data", width='stretch'):
            with st.spinner("Creating synthetic data..."):
                try:
                    # Data type mapping (UI label -> generator keyword)
                    data_type_map = {
                        "Simple": "simple",
                        "Medium": "medium",
                        "Complex": "complex"
                    }
                    # Generate synthetic data
                    synthetic_data = self.generate_synthetic_data(
                        n_days=n_days,
                        include_seasonality=include_seasonality,
                        include_trend=include_trend,
                        include_noise=include_noise,
                        include_exogenous=include_exogenous,
                        data_type=data_type_map[data_type]
                    )
                    if synthetic_data is not None:
                        st.session_state.data_preview = synthetic_data
                        st.session_state.uploaded_file = "synthetic_data"
                        st.session_state.config_params['data_path'] = 'synthetic_data'
                        # Automatically select target variable
                        if 'raskhodvoda' in synthetic_data.columns:
                            st.session_state.config_params['target_column'] = 'raskhodvoda'
                        st.success(f"โœ… Synthetic data created: {synthetic_data.shape[0]} rows, {synthetic_data.shape[1]} columns")
                        # Show preview
                        st.subheader("Synthetic Data Preview")
                        st.dataframe(synthetic_data.head(20), width='stretch')
                        # Statistics
                        st.subheader("๐Ÿ“Š Synthetic Data Statistics")
                        stat_col1, stat_col2 = st.columns(2)
                        with stat_col1:
                            st.metric("Period", f"{synthetic_data.shape[0]} days")
                            # CORRECTION: convert dates to strings for display
                            if 'date' in synthetic_data.columns:
                                min_date = synthetic_data['date'].min()
                                max_date = synthetic_data['date'].max()
                                if isinstance(min_date, (pd.Timestamp, datetime)):
                                    st.text(f"Start: {min_date.strftime('%Y-%m-%d')}")
                                else:
                                    st.text(f"Start: {str(min_date)}")
                                if isinstance(max_date, (pd.Timestamp, datetime)):
                                    st.text(f"End: {max_date.strftime('%Y-%m-%d')}")
                                else:
                                    st.text(f"End: {str(max_date)}")
                        with stat_col2:
                            if 'raskhodvoda' in synthetic_data.columns:
                                st.metric("Average consumption", f"{synthetic_data['raskhodvoda'].mean():.2f}")
                                st.metric("Max consumption", f"{synthetic_data['raskhodvoda'].max():.2f}")
                                st.metric("Min consumption", f"{synthetic_data['raskhodvoda'].min():.2f}")
                        # Quick pipeline execution
                        st.markdown("---")
                        # NOTE(review): a button rendered inside another
                        # button's branch does not survive Streamlit's rerun,
                        # and the unconditional st.rerun() below discards this
                        # preview immediately — confirm the intended UX.
                        if st.button("๐Ÿš€ Quick Run Pipeline with This Data", type="primary", width='stretch'):
                            st.session_state.quick_test_mode = True
                            st.session_state.auto_pipeline_ready = False
                            st.rerun()
                        st.rerun()
                    else:
                        st.error("Failed to generate synthetic data")
                except Exception as e:
                    st.error(f"Error creating synthetic data: {str(e)}")
        st.markdown("---")
        # Instructions
        st.subheader("๐Ÿ“– Instructions")
        st.markdown("""
        1. Upload CSV file with data **OR**
        2. Generate synthetic data for testing
        3. Check data preview
        4. Target variable will be selected automatically
        5. Go to configuration to specify parameters
        **Data Requirements:**
        - Date in separate column or index
        - Clean column names
        - Time series with regular intervals
        """)
def render_step_2_configuration(self):
    """Step 2: Pipeline Configuration.

    Renders four tabs of widgets bound to ``st.session_state.config_params``
    (basic parameters / data processing / features / temporal parameters)
    plus navigation buttons.
    """
    st.header("โš™๏ธ Pipeline Configuration")
    # Automatic configuration for synthetic data
    if st.session_state.uploaded_file == "synthetic_data" or st.session_state.config_params['data_path'] == 'synthetic_data':
        st.info("โšก Synthetic data detected. Optimised configuration applied.")
        # Automatic parameter setup for synthetic data
        if st.button("Apply Recommended Settings for Synthetic Data", width='stretch'):
            st.session_state.config_params.update({
                'target_column': 'raskhodvoda',
                'max_lags': 7,
                'seasonal_period': 365,
                'rolling_windows': [7, 30, 90],
                'expanding_windows': [30, 90],
                'test_size': 0.2,
                'validation_size': 0.1,
                'scaling_method': 'robust',
                'feature_selection_method': 'correlation',
                'max_features': 15,
                'missing_threshold': 0.3,
                'outlier_method': 'iqr',
                'enable_validation': True
            })
            st.success("Settings applied!")
            st.rerun()
    # Configuration sections
    tab1, tab2, tab3, tab4 = st.tabs([
        "๐Ÿ“Š Basic Parameters",
        "๐Ÿ”ง Data Processing",
        "๐ŸŽฏ Features and Selection",
        "๐Ÿ“ˆ Temporal Parameters"
    ])
    with tab1:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Basic Parameters")
            st.session_state.config_params['results_dir'] = st.text_input(
                "Results Directory",
                value=st.session_state.config_params['results_dir']
            )
            # Target variable: selectbox when data is loaded, free text otherwise
            if hasattr(st.session_state, 'data_preview') and st.session_state.data_preview is not None:
                # Get all data columns
                all_columns = st.session_state.data_preview.columns.tolist()
                # If target variable already set and present in data, use it
                current_target = st.session_state.config_params.get('target_column', '')
                default_index = 0
                if current_target in all_columns:
                    default_index = all_columns.index(current_target)
                elif len(all_columns) > 0:
                    # Try to find suitable default column
                    numeric_columns = st.session_state.data_preview.select_dtypes(include=[np.number]).columns.tolist()
                    if numeric_columns:
                        # Look for columns with typical target variable names
                        target_keywords = ['target', 'y', 'value', 'price', 'sales', 'demand', 'raskhod', 'ั€ะฐัั…ะพะด']
                        for i, col in enumerate(all_columns):
                            if any(keyword in col.lower() for keyword in target_keywords):
                                default_index = i
                                break
                        # If not found by keywords, take first numeric column
                        if default_index == 0 and numeric_columns[0] in all_columns:
                            default_index = all_columns.index(numeric_columns[0])
                st.session_state.config_params['target_column'] = st.selectbox(
                    "Select Target Variable",
                    options=all_columns,
                    index=default_index,
                    help="Select column to be predicted"
                )
            else:
                # If data not loaded, keep text field
                st.session_state.config_params['target_column'] = st.text_input(
                    "Target Variable",
                    value=st.session_state.config_params.get('target_column', ''),
                    help="Enter target column name"
                )
            st.session_state.config_params['enable_validation'] = st.checkbox(
                "Enable Data Validation",
                value=st.session_state.config_params['enable_validation']
            )
        with col2:
            st.subheader("Data Split")
            # Sliders operate in whole percent; stored values are fractions
            st.session_state.config_params['test_size'] = st.slider(
                "Test Set Size (%)",
                min_value=5,
                max_value=40,
                value=int(st.session_state.config_params['test_size'] * 100),
                step=5,
                format="%d%%"
            ) / 100
            st.session_state.config_params['validation_size'] = st.slider(
                "Validation Set Size (%)",
                min_value=5,
                max_value=30,
                value=int(st.session_state.config_params['validation_size'] * 100),
                step=5,
                format="%d%%"
            ) / 100
            split_methods = ['time_based', 'random']
            st.session_state.config_params['split_method'] = st.selectbox(
                "Split Method",
                options=split_methods,
                index=split_methods.index(st.session_state.config_params['split_method'])
            )
    with tab2:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Missing Value Processing")
            st.session_state.config_params['missing_threshold'] = st.slider(
                "Missing Value Column Removal Threshold",
                min_value=0.0,
                max_value=0.5,
                value=st.session_state.config_params['missing_threshold'],
                step=0.05,
                format="%.2f"
            )
            st.subheader("Outlier Processing")
            outlier_methods = ['iqr', 'zscore', 'isolation_forest']
            st.session_state.config_params['outlier_method'] = st.selectbox(
                "Outlier Detection Method",
                options=outlier_methods,
                index=outlier_methods.index(st.session_state.config_params['outlier_method'])
            )
        with col2:
            st.subheader("Data Scaling")
            scaling_methods = ['robust', 'standard', 'minmax', 'none']
            st.session_state.config_params['scaling_method'] = st.selectbox(
                "Scaling Method",
                options=scaling_methods,
                index=scaling_methods.index(st.session_state.config_params['scaling_method'])
            )
            if st.session_state.config_params['scaling_method'] == 'none':
                st.info("โš ๏ธ Data will not be scaled")
    with tab3:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Feature Engineering")
            st.session_state.config_params['max_lags'] = st.slider(
                "Maximum Number of Lags",
                min_value=1,
                max_value=20,
                value=st.session_state.config_params['max_lags'],
                step=1
            )
            rolling_windows_input = st.text_input(
                "Windows for Rolling Statistics (comma-separated)",
                value=', '.join(map(str, st.session_state.config_params['rolling_windows']))
            )
            # Parse "7, 30, 90"-style input; non-numeric tokens are dropped
            if rolling_windows_input:
                st.session_state.config_params['rolling_windows'] = [
                    int(x.strip()) for x in rolling_windows_input.split(',') if x.strip().isdigit()
                ]
        with col2:
            st.subheader("Feature Selection")
            feature_methods = ['correlation', 'variance', 'mutual_info', 'rf', 'none']
            st.session_state.config_params['feature_selection_method'] = st.selectbox(
                "Feature Selection Method",
                options=feature_methods,
                index=feature_methods.index(st.session_state.config_params['feature_selection_method'])
            )
            st.session_state.config_params['max_features'] = st.slider(
                "Maximum Number of Features",
                min_value=5,
                max_value=100,
                value=st.session_state.config_params['max_features'],
                step=5
            )
    with tab4:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Temporal Parameters")
            # If there is data for preview, derive the year range from it.
            # BUGFIX: the original left `date_col` unbound when the data used
            # a DatetimeIndex (NameError on the later `if date_col:` check)
            # and crashed when data_preview existed but was None.
            preview = getattr(st.session_state, 'data_preview', None)
            if preview is not None:
                date_col = None
                dates = None
                if 'date' in preview.columns:
                    date_col = 'date'
                elif isinstance(preview.index, pd.DatetimeIndex):
                    dates = preview.index
                else:
                    # Try to find date column
                    date_cols = [col for col in preview.columns
                                 if 'date' in col.lower() or 'time' in col.lower()]
                    date_col = date_cols[0] if date_cols else None
                if date_col:
                    if date_col in preview.columns:
                        dates = pd.to_datetime(preview[date_col])
                    else:
                        dates = preview.index
                if dates is not None and len(dates) > 0:
                    min_date = dates.min()
                    max_date = dates.max()
                    col1_date, col2_date = st.columns(2)
                    with col1_date:
                        st.session_state.config_params['start_year'] = st.number_input(
                            "Start Year",
                            min_value=1900,
                            max_value=2100,
                            value=min_date.year,
                            step=1
                        )
                    with col2_date:
                        st.session_state.config_params['end_year'] = st.number_input(
                            "End Year",
                            min_value=1900,
                            max_value=2100,
                            value=max_date.year,
                            step=1
                        )
        with col2:
            st.subheader("Seasonality")
            st.session_state.config_params['seasonal_period'] = st.selectbox(
                "Seasonal Period",
                options=[7, 30, 90, 365, 12, 24],
                index=[7, 30, 90, 365, 12, 24].index(
                    st.session_state.config_params['seasonal_period']
                ) if st.session_state.config_params['seasonal_period'] in [7, 30, 90, 365, 12, 24] else 0
            )
            expanding_windows_input = st.text_input(
                "Windows for Expanding Statistics (comma-separated)",
                value=', '.join(map(str, st.session_state.config_params['expanding_windows']))
            )
            if expanding_windows_input:
                st.session_state.config_params['expanding_windows'] = [
                    int(x.strip()) for x in expanding_windows_input.split(',') if x.strip().isdigit()
                ]
    # Navigation buttons
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("โฌ…๏ธ Back to Loading", width='stretch'):
            st.session_state.current_step = 1
            st.rerun()
    with col3:
        if st.button("Go to Analysis โžก๏ธ", type="primary", width='stretch'):
            st.session_state.current_step = 3
            st.rerun()
def render_step_3_data_analysis(self):
    """Step 3: Data Analysis.

    Shows descriptive statistics, distributions, a temporal view and a
    missing-value / outlier summary for the previewed data.
    """
    st.header("๐Ÿ” Data Analysis")
    if not hasattr(st.session_state, 'data_preview') or st.session_state.data_preview is None:
        st.warning("First load data in Step 1")
        if st.button("Return to Data Loading"):
            st.session_state.current_step = 1
            st.rerun()
        return
    data = st.session_state.data_preview
    # Computed once: shared by the Distributions, Temporal Analysis and
    # Outlier tabs (previously defined inside tab2 and reused across tabs
    # only by accident of scope).
    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    # Analysis tabs
    tab1, tab2, tab3, tab4 = st.tabs([
        "๐Ÿ“ˆ Statistics",
        "๐Ÿ” Distributions",
        "๐Ÿ“… Temporal Analysis",
        "โ“ Missing Values and Outliers"
    ])
    with tab1:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Basic Statistics")
            st.dataframe(data.describe().round(2), width='stretch')
        with col2:
            st.subheader("Data Types")
            dtype_info = pd.DataFrame({
                'Column': data.columns,
                'Type': data.dtypes.values,
                'Unique Values': [data[col].nunique() for col in data.columns]
            })
            st.dataframe(dtype_info, width='stretch')
    with tab2:
        # Select column for visualisation
        if numeric_cols:
            selected_col = st.selectbox(
                "Select Column for Analysis",
                options=numeric_cols
            )
            col1, col2 = st.columns(2)
            with col1:
                # Histogram
                fig = px.histogram(
                    data,
                    x=selected_col,
                    title=f"Distribution of {selected_col}",
                    nbins=50,
                    color_discrete_sequence=['#636EFA']
                )
                st.plotly_chart(fig, width='stretch')
            with col2:
                # Box plot
                fig = go.Figure()
                fig.add_trace(go.Box(
                    y=data[selected_col],
                    name=selected_col,
                    boxpoints='outliers',
                    marker_color='#EF553B'
                ))
                fig.update_layout(
                    title=f"Box plot {selected_col}",
                    yaxis_title=selected_col
                )
                st.plotly_chart(fig, width='stretch')
        else:
            st.warning("No numeric columns for distribution analysis")
    with tab3:
        # Time series analysis
        date_cols = [col for col in data.columns if 'date' in col.lower()]
        if date_cols or isinstance(data.index, pd.DatetimeIndex):
            if date_cols:
                date_col = date_cols[0]
                dates = pd.to_datetime(data[date_col])
            else:
                dates = data.index
                date_col = 'index'
            # Check for numeric columns
            if len(numeric_cols) > 0:
                # Select column for time series
                ts_col = st.selectbox(
                    "Select Column for Time Series",
                    options=numeric_cols
                )
                # Time series
                fig = go.Figure()
                fig.add_trace(go.Scatter(
                    x=dates,
                    y=data[ts_col],
                    mode='lines',
                    name=ts_col,
                    line=dict(color='#636EFA', width=2)
                ))
                fig.update_layout(
                    title=f"Time Series: {ts_col}",
                    xaxis_title="Date",
                    yaxis_title=ts_col,
                    hovermode='x unified'
                )
                st.plotly_chart(fig, width='stretch')
                # Seasonality (if sufficient data)
                if len(dates) > 30:
                    # Monthly trend.
                    # BUGFIX: `dates` is either a Series (month via .dt.month)
                    # or a DatetimeIndex (month via .month); the original
                    # checked hasattr(dates, 'month') but then used
                    # dates.dt.month, so the chart was silently skipped for
                    # Series and raised AttributeError for a DatetimeIndex.
                    months = dates.dt.month if hasattr(dates, 'dt') else dates.month
                    monthly_data = data.groupby(months)[ts_col].mean()
                    fig2 = px.bar(
                        x=monthly_data.index,
                        y=monthly_data.values,
                        title=f"Monthly Seasonality: {ts_col}",
                        labels={'x': 'Month', 'y': 'Average Value'}
                    )
                    st.plotly_chart(fig2, width='stretch')
            else:
                st.warning("No numeric columns for temporal analysis")
        else:
            st.info("For temporal analysis, date column or DatetimeIndex required")
    with tab4:
        col1, col2 = st.columns(2)
        with col1:
            # Missing value analysis
            st.subheader("Missing Values")
            missing_data = data.isnull().sum()
            missing_percentage = (missing_data / len(data)) * 100
            missing_df = pd.DataFrame({
                'Column': missing_data.index,
                'Missing Count': missing_data.values,
                'Missing Percentage': missing_percentage.values
            }).sort_values('Missing Count', ascending=False)
            st.dataframe(missing_df, width='stretch')
            # Missing values visualisation
            if missing_data.sum() > 0:
                fig = px.bar(
                    missing_df,
                    x='Column',
                    y='Missing Percentage',
                    title="Missing Percentage by Column",
                    color='Missing Percentage',
                    color_continuous_scale='Reds'
                )
                st.plotly_chart(fig, width='stretch')
        with col2:
            # Quick outlier analysis (IQR rule over the first 5 numeric cols)
            st.subheader("Quick Outlier Analysis")
            if len(numeric_cols) > 0:
                outlier_summary = []
                for col in numeric_cols[:5]:  # Limit to 5 columns for speed
                    q1 = data[col].quantile(0.25)
                    q3 = data[col].quantile(0.75)
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
                    outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
                    outlier_pct = (len(outliers) / len(data)) * 100
                    outlier_summary.append({
                        'Column': col,
                        'Outliers': len(outliers),
                        'Percentage': f"{outlier_pct:.2f}%"
                    })
                outlier_df = pd.DataFrame(outlier_summary)
                st.dataframe(outlier_df, width='stretch')
            else:
                st.warning("No numeric columns for outlier analysis")
    # Navigation buttons
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("โฌ…๏ธ Back to Configuration", width='stretch'):
            st.session_state.current_step = 2
            st.rerun()
    with col3:
        if st.button("Run Pipeline โžก๏ธ", type="primary", width='stretch'):
            st.session_state.current_step = 4
            st.rerun()
def render_step_4_pipeline_execution(self):
    """Step 4: Pipeline Execution.

    Checks readiness (data loaded, target selected), shows the current
    configuration and execution options, then runs the preprocessing
    pipeline with a progress bar and stores the results in session state.
    """
    st.header("⚡ Pipeline Execution")
    # Readiness check. NOTE: the original also kept a `ready_to_run` flag,
    # but it was never read — the `issues` list alone drives the UI.
    issues = []
    data_path = st.session_state.config_params['data_path']
    # 'demo' / 'synthetic_data' sources need no uploaded file.
    uses_synthetic_source = data_path in ('demo', 'synthetic_data')
    if not st.session_state.uploaded_file and not uses_synthetic_source:
        issues.append("Data not loaded")
    if not st.session_state.config_params['target_column']:
        issues.append("Target variable not selected")
    # Automatic synthetic data generation if quick test enabled
    if st.session_state.quick_test_mode and not st.session_state.auto_pipeline_ready:
        st.info("⚡ Quick test mode activated. Generating synthetic data...")
        self.quick_test_pipeline()
        return
    # Display blocking issues and offer navigation shortcuts instead of running.
    if issues:
        st.error("⚠️ Fix before running:")
        for issue in issues:
            st.write(f"- {issue}")
        # Suggest using synthetic data
        st.markdown("---")
        st.subheader("🎮 Quick Solution")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("Generate Synthetic Data", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("To Data Loading", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        col3, col4 = st.columns(2)
        with col3:
            if st.button("To Configuration", width='stretch'):
                st.session_state.current_step = 2
                st.rerun()
        return
    # Display configuration summary (read-only metrics).
    st.subheader("Execution Configuration")
    config_col1, config_col2 = st.columns(2)
    with config_col1:
        st.metric("Target Variable", st.session_state.config_params['target_column'])
        st.metric("Test Set", f"{st.session_state.config_params['test_size']*100:.0f}%")
        st.metric("Scaling Method", st.session_state.config_params['scaling_method'])
    with config_col2:
        st.metric("Max Lags", st.session_state.config_params['max_lags'])
        st.metric("Feature Selection Method", st.session_state.config_params['feature_selection_method'])
        st.metric("Validation Enabled", "Yes" if st.session_state.config_params['enable_validation'] else "No")
    # Execution options
    st.subheader("Execution Options")
    col1, col2 = st.columns(2)
    with col1:
        use_synthetic = st.checkbox(
            "Use Synthetic Data",
            # Pre-checked and locked when the data source is already synthetic.
            value=uses_synthetic_source,
            disabled=uses_synthetic_source
        )
        save_intermediate = st.checkbox(
            "Save Intermediate Results",
            value=True
        )
    with col2:
        create_reports = st.checkbox(
            "Create Reports",
            value=True
        )
        # NOTE(review): this flag is rendered but not forwarded to
        # run_full_pipeline below — presumably the pipeline always creates
        # visualisations; confirm against EnhancedDataPreprocessingPipeline.
        create_visualisations = st.checkbox(
            "Create Visualisations",
            value=True,
            help="Create data analysis plots"
        )
    # Run button
    if st.button("🚀 Run Preprocessing Pipeline", type="primary", width='stretch'):
        # Create progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()
        try:
            # Create configuration
            status_text.text("Creating configuration...")
            progress_bar.progress(10)
            config = Config(**st.session_state.config_params)
            # Create pipeline
            status_text.text("Initialising pipeline...")
            progress_bar.progress(20)
            self.pipeline = EnhancedDataPreprocessingPipeline(config)
            # Determine whether to use synthetic data
            use_synthetic_flag = use_synthetic or uses_synthetic_source
            # Run pipeline
            status_text.text("Running preprocessing pipeline...")
            progress_bar.progress(30)
            processed_data = self.pipeline.run_full_pipeline(
                use_synthetic=use_synthetic_flag,
                save_intermediate=save_intermediate,
                create_reports=create_reports
            )
            # Update progress
            if processed_data is not None:
                status_text.text("Getting data for modelling...")
                progress_bar.progress(80)
                modeling_data = self.pipeline.get_final_data_for_modelling()
                # Save to session state
                st.session_state.processed_data = processed_data
                st.session_state.modeling_data = modeling_data
                st.session_state.pipeline_completed = True
                st.session_state.plots_path = os.path.join(config.results_dir, 'plots')
                # Collect information about available plots
                self.collect_available_plots()
                # Completion
                status_text.text("Completing...")
                progress_bar.progress(100)
                st.success("✅ Pipeline completed successfully!")
                # Show results
                col1, col2, col3 = st.columns(3)
                with col1:
                    if hasattr(self.pipeline, 'results') and 'data_loading' in self.pipeline.results:
                        st.metric("Original Data", f"{self.pipeline.results['data_loading']['shape'][0]:,} rows")
                    else:
                        st.metric("Original Data", "Information unavailable")
                with col2:
                    st.metric("Processed Data", f"{processed_data.shape[0]:,} rows")
                with col3:
                    st.metric("Final Features", f"{processed_data.shape[1]} columns")
                # NOTE(review): a button nested inside another button's branch
                # is lost on rerun in Streamlit — clicking it may appear to do
                # nothing; users can still navigate via the sidebar/step 5.
                if st.button("📊 Go to Results", type="primary", width='stretch'):
                    st.session_state.current_step = 5
                    st.rerun()
            else:
                st.error("❌ Error executing pipeline")
                st.error("Check logs for more information")
        except Exception as e:
            # Reset the progress UI and surface the full traceback.
            progress_bar.progress(0)
            status_text.text("")
            st.error(f"❌ Error: {str(e)}")
            st.exception(e)
    # Back button
    if st.button("⬅️ Back to Analysis", width='stretch'):
        st.session_state.current_step = 3
        st.rerun()
def collect_available_plots(self):
    """Scan the plots directory and index generated PNG files by category.

    Walks ``st.session_state.plots_path`` (top level and subfolders),
    matches files against per-category name patterns, and stores the
    result in ``st.session_state.available_plots`` as
    ``{category: [{'path', 'name', 'rel_path', 'size'}, ...]}``.
    PNG files not claimed by any category are grouped under 'other'.
    """
    import fnmatch  # local import: only this method needs wildcard matching
    if not st.session_state.plots_path or not os.path.exists(st.session_state.plots_path):
        st.session_state.available_plots = {}
        return
    # Expected plot file names (or glob patterns) per logical category.
    plots_categories = {
        'summary': ['summary_dashboard.png'],
        'missing_values': ['missing_values_analysis.png'],
        'outliers': ['outliers_analysis.png', 'outlier_handling_results.png', 'temporal_outliers.png'],
        'stationarity': ['stationarity_*.png'],
        'data_split': ['data_split.png'],
        'scaling': ['scaling_results.png'],
        'feature_selection': ['feature_selection_*.png'],
        'correlations': ['correlation_matrix.png', 'high_correlations.png', 'target_correlations.png', 'vif_scores.png']
    }
    available_plots = {}
    for category, patterns in plots_categories.items():
        category_plots = []
        # Search for files matching each pattern
        for pattern in patterns:
            if '*' in pattern:
                # Wildcard pattern: glob the top level first...
                search_path = os.path.join(st.session_state.plots_path, pattern)
                files = glob.glob(search_path)
                # ...then walk subfolders for additional matches.
                # BUG FIX: the original substring test
                # `pattern.replace('*', '') in filename` turned
                # 'stationarity_*.png' into the literal 'stationarity_.png',
                # which can never occur inside a real name such as
                # 'stationarity_price.png'. Use proper glob matching.
                for root, dirs, filenames in os.walk(st.session_state.plots_path):
                    for filename in filenames:
                        if fnmatch.fnmatch(filename, pattern):
                            full_path = os.path.join(root, filename)
                            if full_path not in files:
                                files.append(full_path)
            else:
                # Exact file name: check the main folder first.
                file_path = os.path.join(st.session_state.plots_path, pattern)
                if os.path.exists(file_path):
                    files = [file_path]
                else:
                    # Fall back to searching subfolders.
                    files = []
                    for root, dirs, filenames in os.walk(st.session_state.plots_path):
                        for filename in filenames:
                            if filename == pattern:
                                files.append(os.path.join(root, filename))
            for file in files:
                if os.path.exists(file):
                    # Relative path is used for display in the gallery.
                    rel_path = os.path.relpath(file, st.session_state.plots_path)
                    category_plots.append({
                        'path': file,
                        'name': os.path.basename(file),
                        'rel_path': rel_path,
                        'size': os.path.getsize(file)
                    })
        if category_plots:
            available_plots[category] = category_plots
    # Collect any remaining PNG files that no category claimed.
    all_png_files = []
    for root, dirs, filenames in os.walk(st.session_state.plots_path):
        for filename in filenames:
            if filename.endswith('.png'):
                file_path = os.path.join(root, filename)
                # Skip files already assigned to a category.
                already_added = any(
                    plot['path'] == file_path
                    for plots in available_plots.values()
                    for plot in plots
                )
                if not already_added:
                    rel_path = os.path.relpath(file_path, st.session_state.plots_path)
                    all_png_files.append({
                        'path': file_path,
                        'name': filename,
                        'rel_path': rel_path,
                        'size': os.path.getsize(file_path)
                    })
    if all_png_files:
        available_plots['other'] = all_png_files
    st.session_state.available_plots = available_plots
def render_step_5_results(self):
    """Step 5: Results.

    Shows the processed data, selected features, validation summary and
    export options across four tabs. Requires a completed pipeline run
    (otherwise offers quick-start shortcuts and returns early).
    """
    st.header("📊 Pipeline Results")
    # Guard: nothing to show until the pipeline has produced data.
    if not st.session_state.pipeline_completed or st.session_state.processed_data is None:
        st.warning("Pipeline not yet run or not completed successfully")
        # Suggest using quick test
        st.markdown("---")
        st.subheader("🎮 Quick Start")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("Load Data", width='stretch'):
                st.session_state.current_step = 1
                st.rerun()
        return
    data = st.session_state.processed_data
    modeling_data = st.session_state.modeling_data
    # Results tabs
    tab1, tab2, tab3, tab4 = st.tabs([
        "📈 Data Overview",
        "📊 Feature Analysis",
        "📉 Validation",
        "💾 Export"
    ])
    with tab1:
        st.subheader("Processed Data")
        # Basic information
        info_col1, info_col2, info_col3, info_col4 = st.columns(4)
        with info_col1:
            st.metric("Total Records", f"{data.shape[0]:,}")
        with info_col2:
            st.metric("Total Features", data.shape[1])
        with info_col3:
            numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
            st.metric("Numeric Features", len(numeric_cols))
        with info_col4:
            missing_total = data.isnull().sum().sum()
            st.metric("Missing Values", missing_total)
        # Data preview (first 100 rows only, to keep rendering fast)
        st.subheader("Data Preview")
        st.dataframe(data.head(100), width='stretch')
        # Statistics
        st.subheader("Processed Data Statistics")
        st.dataframe(data.describe().round(4), width='stretch')
    with tab2:
        st.subheader("Feature Analysis")
        if modeling_data and 'feature_names' in modeling_data:
            features = modeling_data['feature_names']
            # Feature list
            st.write(f"**Selected Features:** {len(features)}")
            # Display features as cards, 4 per row
            cols_per_row = 4
            for i in range(0, len(features), cols_per_row):
                cols = st.columns(cols_per_row)
                for j in range(cols_per_row):
                    idx = i + j
                    if idx < len(features):
                        with cols[j]:
                            st.info(features[idx])
            # Feature importance (only if the selector exposes it)
            if (self.pipeline is not None and
                hasattr(self.pipeline, 'feature_selector') and
                self.pipeline.feature_selector is not None):
                # Check for feature_importances_
                if hasattr(self.pipeline.feature_selector, 'feature_importances_'):
                    importances = self.pipeline.feature_selector.feature_importances_
                    if importances is not None and len(importances) > 0:
                        # Truncate to the shorter of the two lists so the
                        # DataFrame columns have equal length.
                        importance_df = pd.DataFrame({
                            'Feature': features[:len(importances)] if len(features) >= len(importances) else features,
                            'Importance': importances[:len(features)] if len(importances) >= len(features) else importances
                        }).sort_values('Importance', ascending=False)
                        st.subheader("Feature Importance")
                        fig = px.bar(
                            importance_df.head(20),
                            x='Importance',
                            y='Feature',
                            orientation='h',
                            title="Top-20 Features by Importance",
                            color='Importance',
                            color_continuous_scale='Viridis'
                        )
                        st.plotly_chart(fig, width='stretch')
        # Correlation matrix (limited for performance)
        if data.shape[1] <= 50:  # Performance limit
            st.subheader("Correlation Matrix (first 20 features)")
            # Select only numeric columns and limit quantity
            numeric_data = data.select_dtypes(include=[np.number])
            if len(numeric_data.columns) > 20:
                numeric_data = numeric_data.iloc[:, :20]
            if not numeric_data.empty and len(numeric_data.columns) > 1:
                corr_matrix = numeric_data.corr()
                fig = go.Figure(data=go.Heatmap(
                    z=corr_matrix.values,
                    x=corr_matrix.columns,
                    y=corr_matrix.columns,
                    colorscale='RdBu',
                    zmin=-1,
                    zmax=1,
                    text=corr_matrix.round(2).values,
                    texttemplate='%{text}',
                    textfont={"size": 10}
                ))
                fig.update_layout(
                    title="Correlation Matrix",
                    width=800,
                    height=800
                )
                st.plotly_chart(fig, width='stretch')
            else:
                st.info("Insufficient data for correlation matrix")
    with tab3:
        st.subheader("Validation Results")
        # The pipeline may expose validation results under several names,
        # so probe each known location in turn.
        validation_available = False
        validation_data = None
        if self.pipeline is not None:
            # Check for results in pipeline
            if hasattr(self.pipeline, 'results'):
                # Look for validation results under different keys
                validation_keys = ['final_validation', 'validation_results', 'validation', 'validation_checks']
                for key in validation_keys:
                    if key in self.pipeline.results:
                        validation_data = self.pipeline.results[key]
                        validation_available = True
                        break
            # If not found in results, check other attributes
            if not validation_available and hasattr(self.pipeline, 'validation_report'):
                validation_data = self.pipeline.validation_report
                validation_available = True
            # Or check processing results
            if not validation_available and hasattr(self.pipeline, 'get_validation_summary'):
                try:
                    validation_data = self.pipeline.get_validation_summary()
                    validation_available = True
                except:
                    # Best-effort probe; absence of a summary is not an error.
                    pass
        # If validation results available
        if validation_available and validation_data:
            st.success("✅ Validation results available")
            # Check validation data format
            if isinstance(validation_data, dict):
                # Display as dictionary
                col1, col2 = st.columns(2)
                with col1:
                    # Status
                    status = validation_data.get('status', 'UNKNOWN')
                    if status == 'PASS':
                        st.success(f"Status: {status}")
                    elif status == 'WARNING':
                        st.warning(f"Status: {status}")
                    else:
                        st.error(f"Status: {status}")
                    # Overall score
                    score = validation_data.get('overall_score', validation_data.get('score', 0))
                    if score:
                        st.metric("Overall Score", f"{score}/100")
                with col2:
                    # Check counters: the checks dict may be nested or flat.
                    if 'checks' in validation_data:
                        checks = validation_data['checks']
                    elif 'basic_checks' in validation_data:
                        checks = validation_data['basic_checks']
                    else:
                        checks = validation_data
                    if isinstance(checks, dict):
                        passed = sum(1 for check in checks.values()
                                     if isinstance(check, dict) and check.get('passed', False))
                        total = len(checks)
                        st.metric("Checks Passed", f"{passed}/{total}")
                # Check details
                st.subheader("Check Details")
                # Determine where checks are located
                checks_to_display = None
                if 'checks' in validation_data:
                    checks_to_display = validation_data['checks']
                elif 'basic_checks' in validation_data:
                    checks_to_display = validation_data['basic_checks']
                elif any(isinstance(v, dict) and 'passed' in v for v in validation_data.values()):
                    checks_to_display = validation_data
                if checks_to_display and isinstance(checks_to_display, dict):
                    for check_name, check_info in checks_to_display.items():
                        if isinstance(check_info, dict):
                            col1, col2, col3 = st.columns([3, 1, 3])
                            with col1:
                                # Check description
                                description = check_info.get('description', check_name)
                                st.write(f"**{description}**")
                            with col2:
                                # Pass/fail status
                                if check_info.get('passed', False):
                                    st.success("✅")
                                else:
                                    st.error("❌")
                            with col3:
                                # Optional message
                                if 'message' in check_info:
                                    st.caption(check_info['message'])
                        else:
                            # Simple format: value is not a dict
                            st.write(f"**{check_name}**: {check_info}")
                else:
                    # Fall back to raw JSON display
                    st.json(validation_data)
            else:
                # If not dictionary, display as is
                st.write("Validation results:")
                st.write(validation_data)
        else:
            # If no validation results, show pipeline information
            st.info("Validation results in report format not available, but pipeline execution statistics presented below")
            # Pipeline stage statistics
            st.subheader("Pipeline Execution Statistics")
            # Stage table (static list; statuses inferred from available data)
            stages = [
                ("Data Loading", "✅ Successful" if data is not None else "❌ Error"),
                ("Missing Value Processing", "✅ Completed"),
                ("Outlier Processing", "✅ Completed"),
                ("Feature Engineering", "✅ Completed"),
                ("Scaling", "✅ Completed"),
                ("Feature Selection", "✅ Completed"),
                ("Data Split", "✅ Completed" if modeling_data else "❌ Not completed")
            ]
            for stage_name, status in stages:
                col1, col2 = st.columns([3, 1])
                with col1:
                    st.write(f"**{stage_name}**")
                with col2:
                    if "✅" in status:
                        st.success(status)
                    else:
                        st.error(status)
            # If pipeline exists, show available metrics
            if self.pipeline is not None:
                st.subheader("Data Quality Metrics")
                col1, col2, col3 = st.columns(3)
                with col1:
                    # Overall percentage of missing cells
                    if data is not None:
                        missing_pct = (data.isnull().sum().sum() / (data.shape[0] * data.shape[1])) * 100
                        st.metric("Missing Values", f"{missing_pct:.2f}%")
                with col2:
                    # Feature information
                    if data is not None:
                        numeric_cols = len(data.select_dtypes(include=[np.number]).columns)
                        st.metric("Numeric Features", numeric_cols)
                with col3:
                    # Train share of the total split
                    if modeling_data and 'X_train' in modeling_data:
                        train_size = len(modeling_data['X_train'])
                        total_size = train_size
                        if 'X_test' in modeling_data:
                            total_size += len(modeling_data['X_test'])
                        if 'X_val' in modeling_data:
                            total_size += len(modeling_data['X_val'])
                        if total_size > 0:
                            train_pct = (train_size / total_size) * 100
                            st.metric("Training Set", f"{train_pct:.1f}%")
    with tab4:
        st.subheader("Data Export")
        # Export formats
        export_format = st.radio(
            "Export Format",
            options=['CSV', 'Parquet', 'Excel'],
            horizontal=True
        )
        # Export buttons
        if data is not None:
            # Export processed data
            st.write("**Processed Data**")
            if export_format == 'CSV':
                csv = data.to_csv(index=True)
                st.download_button(
                    label="📥 Download CSV",
                    data=csv,
                    file_name="streamlit_processed_data.csv",
                    mime="text/csv",
                    width='stretch'
                )
            elif export_format == 'Parquet':
                # Parquet must be serialised into an in-memory buffer
                import io
                buffer = io.BytesIO()
                data.to_parquet(buffer)
                buffer.seek(0)
                st.download_button(
                    label="📥 Download Parquet",
                    data=buffer,
                    file_name="streamlit_processed_data.parquet",
                    mime="application/octet-stream",
                    width='stretch'
                )
            elif export_format == 'Excel':
                import io
                buffer = io.BytesIO()
                with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                    data.to_excel(writer, sheet_name='Processed_Data')
                buffer.seek(0)
                st.download_button(
                    label="📥 Download Excel",
                    data=buffer,
                    file_name="streamlit_processed_data.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    width='stretch'
                )
        # Export modeling data (train/val/test splits as CSV)
        if modeling_data:
            st.write("**Modeling Data**")
            col1, col2, col3 = st.columns(3)
            with col1:
                if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
                    # Append the target column when present
                    train_df = pd.concat([
                        modeling_data['X_train'],
                        modeling_data['y_train'].rename('target')
                    ], axis=1) if 'y_train' in modeling_data else modeling_data['X_train']
                    st.download_button(
                        label="📥 Training Set",
                        data=train_df.to_csv(),
                        file_name="train_data.csv",
                        mime="text/csv",
                        width='stretch'
                    )
            with col2:
                if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
                    val_df = pd.concat([
                        modeling_data['X_val'],
                        modeling_data['y_val'].rename('target')
                    ], axis=1) if 'y_val' in modeling_data else modeling_data['X_val']
                    st.download_button(
                        label="📥 Validation Set",
                        data=val_df.to_csv(),
                        file_name="validation_data.csv",
                        mime="text/csv",
                        width='stretch'
                    )
            with col3:
                if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
                    test_df = pd.concat([
                        modeling_data['X_test'],
                        modeling_data['y_test'].rename('target')
                    ], axis=1) if 'y_test' in modeling_data else modeling_data['X_test']
                    st.download_button(
                        label="📥 Test Set",
                        data=test_df.to_csv(),
                        file_name="test_data.csv",
                        mime="text/csv",
                        width='stretch'
                    )
    # Navigation
    st.markdown("---")
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("⬅️ Back to Pipeline", width='stretch'):
            st.session_state.current_step = 4
            st.rerun()
    with col3:
        if st.button("Go to Visualisations ➡️", type="primary", width='stretch'):
            st.session_state.current_step = 6
            st.rerun()
def render_step_6_visualisations(self):
    """Step 6: Visualisations.

    Presents the plot files collected by ``collect_available_plots`` as
    per-category tabs plus a combined grid gallery. Requires a completed
    pipeline run and a non-empty ``st.session_state.available_plots``.
    """
    st.header("📈 Pipeline Visualisations")
    # Guard: plots only exist after a pipeline run.
    if not st.session_state.pipeline_completed:
        st.warning("First run pipeline in Step 4")
        # Suggest quick test
        st.markdown("---")
        st.subheader("🎮 Quick Test")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("Run Pipeline", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()
        return
    # Check for plots
    if not st.session_state.available_plots:
        st.warning("Plots not found. Ensure pipeline was run with visualisation option enabled.")
        # Offer a manual rescan of the plots directory
        if st.button("Try to Find Plots", width='stretch'):
            self.collect_available_plots()
            st.rerun()
        return
    # Plot statistics
    total_plots = sum(len(plots) for plots in st.session_state.available_plots.values())
    st.success(f"✅ Found {total_plots} plots")
    # Plot category tabs; 'summary' is moved to the front if present.
    categories = list(st.session_state.available_plots.keys())
    if 'summary' in categories:
        categories.remove('summary')
        categories.insert(0, 'summary')
    tabs = st.tabs([cat.capitalize().replace('_', ' ') for cat in categories])
    for i, category in enumerate(categories):
        with tabs[i]:
            self.display_category_plots(category)
    # All plots in one gallery
    st.markdown("---")
    st.subheader("🖼️ All Plots Gallery")
    # Flatten (category, plot) pairs for the grid
    all_plots = []
    for category, plots in st.session_state.available_plots.items():
        for plot in plots:
            all_plots.append((category, plot))
    # Display plots in a 3-column grid
    cols_per_row = 3
    for i in range(0, len(all_plots), cols_per_row):
        cols = st.columns(cols_per_row)
        for j in range(cols_per_row):
            idx = i + j
            if idx < len(all_plots):
                category, plot_info = all_plots[idx]
                with cols[j]:
                    self.display_plot_card(plot_info, category)
def display_category_plots(self, category):
    """Render every plot belonging to *category* inside expandable panels."""
    category_plots = st.session_state.available_plots.get(category, [])
    if not category_plots:
        st.info(f"No plots in category '{category}'")
        return
    pretty_category = category.capitalize().replace('_', ' ')
    st.subheader(f"{pretty_category} ({len(category_plots)} plots)")
    # Stable alphabetical order by file name; each plot gets its own expander.
    for plot_info in sorted(category_plots, key=lambda entry: entry['name']):
        panel_title = plot_info['name'].replace('_', ' ').replace('.png', '')
        with st.expander(f"📊 {panel_title}", expanded=True):
            self.display_plot_image(plot_info)
def display_plot_card(self, plot_info, category):
    """Display a single plot as a compact gallery card.

    The card shows a thumbnail, file metadata, and Zoom/Hide buttons whose
    toggle state is persisted in ``st.session_state`` under a key derived
    from the plot's file path.
    """
    try:
        # Load image
        image = Image.open(plot_info['path'])
        # Derive a widget-safe session-state key from the file path
        # (slashes and dots are not reliable in widget keys).
        safe_key = plot_info['path'].replace('/', '_').replace('\\', '_').replace('.', '_')
        # Initialise toggle state for this plot if not present yet
        if f"show_{safe_key}" not in st.session_state:
            st.session_state[f"show_{safe_key}"] = False
        # Create card
        with st.container():
            st.markdown(f"**{plot_info['name'].replace('_', ' ').replace('.png', '')}**")
            st.image(image, width='stretch', caption=plot_info['rel_path'])
            # File information
            size_kb = plot_info['size'] / 1024
            st.caption(f"Size: {size_kb:.1f} KB | Category: {category}")
            # Zoom control buttons
            col1, col2 = st.columns(2)
            with col1:
                # Zoom button — sets the toggle; the zoomed image renders
                # further down in this same pass (no st.rerun needed).
                if st.button("🔍 Zoom", key=f"zoom_{safe_key}", width='stretch'):
                    st.session_state[f"show_{safe_key}"] = True
                    # Don't use st.rerun() here
            with col2:
                # Hide button is only offered while the zoom is shown
                if st.session_state[f"show_{safe_key}"]:
                    if st.button("✕ Hide", key=f"hide_{safe_key}", width='stretch'):
                        st.session_state[f"show_{safe_key}"] = False
                        # Don't use st.rerun() here
            # If zoom toggle is on, show the full-size image below the card
            if st.session_state[f"show_{safe_key}"]:
                st.markdown("---")
                st.subheader(f"🔍 {plot_info['name'].replace('_', ' ').replace('.png', '')}")
                st.image(image, width='stretch')
    except Exception as e:
        # Surface load failures (missing/corrupt file) with the path for debugging
        st.error(f"Error loading plot: {str(e)}")
        st.code(f"Path: {plot_info['path']}")
def display_plot_image(self, plot_info):
    """Show a single plot full-width with its file metadata and a download button."""
    plot_path = plot_info['path']
    try:
        img = Image.open(plot_path)
        # Wide column for the image, narrow one for metadata.
        image_col, meta_col = st.columns([3, 1])
        with image_col:
            st.image(img, width='stretch')
        with meta_col:
            st.metric("Size", f"{plot_info['size'] / 1024:.1f} KB")
            st.metric("Resolution", f"{img.width}×{img.height}")
            st.write(f"**Format:** {img.format}")
            # Offer the raw PNG file for download.
            with open(plot_path, 'rb') as fh:
                st.download_button(
                    label="📥 Download",
                    data=fh,
                    file_name=plot_info['name'],
                    mime="image/png",
                    width='stretch'
                )
    except Exception as exc:
        st.error(f"Error loading plot: {str(exc)}")
        st.code(f"Path: {plot_info['path']}")
def render_step_7_modeling(self):
    """Step 7: Modelling Preparation.

    Summarises the train/val/test splits, shows the target distribution,
    and generates copy-paste integration snippets for popular ML
    libraries. Requires completed pipeline with modeling data available.
    """
    st.header("🤖 Modelling Preparation")
    # Guard: modeling data only exists after a successful pipeline run.
    if not st.session_state.pipeline_completed or st.session_state.modeling_data is None:
        st.warning("First run pipeline in Step 4")
        # Suggest quick test
        st.markdown("---")
        st.subheader("🎮 Quick Test")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("🚀 Run Quick Test", type="primary", width='stretch'):
                st.session_state.quick_test_mode = True
                st.session_state.current_step = 1
                st.rerun()
        with col2:
            if st.button("Run Pipeline", width='stretch'):
                st.session_state.current_step = 4
                st.rerun()
        return
    modeling_data = st.session_state.modeling_data
    # Basic information: sizes of each split and the feature count
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
            st.metric("Training Set", f"{modeling_data['X_train'].shape[0]:,} records")
    with col2:
        if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
            st.metric("Validation Set", f"{modeling_data['X_val'].shape[0]:,} records")
    with col3:
        if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
            st.metric("Test Set", f"{modeling_data['X_test'].shape[0]:,} records")
    with col4:
        if 'feature_names' in modeling_data and modeling_data['feature_names'] is not None:
            st.metric("Number of Features", len(modeling_data['feature_names']))
    # Tabs
    tab1, tab2, tab3 = st.tabs([
        "📝 Data Structure",
        "📊 Target Variable Distribution",
        "🔗 ML Integration"
    ])
    with tab1:
        st.subheader("Modeling Data Structure")
        # Build a summary row per available split
        data_info = []
        if 'X_train' in modeling_data and modeling_data['X_train'] is not None:
            data_info.append({
                'Dataset': 'Training',
                'Samples': modeling_data['X_train'].shape[0],
                'Features': modeling_data['X_train'].shape[1],
                'Target Variable': 'Yes' if 'y_train' in modeling_data and modeling_data['y_train'] is not None else 'No'
            })
        if 'X_val' in modeling_data and modeling_data['X_val'] is not None:
            data_info.append({
                'Dataset': 'Validation',
                'Samples': modeling_data['X_val'].shape[0],
                'Features': modeling_data['X_val'].shape[1],
                'Target Variable': 'Yes' if 'y_val' in modeling_data and modeling_data['y_val'] is not None else 'No'
            })
        if 'X_test' in modeling_data and modeling_data['X_test'] is not None:
            data_info.append({
                'Dataset': 'Test',
                'Samples': modeling_data['X_test'].shape[0],
                'Features': modeling_data['X_test'].shape[1],
                'Target Variable': 'Yes' if 'y_test' in modeling_data and modeling_data['y_test'] is not None else 'No'
            })
        if data_info:
            st.table(pd.DataFrame(data_info))
        else:
            st.info("Modeling data not available")
        # First ten training rows with the target appended
        st.subheader("Training Data Sample")
        if ('X_train' in modeling_data and modeling_data['X_train'] is not None and
            'y_train' in modeling_data and modeling_data['y_train'] is not None):
            sample_data = pd.concat([
                modeling_data['X_train'].head(10),
                modeling_data['y_train'].head(10).rename('target')
            ], axis=1)
            st.dataframe(sample_data, width='stretch')
    with tab2:
        st.subheader("Target Variable Distribution")
        if 'y_train' in modeling_data and modeling_data['y_train'] is not None:
            # Target variable histogram
            fig = px.histogram(
                x=modeling_data['y_train'],
                nbins=50,
                title="Target Variable Distribution (Training Set)",
                labels={'x': 'Target Variable', 'y': 'Frequency'},
                color_discrete_sequence=['#00CC96']
            )
            st.plotly_chart(fig, width='stretch')
            # Summary statistics of the training target
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Mean", f"{modeling_data['y_train'].mean():.2f}")
            with col2:
                st.metric("Standard Deviation", f"{modeling_data['y_train'].std():.2f}")
            with col3:
                st.metric("Minimum", f"{modeling_data['y_train'].min():.2f}")
            with col4:
                st.metric("Maximum", f"{modeling_data['y_train'].max():.2f}")
        else:
            st.info("Target variable not available")
    with tab3:
        st.subheader("Machine Learning Library Integration")
        st.info("""
Your data is ready for use with any Python ML libraries.
Below are code examples for various libraries.
""")
        # Library selection
        ml_library = st.selectbox(
            "Select ML Library",
            options=["Scikit-learn", "XGBoost", "LightGBM", "CatBoost", "PyTorch", "TensorFlow"]
        )
        # Code snippet generation — each branch is a plain string template,
        # displayed (not executed) below.
        code_placeholder = st.empty()
        if ml_library == "Scikit-learn":
            code = """# Example usage with Scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Use prepared data
X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']
# Create and train model
model = RandomForestRegressor(
n_estimators=100,
max_depth=10,
random_state=42
)
model.fit(X_train, y_train)
# Predictions and evaluation
y_pred = model.predict(X_val)
print(f"RMSE: {np.sqrt(mean_squared_error(y_val, y_pred)):.4f}")
print(f"R² Score: {r2_score(y_val, y_pred):.4f}")
print(f"Feature Importance: {model.feature_importances_}")"""
        elif ml_library == "XGBoost":
            code = """# Example usage with XGBoost
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
# Prepare data in DMatrix format
dtrain = xgb.DMatrix(modeling_data['X_train'], label=modeling_data['y_train'])
dval = xgb.DMatrix(modeling_data['X_val'], label=modeling_data['y_val'])
# Model parameters
params = {
'objective': 'reg:squarederror',
'max_depth': 6,
'learning_rate': 0.1,
'subsample': 0.8,
'colsample_bytree': 0.8,
'seed': 42
}
# Train model
model = xgb.train(
params,
dtrain,
num_boost_round=100,
evals=[(dval, 'validation')],
early_stopping_rounds=10,
verbose_eval=False
)
# Predictions
y_pred = model.predict(dval)
print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Number of Trees: {model.best_ntree_limit}")"""
        elif ml_library == "LightGBM":
            code = """# Example usage with LightGBM
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np
# Prepare data
train_data = lgb.Dataset(
modeling_data['X_train'],
label=modeling_data['y_train']
)
val_data = lgb.Dataset(
modeling_data['X_val'],
label=modeling_data['y_val'],
reference=train_data
)
# Model parameters
params = {
'objective': 'regression',
'metric': 'rmse',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
}
# Train model
model = lgb.train(
params,
train_data,
valid_sets=[val_data],
num_boost_round=100,
callbacks=[lgb.early_stopping(10)]
)
# Predictions
y_pred = model.predict(modeling_data['X_val'])
print(f"RMSE: {np.sqrt(mean_squared_error(modeling_data['y_val'], y_pred)):.4f}")
print(f"Best Iteration: {model.best_iteration}")"""
        else:
            # Generic template for libraries without a dedicated example
            code = f"""# Template for {ml_library}
# Your data available in modeling_data variable
X_train = modeling_data['X_train']
y_train = modeling_data['y_train']
X_val = modeling_data['X_val']
y_val = modeling_data['y_val']
X_test = modeling_data['X_test']
y_test = modeling_data['y_test']
# Code for {ml_library}...
print(f"Data sizes:")
print(f" X_train: {{X_train.shape}}")
print(f" y_train: {{y_train.shape}}")
print(f" X_val: {{X_val.shape}}")
print(f" X_test: {{X_test.shape}}")"""
        # Display code
        code_placeholder.code(code, language='python')
        # Copy code button — pyperclip is optional; fall back to a hint
        # when the library (or a system clipboard) is unavailable.
        try:
            import pyperclip
            if st.button("📋 Copy Code", width='stretch'):
                try:
                    pyperclip.copy(code)
                    st.success("Code copied to clipboard!")
                except:
                    st.warning("Failed to copy code. Copy manually.")
        except:
            st.warning("To copy code, install pyperclip library: pip install pyperclip")
    # Final information
    st.markdown("---")
    st.success("""
🎉 Congratulations! You have successfully prepared data for machine learning.
**Next Steps:**
1. Use code above for integration with chosen ML library
2. Experiment with various models
3. Optimise hyperparameters
4. Evaluate results on test set
""")
    # Navigation
    col1, col2 = st.columns([1, 1])
    with col1:
        if st.button("⬅️ Back to Visualisations", width='stretch'):
            st.session_state.current_step = 6
            st.rerun()
    with col2:
        if st.button("🔄 Run New Pipeline", type="primary", width='stretch'):
            # Reset all pipeline-related state back to a fresh session
            st.session_state.pipeline_completed = False
            st.session_state.processed_data = None
            st.session_state.modeling_data = None
            st.session_state.current_step = 1
            st.session_state.uploaded_file = None
            st.session_state.plots_path = None
            st.session_state.available_plots = {}
            st.session_state.synthetic_data_generated = False
            st.session_state.auto_pipeline_ready = False
            st.session_state.quick_test_mode = False
            st.rerun()
def render_footer(self):
    """Render the page footer: version, contact, and copyright columns."""
    st.markdown("---")
    version_col, contact_col, rights_col = st.columns(3)
    with version_col:
        st.markdown("**TimeFlowPro** v1.1.0")
        st.caption("Added synthetic data generation")
    with contact_col:
        st.markdown("📧 Contacts: cool.araby@gmail.com")
    with rights_col:
        st.markdown("© 2026 All Rights Reserved")
def run(self):
    """Application entry point: header, sidebar, current step content, footer."""
    st.title("📊 TimeFlow Pro - Data Analysis and Preprocessing")
    st.markdown("---")
    # Sidebar navigation
    self.create_sidebar()
    # Dispatch table replaces the original if/elif chain; an unknown step
    # renders nothing (same as the original fall-through behaviour).
    step_renderers = {
        1: self.render_step_1_data_loading,
        2: self.render_step_2_configuration,
        3: self.render_step_3_data_analysis,
        4: self.render_step_4_pipeline_execution,
        5: self.render_step_5_results,
        6: self.render_step_6_visualisations,
        7: self.render_step_7_modeling,
    }
    renderer = step_renderers.get(st.session_state.current_step)
    if renderer is not None:
        renderer()
    # Footer is always shown below the step content
    self.render_footer()
# ============================================
# APPLICATION LAUNCH
# ============================================
if __name__ == "__main__":
    # Instantiate and run the app in one step (behaviour unchanged).
    StreamlitApp().run()