Spaces:
Build error
Build error
| # Import necessary libraries | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from plotly import graph_objs as go | |
| import joblib | |
| import cloudpickle | |
| from xgboost import XGBRegressor | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.preprocessing import MinMaxScaler | |
| from sklearn.preprocessing import RobustScaler | |
| from skforecast.utils import save_forecaster | |
| from skforecast.utils import load_forecaster | |
| from skforecast.ForecasterAutoreg import ForecasterAutoreg | |
| from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error | |
| # ========================================== Helper Functions ========================================== | |
def evaluate_forecast(y_true, y_pred):
    """Score a forecast against the observed values.

    Args:
        y_true: Observed values.
        y_pred: Forecast values aligned with ``y_true``.

    Returns:
        A ``pd.Series`` with two entries: ``'RMSE'`` (square root of the
        mean squared error) and ``'MAPE'`` (mean absolute percentage error
        as returned by scikit-learn, i.e. a fraction rather than a percent).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(squared_error)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    return pd.Series({'RMSE': rmse, 'MAPE': mape})
| # Define functions for transformations | |
def apply_transformation(data, transform_type):
    """Apply the selected external transformation to ``data``.

    Args:
        data: Values to transform (anything NumPy ufuncs accept).
        transform_type: ``'Log'`` applies ``np.log1p``, ``'Square Root'``
            applies ``np.sqrt``; any other value leaves ``data`` untouched.

    Returns:
        The transformed values, or ``data`` itself when no transformation
        matches.
    """
    transformed = data
    if transform_type == 'Log':
        transformed = np.log1p(data)
    elif transform_type == 'Square Root':
        transformed = np.sqrt(data)
    return transformed
def reverse_transformation(transformed_data, transform_type):
    """Undo the external transformation named by ``transform_type``.

    Args:
        transformed_data: Values on the transformed scale.
        transform_type: ``'Log'`` applies ``np.expm1``, ``'Square Root'``
            applies ``np.square``; any other value returns the input as is.

    Returns:
        The values mapped back through the inverse function, or
        ``transformed_data`` itself when no transformation matches.
    """
    if transform_type == 'Log':
        return np.expm1(transformed_data)
    if transform_type == 'Square Root':
        return np.square(transformed_data)
    return transformed_data
# Grid search over forecaster configurations
def run_auto_tuning(train, test, lags_to_try, differentiation_options,
                    transformer_options, external_transform_options,
                    target_column=None):
    """Fit one forecaster per configuration and score it on the test set.

    Every combination of lag, differentiation order, scaler name and
    external transformation is tried, in that nesting order.

    Args:
        train: DataFrame holding the training rows.
        test: DataFrame holding the test rows; its length is the forecast
            horizon.
        lags_to_try: Iterable of ``lags`` values for ``ForecasterAutoreg``.
        differentiation_options: Iterable of ``differentiation`` values.
        transformer_options: Iterable of scaler names understood by
            ``select_transformer`` (or ``None``).
        external_transform_options: Iterable of transformation names
            understood by ``apply_transformation`` (or ``None``).
        target_column: Name of the column to forecast. When omitted, the
            module-level ``target_column`` is used, which is what this
            function relied on before the parameter existed.

    Returns:
        A DataFrame with one row per configuration and the columns
        ``'Lag'``, ``'Differentiation'``, ``'Transformer'``,
        ``'External Transformer'``, ``'RMSE'`` and ``'MAPE'``.

    Raises:
        ValueError: If no target column is given and none is defined at
            module level.
    """
    if target_column is None:
        # Backward compatibility: earlier callers relied on a global name.
        target_column = globals().get('target_column')
        if target_column is None:
            raise ValueError(
                "target_column was not provided and no module-level "
                "'target_column' is defined"
            )

    # Loop-invariant work: the horizon, the actual values and the externally
    # transformed training series do not depend on lag/diff/scaler.
    n_steps = len(test)
    actual = test[target_column].iloc[:n_steps]
    train_series = train[target_column]
    transformed_inputs = [
        (ext_trans, apply_transformation(train_series, ext_trans))
        for ext_trans in external_transform_options
    ]

    results = []
    for lag in lags_to_try:
        for diff in differentiation_options:
            for trans in transformer_options:
                for ext_trans, train_transformed in transformed_inputs:
                    # A fresh scaler instance per fit so no fitted state is
                    # shared between configurations.
                    forecaster = ForecasterAutoreg(
                        regressor=XGBRegressor(random_state=123),
                        lags=lag,
                        differentiation=diff,
                        transformer_y=select_transformer(trans),
                    )
                    forecaster.fit(y=train_transformed)

                    # Predict on the transformed scale, then map back before
                    # comparing with the untransformed test values.
                    predictions = forecaster.predict(steps=n_steps)
                    predictions_reversed = reverse_transformation(predictions, ext_trans)
                    rmse = np.sqrt(mean_squared_error(actual, predictions_reversed))
                    mape = mean_absolute_percentage_error(actual, predictions_reversed)

                    results.append({
                        'Lag': lag,
                        'Differentiation': diff,
                        'Transformer': trans,
                        'External Transformer': ext_trans,
                        'RMSE': rmse,
                        'MAPE': mape,
                    })
    return pd.DataFrame(results)
| # Helper function to select transformer | |
def select_transformer(transformer_option):
    """Build the scikit-learn scaler named by ``transformer_option``.

    Args:
        transformer_option: ``'StandardScaler'``, ``'MinMaxScaler'`` or
            ``'RobustScaler'``; anything else means "no scaler".

    Returns:
        A new, unfitted scaler instance, or ``None`` when the option does
        not name one of the supported scalers.
    """
    scaler = None
    if transformer_option == 'StandardScaler':
        scaler = StandardScaler()
    elif transformer_option == 'MinMaxScaler':
        scaler = MinMaxScaler()
    elif transformer_option == 'RobustScaler':
        scaler = RobustScaler()
    return scaler
def train_model(lags, differentiation, _transformer_y, train_data):
    """Create, fit and persist an XGBoost-based autoregressive forecaster.

    Args:
        lags: ``lags`` argument forwarded to ``ForecasterAutoreg``.
        differentiation: ``differentiation`` argument forwarded to
            ``ForecasterAutoreg`` (``None`` for no differencing).
        _transformer_y: Scaler instance (or ``None``) forwarded as
            ``transformer_y``.
        train_data: Series the forecaster is fitted on.

    Returns:
        The fitted forecaster. As a side effect it is also written to
        ``forecaster_temp.py`` via ``save_forecaster``.
    """
    forecaster = ForecasterAutoreg(
        regressor=XGBRegressor(random_state=123),
        lags=lags,
        differentiation=differentiation,
        # Use the argument itself. The previous code read a name
        # ``transformer_y`` that is not defined in this function and only
        # resolved when a module-level variable of that name happened to exist.
        transformer_y=_transformer_y,
    )
    forecaster.fit(y=train_data)
    save_forecaster(forecaster, file_name='forecaster_temp.py', verbose=False)
    return forecaster
def predict(_forecaster, n_steps, external_transform, test, target_column):
    """Forecast ``n_steps`` ahead and compare the result with the test set.

    Args:
        _forecaster: Fitted forecaster exposing ``predict(steps=...)``.
        n_steps: Number of steps to forecast.
        external_transform: Name of the external transformation that was
            applied before fitting; predictions are mapped back through
            ``reverse_transformation``.
        test: DataFrame holding the test rows.
        target_column: Name of the forecast column in ``test``.

    Returns:
        A tuple ``(predictions_reversed, actual, pred, comparison_df,
        evaluation_results)`` where ``pred`` is the predictions as a
        one-column frame named ``'Predicted'``, ``comparison_df`` places
        actual and predicted values side by side on a fresh integer index,
        and ``evaluation_results`` is the output of ``evaluate_forecast``.

    Raises:
        ValueError: If ``_forecaster`` is ``None``.
    """
    if _forecaster is None:
        raise ValueError("No trained forecaster available; train a model before predicting.")

    # Use the forecaster that was passed in rather than a module-level name.
    predictions = _forecaster.predict(steps=n_steps)
    predictions_reversed = reverse_transformation(predictions, external_transform)

    # Prepare Comparison DataFrame
    actual = test[target_column].iloc[:len(predictions)]
    pred = predictions_reversed.to_frame(name='Predicted')
    comparison_df = pd.concat(
        [actual.reset_index(drop=True), pred.reset_index(drop=True)],
        axis=1,
    )

    # When more steps are forecast than test rows exist, the trailing rows
    # have no actual value (NaN). Score only the overlapping rows so the
    # metrics are not fed missing data.
    scored = comparison_df.iloc[:len(actual)]
    evaluation_results = evaluate_forecast(scored[target_column], scored['Predicted'])
    return predictions_reversed, actual, pred, comparison_df, evaluation_results
# Read the uploaded workbook into a DataFrame
def load_data(uploaded_file):
    """Parse the uploaded Excel file with ``pd.read_excel``.

    Args:
        uploaded_file: Path or file-like object accepted by
            ``pd.read_excel``.

    Returns:
        The parsed DataFrame.
    """
    frame = pd.read_excel(uploaded_file)
    return frame
def refit(_forecaster, df, target_column, external_transform):
    """Fit the given forecaster again on the whole dataset.

    Args:
        _forecaster: Forecaster to refit; it is fitted in place.
        df: DataFrame holding every available row.
        target_column: Name of the column to fit on.
        external_transform: Name of the external transformation applied to
            the target before fitting (see ``apply_transformation``).

    Returns:
        The same forecaster object, now fitted on the full series.

    Raises:
        ValueError: If ``_forecaster`` is ``None``.
    """
    if _forecaster is None:
        raise ValueError("No trained forecaster available; train a model before refitting.")

    entire_data_transformed = apply_transformation(df[target_column], external_transform)
    # Fit the forecaster that was passed in rather than a module-level name.
    _forecaster.fit(y=entire_data_transformed)
    return _forecaster
# ========================================== Header ==========================================
def _parse_lag_ranges(lag_text):
    """Expand a lag specification such as ``"1,2,3-5"`` into a list of ints.

    Items are separated by commas; ``a-b`` denotes the inclusive range from
    ``a`` to ``b``. Surrounding whitespace and empty items (for example a
    trailing comma) are ignored.

    Raises:
        ValueError: If an item is not an integer or integer range, or if
            any resulting lag is smaller than 1.
    """
    lags = []
    for part in lag_text.split(','):
        part = part.strip()
        if not part:
            continue
        if '-' in part:
            low, high = part.split('-')
            lags.extend(range(int(low), int(high) + 1))
        else:
            lags.append(int(part))
    if any(lag < 1 for lag in lags):
        raise ValueError("lags must be positive integers")
    return lags


# Streamlit app layout
st.title("SKForecast Forecasting App")
st.write("Upload an xlsx file for time series analysis")

# ========================================== Section: Load Data ==========================================
st.header("Load Data")
uploaded_file = st.file_uploader("Choose a file", type="xlsx")

if uploaded_file is not None:
    df = load_data(uploaded_file)
    st.write("Dataframe:")
    st.write(df)

    # ========================================== Section: Select Data ==========================================
    st.header("Select Data")
    date_column = st.selectbox("Select Date Column", df.columns)
    target_column = st.selectbox("Select Target Column", [col for col in df.columns if col != date_column])

    if date_column != target_column:
        df[date_column] = pd.to_datetime(df[date_column])
        df.set_index(date_column, inplace=True)

        # Date Range Selection
        st.subheader("Filter Date Range")
        start_date = st.date_input("Start Date", value=df.index.min(), min_value=df.index.min(), max_value=df.index.max())
        end_date = st.date_input("End Date", value=df.index.max(), min_value=df.index.min(), max_value=df.index.max())
        df = df[start_date:end_date]

        freq_option = st.selectbox("Select Frequency for Resampling", ['No Resampling', 'W-SUN', 'W-MON', 'W-TUE', 'W-WED', 'W-THU', 'W-FRI', 'W-SAT', 'M', 'MS'])
        if freq_option != 'No Resampling':
            # numeric_only=True keeps text columns in the upload from making
            # the aggregation raise on recent pandas versions.
            df = df.resample(freq_option).mean(numeric_only=True)

        st.write("Selected Data with Datetime Index:")
        st.write(df[[target_column]])

        # ========================================== Section: Split Data ==========================================
        st.header("Split Data")
        split_method = st.radio("Select Method for Train-Test Split", ('Percentage', 'Size', 'Year Range', 'Specific Year'))

        if split_method == 'Percentage':
            split_type = st.radio("Select Split Type", ('Training Set', 'Testing Set'))
            if split_type == 'Training Set':
                percentage = st.slider("Select Percentage for Training Set", 0.1, 0.85, 0.7)
                split_point = int(len(df) * percentage)
            else:
                percentage = st.slider("Select Percentage for Testing Set", 0.15, 0.9, 0.15)
                split_point = int(len(df) * (1 - percentage))
            train = df.iloc[:split_point]
            test = df.iloc[split_point:]
        elif split_method == 'Size':
            split_type = st.radio("Select Split Type", ('Training Set', 'Testing Set'))
            max_train_size = int(0.9 * len(df))
            max_test_size = int(0.9 * len(df))
            if split_type == 'Training Set':
                size = st.number_input("Enter Size for Training Set", 1, max_train_size, max_train_size)
                train = df.iloc[:size]
                test = df.iloc[size:]
            else:
                size = st.number_input("Enter Size for Testing Set", 1, max_test_size, max_test_size)
                train = df.iloc[:-size]
                test = df.iloc[-size:]
        elif split_method == 'Year Range':
            start_year = st.selectbox("Select Start Year", range(df.index.year.min(), df.index.year.max() + 1))
            end_year = st.selectbox("Select End Year", range(start_year, df.index.year.max() + 1))
            train = df[(df.index.year >= start_year) & (df.index.year <= end_year)]
            test = df.drop(train.index)
        elif split_method == 'Specific Year':
            split_type = st.radio("Select Split Type", ('Training Set', 'Testing Set'))
            year = st.selectbox("Select Year", range(df.index.year.min(), df.index.year.max() + 1))
            if split_type == 'Training Set':
                train = df[df.index.year <= year]
                test = df[df.index.year > year]
            else:
                test = df[df.index.year == year]
                train = df.drop(test.index)

        # ========================================== Section: Display Sets and Visualize ==========================================
        st.header("Display Data and Visualize Split")
        col1, col2 = st.columns(2)
        with col1:
            st.write("Training Set:")
            st.write(train[target_column])
        with col2:
            st.write("Test Set:")
            st.write(test[target_column])

        # Plotting both Sets
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=train.index, y=train[target_column], mode='lines', name='Training Set', line=dict(color='aqua')))
        fig.add_trace(go.Scatter(x=test.index, y=test[target_column], mode='lines', name='Test Set', line=dict(color='orange')))
        fig.update_layout(title='Train-Test Split Visualization', xaxis_title='Date', yaxis_title=target_column)
        st.plotly_chart(fig)

        # Initialize session state for auto-tuning results
        if 'auto_tuning_results' not in st.session_state:
            st.session_state.auto_tuning_results = None

        # ========================================== Section: Auto-Tuning ==========================================
        st.header("Auto-Tuning")
        st.write("Automatically test various configurations to identify the optimal setup")

        # Input for Lag Ranges
        lag_input = st.text_input("Enter Lag Ranges (e.g. 1,2,3-5)", "1,2,3-5")
        try:
            lags_to_try = _parse_lag_ranges(lag_input)
        except ValueError:
            # Report malformed input in the page instead of crashing the run.
            lags_to_try = []
            st.error("Invalid lag ranges. Use comma-separated positive integers or ranges, e.g. 1,2,3-5")

        # Other Parameters
        differentiation_options = [None, 1, 2]
        transformer_options = [None, 'StandardScaler', 'MinMaxScaler', 'RobustScaler']
        external_transform_options = [None, 'Log', 'Square Root']

        # Run Button for Auto-Tuning
        if st.button("Run Auto-Tuning"):
            if not lags_to_try:
                # An empty grid yields an empty result frame with no metric
                # columns to sort on, so refuse to run.
                st.warning("Enter at least one valid lag before running auto-tuning.")
            else:
                st.cache_data.clear()
                auto_tuning_results = run_auto_tuning(train, test, lags_to_try, differentiation_options, transformer_options, external_transform_options)
                # Storing best configurations in session state
                st.session_state.best_config_rmse = auto_tuning_results.sort_values(by='RMSE').iloc[0]
                st.session_state.best_config_mape = auto_tuning_results.sort_values(by='MAPE').iloc[0]
                st.session_state.auto_tuning_results = auto_tuning_results
                st.success("Auto-tuning finished!")

        # Display auto-tuning results from session state
        if st.session_state.auto_tuning_results is not None:
            st.write("Auto-Tuning Results:")
            st.write(st.session_state.auto_tuning_results.sort_values(by='MAPE'))
            # Display Best Configurations for Each Metric
            col1, col2 = st.columns(2)
            with col1:
                st.write("Best Configuration for RMSE:", st.session_state.best_config_rmse)
            with col2:
                st.write("Best Configuration for MAPE:", st.session_state.best_config_mape)

        # ========================================== Section: Train Model ==========================================
        st.header("Train Model")

        # Initialize session state for the trained models
        if 'forecaster' not in st.session_state:
            st.session_state.forecaster = None
            st.session_state.final_forecaster = None

        if 'train' in locals():
            # Offer the auto-tuned configurations only when results exist.
            has_tuning_results = (
                'auto_tuning_results' in st.session_state
                and isinstance(st.session_state.auto_tuning_results, pd.DataFrame)
                and not st.session_state.auto_tuning_results.empty
            )
            if has_tuning_results:
                auto_tuned_config_option = st.radio(
                    "Choose Configuration to Use",
                    ('Manual Configuration', 'Best RMSE Configuration', 'Best MAPE Configuration')
                )
            else:
                st.write("Manual Configuration:")
                auto_tuned_config_option = 'Manual Configuration'

            if auto_tuned_config_option == 'Manual Configuration':
                max_lags = max(1, int(len(train) * 0.5))
                # Clamp the default so a short training set cannot push it
                # above the slider maximum.
                lags = st.slider("Select Lags", 1, max_lags, min(4, max_lags))
                differentiation = st.selectbox("Select Differentiation Order", [None, 1, 2])
                transformer_y = st.selectbox("Select Transformer", [None, 'StandardScaler', 'MinMaxScaler', 'RobustScaler'])
                external_transform = st.selectbox("Select External Transformation", [None, 'Log', 'Square Root'])
            else:
                metric = 'RMSE' if auto_tuned_config_option == 'Best RMSE Configuration' else 'MAPE'
                best_config = st.session_state.auto_tuning_results.sort_values(by=metric).iloc[0]
                lags = int(best_config['Lag'])  # Convert to regular Python integer
                # Missing values in the results frame mean "option not used".
                differentiation = int(best_config['Differentiation']) if pd.notna(best_config['Differentiation']) else None
                transformer_y = best_config['Transformer'] if pd.notna(best_config['Transformer']) else None
                external_transform = best_config['External Transformer'] if pd.notna(best_config['External Transformer']) else None

            # Apply External Transformation
            train_transformed = apply_transformation(train[target_column], external_transform)

            # Train Button
            if st.button("Train"):
                st.cache_resource.clear()
                with st.spinner('Training in progress...'):
                    # Turn the selected scaler name into an instance (None for no scaler).
                    transformer_y = select_transformer(transformer_y)
                    # train_model also writes the fitted model to forecaster_temp.py.
                    forecaster = train_model(lags, differentiation, transformer_y, train_transformed)
                    st.session_state.forecaster = forecaster
                st.success("Model trained successfully!")
        else:
            st.warning("Please complete the 'Split Data' section first.")

        # ========================================== Section: Predict ==========================================
        # Initialize session state for prediction results
        if 'comparison_df' not in st.session_state:
            st.session_state.comparison_df = None
            st.session_state.predictions_reversed = None
            st.session_state.pred = None
            st.session_state.actual = None
            st.session_state.evaluation_results = None

        st.header("Predict")
        st.subheader("Forecast Configuration")
        # The input's minimum is 1, so the default must be at least 1 even
        # when the test set is empty.
        default_steps = max(1, len(test)) if 'test' in locals() else 1
        n_steps = st.number_input("Number of Steps for Prediction", 1, len(df), default_steps)

        # Predict Button
        if st.button("Predict"):
            forecaster = st.session_state.forecaster
            if forecaster is None:
                st.warning("Please train a model before predicting.")
            else:
                st.cache_data.clear()
                (
                    st.session_state.predictions_reversed,
                    st.session_state.actual,
                    st.session_state.pred,
                    st.session_state.comparison_df,
                    st.session_state.evaluation_results,
                ) = predict(forecaster, n_steps, external_transform, test, target_column)

        if st.session_state.comparison_df is not None:
            # Display Predictions vs Actual
            st.subheader("Predictions vs Actual Values")
            st.write(st.session_state.comparison_df)

            # Plotting Predictions vs Actual
            fig = go.Figure()
            fig.add_trace(go.Scatter(y=st.session_state.actual, mode='lines', name='Actual'))
            fig.add_trace(go.Scatter(y=st.session_state.pred['Predicted'], mode='lines', name='Predicted'))
            fig.update_layout(title='Actual vs Predicted Values', xaxis_title='Index', yaxis_title=target_column)
            st.plotly_chart(fig)

            # Plotting Train + Actual vs Train + Predicted
            fig_comparison = go.Figure()
            fig_comparison.add_trace(go.Scatter(x=train.index, y=train[target_column], mode='lines', name='Train'))
            fig_comparison.add_trace(go.Scatter(x=st.session_state.actual.index, y=st.session_state.actual, mode='lines', name='Actual'))
            fig_comparison.add_trace(go.Scatter(x=st.session_state.pred.index, y=st.session_state.pred['Predicted'], mode='lines', name='Predicted'))
            fig_comparison.update_layout(title='Train, Actual vs Predicted Values', xaxis_title='Date', yaxis_title=target_column)
            st.plotly_chart(fig_comparison)

            # Enhanced Evaluation Results Display
            st.subheader("Model Evaluation Results")
            col1, col2 = st.columns(2)
            with col1:
                st.metric(label="RMSE", value=f"{st.session_state.evaluation_results['RMSE']:.3f}")
            with col2:
                st.metric(label="MAPE", value=f"{st.session_state.evaluation_results['MAPE']*100:.3f} %")

        # ========================================== Section: Save & Download Model ==========================================
        st.header("Save & Download Model")

        # By default the model offered for download is the trained one.
        st.session_state.final_forecaster = st.session_state.forecaster

        # Refit Model
        if st.button("Refit Model on Entire Dataset"):
            forecaster = st.session_state.forecaster
            if forecaster is None:
                st.warning("Please train a model before refitting.")
            else:
                st.session_state.final_forecaster = refit(forecaster, df, target_column, external_transform)
                st.success("Model refitted on the entire dataset.")

        save_method = st.selectbox("Select Save Method", ['SKForecast', 'Joblib', 'Pickle'])
        model_name = st.text_input("Enter Model Name", 'forecaster_model')

        # Save/Download Button
        final_forecaster = st.session_state.final_forecaster
        if final_forecaster is None:
            # Without this guard a file containing None would be offered.
            st.info("Train a model to enable saving and downloading.")
        else:
            # NOTE(review): model_name comes straight from user input and is
            # used as a server-side file path; consider restricting it to a
            # plain file name.
            if save_method == 'SKForecast':
                file_name = f'{model_name}.py'
                mime = 'text/plain'
                save_forecaster(final_forecaster, file_name=file_name, verbose=False)
            elif save_method == 'Joblib':
                file_name = f'{model_name}.joblib'
                mime = 'application/octet-stream'
                joblib.dump(final_forecaster, filename=file_name)
            else:  # 'Pickle'
                file_name = f'{model_name}.pkl'
                mime = 'application/octet-stream'
                with open(file_name, 'wb') as file:
                    cloudpickle.dump(final_forecaster, file)

            # Read the bytes inside a context manager so the handle is closed.
            with open(file_name, 'rb') as model_file:
                model_bytes = model_file.read()
            st.download_button(label=f"Download Model as {save_method}", data=model_bytes, file_name=file_name, mime=mime)
    else:
        st.error("Date column and Target column cannot be the same. Please select different columns.")
else:
    st.warning("Please upload an xlsx file to proceed.")