HilmiZr committed on
Commit
7383a98
·
2 Parent(s): e50c0e4b30250a

Merge branch 'main' of https://huggingface.co/spaces/HilmiZr/PDST-Forecast-Streamlit

Browse files
Files changed (1) hide show
  1. app.py +436 -2
app.py CHANGED
@@ -1,4 +1,438 @@
 
1
  import streamlit as st
 
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
  import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ from plotly import graph_objs as go
6
+ import joblib
7
+ import cloudpickle
8
 
9
+ from xgboost import XGBRegressor
10
+ from sklearn.preprocessing import StandardScaler
11
+ from sklearn.preprocessing import MinMaxScaler
12
+ from sklearn.preprocessing import RobustScaler
13
+
14
+ from skforecast.utils import save_forecaster
15
+ from skforecast.utils import load_forecaster
16
+ from skforecast.ForecasterAutoreg import ForecasterAutoreg
17
+
18
+ from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
19
+
20
+ # ========================================== Helper Functions ==========================================
21
+
22
def evaluate_forecast(y_true, y_pred):
    """Score a forecast against the observed values.

    Args:
        y_true: Observed target values.
        y_pred: Forecast values aligned with ``y_true``.

    Returns:
        A ``pandas.Series`` with two entries: ``'RMSE'`` (square root of
        ``mean_squared_error``) and ``'MAPE'`` (the raw value returned by
        ``mean_absolute_percentage_error``).
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    return pd.Series({'RMSE': rmse, 'MAPE': mape})
28
+
29
+ # Define functions for transformations
30
def apply_transformation(data, transform_type):
    """Apply the selected external transformation to ``data``.

    Args:
        data: Values to transform; handed directly to NumPy.
        transform_type: ``'Log'`` applies ``np.log1p`` and
            ``'Square Root'`` applies ``np.sqrt``. Any other value
            (including ``None``) means "no transformation".

    Returns:
        The transformed values, or ``data`` itself when no transformation
        is selected.
    """
    if transform_type == 'Log':
        return np.log1p(data)
    if transform_type == 'Square Root':
        return np.sqrt(data)
    # Unknown or missing option: hand the input back untouched.
    return data
37
+
38
def reverse_transformation(transformed_data, transform_type):
    """Undo the external transformation applied by ``apply_transformation``.

    Args:
        transformed_data: Values on the transformed scale.
        transform_type: ``'Log'`` applies ``np.expm1`` and
            ``'Square Root'`` applies ``np.square``. Any other value
            (including ``None``) means nothing has to be undone.

    Returns:
        The values mapped back with the matching inverse function, or
        ``transformed_data`` itself when no transformation is selected.
    """
    if transform_type == 'Log':
        return np.expm1(transformed_data)
    if transform_type == 'Square Root':
        return np.square(transformed_data)
    # Unknown or missing option: nothing to invert.
    return transformed_data
45
+
46
+ # Cached function for auto-tuning
47
@st.cache_data
def run_auto_tuning(train, test, lags_to_try, differentiation_options,
                    transformer_options, external_transform_options,
                    target_column=None):
    """Grid-search forecaster settings and score each one on the test set.

    Every combination of lag, differentiation order, scaler and external
    transformation is used to fit a ``ForecasterAutoreg`` (wrapping an
    ``XGBRegressor`` with ``random_state=123``) on the training target.
    Each fitted model forecasts ``len(test)`` steps, the forecast is mapped
    back with ``reverse_transformation`` and compared with the test target.

    Args:
        train: DataFrame holding the training rows.
        test: DataFrame holding the test rows.
        lags_to_try: Iterable of lag values passed as ``lags``.
        differentiation_options: Iterable of values passed as
            ``differentiation``.
        transformer_options: Iterable of scaler names understood by
            ``select_transformer``.
        external_transform_options: Iterable of options understood by
            ``apply_transformation`` / ``reverse_transformation``.
        target_column: Name of the target column in ``train`` and ``test``.
            When omitted, the module-level ``target_column`` is used so
            that existing six-argument calls keep working. Passing it
            explicitly also makes it part of the Streamlit cache key.

    Returns:
        A ``pandas.DataFrame`` with one row per configuration and the
        columns ``'Lag'``, ``'Differentiation'``, ``'Transformer'``,
        ``'External Transformer'``, ``'RMSE'`` and ``'MAPE'``.
    """
    from itertools import product

    if target_column is None:
        # Backward compatibility: older callers rely on the global that the
        # "Select Data" section of the script defines.
        target_column = globals()['target_column']

    train_target = train[target_column]
    test_target = test[target_column]
    horizon = len(test)

    # The externally transformed training series depends only on the
    # external option, so compute it once instead of once per combination.
    transformed_train = {
        ext_trans: apply_transformation(train_target, ext_trans)
        for ext_trans in external_transform_options
    }

    results = []
    # ``product`` iterates in the same order as the equivalent nested loops
    # (lag outermost, external transformation innermost).
    for lag, diff, trans, ext_trans in product(
            lags_to_try, differentiation_options,
            transformer_options, external_transform_options):
        forecaster = ForecasterAutoreg(
            regressor=XGBRegressor(random_state=123),
            lags=lag,
            differentiation=diff,
            transformer_y=select_transformer(trans),
        )
        forecaster.fit(y=transformed_train[ext_trans])

        # Forecast the whole test horizon and return to the original scale.
        predictions = forecaster.predict(steps=horizon)
        predictions_reversed = reverse_transformation(predictions, ext_trans)
        actual = test_target.iloc[:len(predictions)]

        results.append({
            'Lag': lag,
            'Differentiation': diff,
            'Transformer': trans,
            'External Transformer': ext_trans,
            'RMSE': np.sqrt(mean_squared_error(actual, predictions_reversed)),
            'MAPE': mean_absolute_percentage_error(actual, predictions_reversed),
        })

    return pd.DataFrame(results)
87
+
88
+ # Helper function to select transformer
89
def select_transformer(transformer_option):
    """Map a scaler name to a fresh scikit-learn scaler instance.

    Args:
        transformer_option: ``'StandardScaler'``, ``'MinMaxScaler'`` or
            ``'RobustScaler'``; anything else means "no scaler".

    Returns:
        A new instance of the named scaler, or ``None`` when the option
        does not match one of the three supported names.
    """
    if transformer_option == 'StandardScaler':
        scaler = StandardScaler()
    elif transformer_option == 'MinMaxScaler':
        scaler = MinMaxScaler()
    elif transformer_option == 'RobustScaler':
        scaler = RobustScaler()
    else:
        scaler = None
    return scaler
97
+
98
@st.cache_resource
def train_model(lags, differentiation, _transformer_y, train_data):
    """Create, fit and persist a ``ForecasterAutoreg`` on the training data.

    Args:
        lags: Value passed to the forecaster as ``lags``.
        differentiation: Value passed to the forecaster as
            ``differentiation``.
        _transformer_y: Scaler instance (or ``None``) passed as
            ``transformer_y``. The leading underscore asks Streamlit not to
            hash this argument, so it does not take part in the cache key.
        train_data: Target series the forecaster is fitted on.

    Returns:
        The fitted forecaster. As a side effect it is also written to
        ``forecaster_temp.py`` via ``save_forecaster``.
    """
    # Use the argument that was passed in; reading a same-named module
    # global here would silently ignore the caller's choice.
    forecaster = ForecasterAutoreg(
        regressor=XGBRegressor(random_state=123),
        lags=lags,
        differentiation=differentiation,
        transformer_y=_transformer_y,
    )

    forecaster.fit(y=train_data)
    save_forecaster(forecaster, file_name='forecaster_temp.py', verbose=False)
    return forecaster
112
+
113
@st.cache_data
def predict(_forecaster, n_steps, external_transform, test, target_column):
    """Forecast ``n_steps`` ahead and compare the result with the test set.

    Args:
        _forecaster: Fitted forecaster to predict with. The leading
            underscore asks Streamlit not to hash this argument.
        n_steps: Number of steps to forecast.
        external_transform: Option understood by
            ``reverse_transformation``; used to map the forecast back to
            the original scale.
        test: DataFrame holding the test rows.
        target_column: Name of the target column in ``test``.

    Returns:
        A 5-tuple ``(predictions_reversed, actual, pred, comparison_df,
        evaluation_results)``:

        * ``predictions_reversed`` - forecast on the original scale,
        * ``actual`` - the first ``len(predictions)`` test target values,
        * ``pred`` - the forecast as a one-column frame named
          ``'Predicted'``,
        * ``comparison_df`` - actual and predicted values side by side on
          a fresh positional index,
        * ``evaluation_results`` - output of ``evaluate_forecast`` for the
          rows that have an actual value.

    Raises:
        ValueError: If ``_forecaster`` is ``None`` (no model trained yet).
    """
    if _forecaster is None:
        raise ValueError("No trained forecaster available; train a model before predicting.")

    # Use the forecaster that was passed in rather than a module global.
    predictions = _forecaster.predict(steps=n_steps)
    predictions_reversed = reverse_transformation(predictions, external_transform)

    # Prepare Comparison DataFrame
    actual = test[target_column].iloc[:len(predictions)]
    pred = predictions_reversed.to_frame(name='Predicted')
    comparison_df = pd.concat(
        [actual.reset_index(drop=True), pred.reset_index(drop=True)],
        axis=1,
    )

    # When more steps are forecast than the test set holds, the tail of the
    # comparison frame has no actual value (NaN); score only the overlap.
    scored = comparison_df.iloc[:len(actual)]
    evaluation_results = evaluate_forecast(scored[target_column], scored['Predicted'])

    return predictions_reversed, actual, pred, comparison_df, evaluation_results
125
+
126
+ # Function to load and cache the data
127
@st.cache_data
def load_data(uploaded_file):
    """Read the uploaded Excel file into a DataFrame.

    Args:
        uploaded_file: File object accepted by ``pandas.read_excel``.

    Returns:
        The parsed ``pandas.DataFrame``; the call is wrapped in
        ``st.cache_data``.
    """
    dataframe = pd.read_excel(uploaded_file)
    return dataframe
130
+
131
@st.cache_resource
def refit(_forecaster, df, target_column, external_transform):
    """Refit an existing forecaster on the complete dataset.

    Args:
        _forecaster: Forecaster to refit. The leading underscore asks
            Streamlit not to hash this argument.
        df: DataFrame holding the complete (train + test) data.
        target_column: Name of the target column in ``df``.
        external_transform: Option understood by ``apply_transformation``;
            applied to the target before fitting.

    Returns:
        The same forecaster object, now fitted on the whole target series.
        NOTE: the fit happens in place, so every other reference to this
        forecaster sees the refitted model as well.
    """
    entire_data_transformed = apply_transformation(df[target_column], external_transform)
    # Fit the forecaster that was passed in rather than a module global.
    _forecaster.fit(y=entire_data_transformed)
    return _forecaster
136
+
137
+ # ========================================== Header ==========================================
138
+
139
# Streamlit app layout
st.title("SKForecast Forecasting App")
st.write("Upload an xlsx file for time series analysis")

# ========================================== Section: Load Data ==========================================
st.header("Load Data")
uploaded_file = st.file_uploader("Choose a file", type="xlsx")

if uploaded_file is not None:
    # Load and cache the dataframe
    df = load_data(uploaded_file)

    st.write("Dataframe:")
    st.write(df)

    # ========================================== Section: Select Data ==========================================
    st.header("Select Data")
    date_column = st.selectbox("Select Date Column", df.columns)
    target_column = st.selectbox("Select Target Column", [col for col in df.columns if col != date_column])

    if date_column != target_column:
        df[date_column] = pd.to_datetime(df[date_column])
        df.set_index(date_column, inplace=True)

        # Date Range Selection
        st.subheader("Filter Date Range")
        start_date = st.date_input("Start Date", value=df.index.min(), min_value=df.index.min(), max_value=df.index.max())
        end_date = st.date_input("End Date", value=df.index.max(), min_value=df.index.min(), max_value=df.index.max())
        df = df[start_date:end_date]

        freq_option = st.selectbox("Select Frequency for Resampling", ['No Resampling', 'W-SUN', 'W-MON', 'W-TUE', 'W-WED', 'W-THU', 'W-FRI', 'W-SAT', 'M', 'MS'])
        if freq_option != 'No Resampling':
            df = df.resample(freq_option).mean()

        st.write("Selected Data with Datetime Index:")
        st.write(df[[target_column]])

        # ========================================== Section: Split Data ==========================================
        # Every branch below defines both ``train`` and ``test``.
        st.header("Split Data")
        split_method = st.radio("Select Method for Train-Test Split", ('Percentage', 'Size', 'Year Range', 'Specific Year'))

        if split_method == 'Percentage':
            split_type = st.radio("Select Split Type", ('Training Set', 'Testing Set'))
            if split_type == 'Training Set':
                percentage = st.slider("Select Percentage for Training Set", 0.1, 0.85, 0.7)
                split_point = int(len(df) * percentage)
            else:
                percentage = st.slider("Select Percentage for Testing Set", 0.15, 0.9, 0.15)
                split_point = int(len(df) * (1 - percentage))
            train = df.iloc[:split_point]
            test = df.iloc[split_point:]

        elif split_method == 'Size':
            split_type = st.radio("Select Split Type", ('Training Set', 'Testing Set'))
            max_train_size = int(0.9 * len(df))
            max_test_size = int(0.9 * len(df))
            if split_type == 'Training Set':
                size = st.number_input("Enter Size for Training Set", 1, max_train_size, max_train_size)
                train = df.iloc[:size]
                test = df.iloc[size:]
            else:
                size = st.number_input("Enter Size for Testing Set", 1, max_test_size, max_test_size)
                train = df.iloc[:-size]
                test = df.iloc[-size:]

        elif split_method == 'Year Range':
            start_year = st.selectbox("Select Start Year", range(df.index.year.min(), df.index.year.max() + 1))
            end_year = st.selectbox("Select End Year", range(start_year, df.index.year.max() + 1))
            train = df[(df.index.year >= start_year) & (df.index.year <= end_year)]
            test = df.drop(train.index)

        elif split_method == 'Specific Year':
            split_type = st.radio("Select Split Type", ('Training Set', 'Testing Set'))
            year = st.selectbox("Select Year", range(df.index.year.min(), df.index.year.max() + 1))
            if split_type == 'Training Set':
                train = df[df.index.year <= year]
                test = df[df.index.year > year]
            else:
                test = df[df.index.year == year]
                train = df.drop(test.index)

        # ========================================== Section: Display Sets and Visualize ==========================================
        st.header("Display Data and Visualize Split")
        col1, col2 = st.columns(2)
        with col1:
            st.write("Training Set:")
            st.write(train[target_column])
        with col2:
            st.write("Test Set:")
            st.write(test[target_column])

        # Plotting both Sets
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=train.index, y=train[target_column], mode='lines', name='Training Set', line=dict(color='aqua')))
        fig.add_trace(go.Scatter(x=test.index, y=test[target_column], mode='lines', name='Test Set', line=dict(color='orange')))
        fig.update_layout(title='Train-Test Split Visualization', xaxis_title='Date', yaxis_title=target_column)
        st.plotly_chart(fig)

        # Initialize session state for auto-tuning results
        if 'auto_tuning_results' not in st.session_state:
            st.session_state.auto_tuning_results = None

        # ========================================== Section: Auto-Tuning ==========================================
        st.header("Auto-Tuning")
        st.write("Automatically test various configurations to identify the optimal setup")

        # Input for Lag Ranges
        lag_input = st.text_input("Enter Lag Ranges (e.g. 1,2,3-5)", "1,2,3-5")

        # Parsing lag ranges. Malformed input (empty parts, non-numeric text,
        # "1-2-3") raises ValueError; report it instead of crashing the app.
        lags_to_try = []
        try:
            for part in lag_input.split(','):
                if '-' in part:
                    a, b = part.split('-')
                    lags_to_try.extend(range(int(a), int(b) + 1))
                else:
                    lags_to_try.append(int(part))
        except ValueError:
            lags_to_try = []
        if not lags_to_try or min(lags_to_try) < 1:
            # Also covers reversed ranges such as "5-3", which expand to nothing.
            lags_to_try = []
            st.error("Invalid lag ranges. Enter positive integers or ranges such as 1,2,3-5.")

        # Other Parameters
        differentiation_options = [None, 1, 2]
        transformer_options = [None, 'StandardScaler', 'MinMaxScaler', 'RobustScaler']
        external_transform_options = [None, 'Log', 'Square Root']

        # Run Button for Auto-Tuning (skipped while the lag input is invalid)
        if st.button("Run Auto-Tuning") and lags_to_try:
            st.cache_data.clear()
            # Run the cached auto-tuning function
            auto_tuning_results = run_auto_tuning(train, test, lags_to_try, differentiation_options, transformer_options, external_transform_options)

            # Storing best configurations in session state
            st.session_state.best_config_rmse = auto_tuning_results.sort_values(by='RMSE').iloc[0]
            st.session_state.best_config_mape = auto_tuning_results.sort_values(by='MAPE').iloc[0]

            st.session_state.auto_tuning_results = auto_tuning_results
            st.success("Auto-tuning finished!")

        # Display auto-tuning results from session state
        if st.session_state.auto_tuning_results is not None:
            st.write("Auto-Tuning Results:")
            st.write(st.session_state.auto_tuning_results.sort_values(by='MAPE'))

            # Display Best Configurations for Each Metric
            col1, col2 = st.columns(2)
            with col1:
                st.write("Best Configuration for RMSE:", st.session_state.best_config_rmse)
            with col2:
                st.write("Best Configuration for MAPE:", st.session_state.best_config_mape)

        # ========================================== Section: Train Model ==========================================
        st.header("Train Model")

        # Initialize session state for the trained forecasters
        if 'forecaster' not in st.session_state:
            st.session_state.forecaster = None
            st.session_state.final_forecaster = None

        # Auto-tuned configurations are offered only when usable results exist
        tuning_results = st.session_state.auto_tuning_results
        has_tuning_results = isinstance(tuning_results, pd.DataFrame) and not tuning_results.empty

        config_choice = 'Manual Configuration'
        if has_tuning_results:
            config_choice = st.radio(
                "Choose Configuration to Use",
                ('Manual Configuration', 'Best RMSE Configuration', 'Best MAPE Configuration')
            )
        else:
            # Only manual configuration available
            st.write("Manual Configuration:")

        if config_choice == 'Manual Configuration':
            # Clamp the default so it never exceeds the slider maximum on
            # short training sets.
            max_lags = max(1, int(len(train) * 0.5))
            lags = st.slider("Select Lags", 1, max_lags, min(4, max_lags))
            differentiation = st.selectbox("Select Differentiation Order", [None, 1, 2])
            transformer_y = st.selectbox("Select Transformer", [None, 'StandardScaler', 'MinMaxScaler', 'RobustScaler'])
            external_transform = st.selectbox("Select External Transformation", [None, 'Log', 'Square Root'])
        else:
            best_metric = 'RMSE' if config_choice == 'Best RMSE Configuration' else 'MAPE'
            best_config = tuning_results.sort_values(by=best_metric).iloc[0]

            lags = int(best_config['Lag'])  # Convert to regular Python integer
            differentiation = int(best_config['Differentiation']) if pd.notna(best_config['Differentiation']) else None
            transformer_y = best_config['Transformer']
            external_transform = best_config['External Transformer']

        # Apply External Transformation
        train_transformed = apply_transformation(train[target_column], external_transform)

        # Train Button
        if st.button("Train"):
            st.cache_resource.clear()
            with st.spinner('Training in progress...'):
                # Turn the selected name into a scaler instance (None when
                # no scaler is selected); the module-level name is rebound
                # to the instance on purpose.
                transformer_y = select_transformer(transformer_y)

                forecaster = train_model(lags, differentiation, transformer_y, train_transformed)
                save_forecaster(forecaster, file_name='forecaster_temp.py', verbose=False)
                st.session_state.forecaster = forecaster
            st.success("Model trained successfully!")

        # ========================================== Section: Predict ==========================================

        # Initialize session state for prediction results
        if 'comparison_df' not in st.session_state:
            st.session_state.comparison_df = None
            st.session_state.predictions_reversed = None
            st.session_state.pred = None
            st.session_state.actual = None
            st.session_state.evaluation_results = None

        st.header("Predict")
        st.subheader("Forecast Configuration")
        # Keep the default inside [1, max]: an empty test set would otherwise
        # produce a default of 0, below the widget's minimum.
        max_steps = max(1, len(df))
        default_steps = min(max(1, len(test)), max_steps)
        n_steps = st.number_input("Number of Steps for Prediction", 1, max_steps, default_steps)

        # Predict Button
        if st.button("Predict"):
            if st.session_state.forecaster is None:
                st.warning("Please train a model before predicting.")
            else:
                st.cache_data.clear()
                forecaster = st.session_state.forecaster
                (
                    st.session_state.predictions_reversed,
                    st.session_state.actual,
                    st.session_state.pred,
                    st.session_state.comparison_df,
                    st.session_state.evaluation_results,
                ) = predict(forecaster, n_steps, external_transform, test, target_column)

        if st.session_state.comparison_df is not None:
            # Display Predictions vs Actual
            st.subheader("Predictions vs Actual Values")
            st.write(st.session_state.comparison_df)

            # Plotting Predictions vs Actual
            fig = go.Figure()
            fig.add_trace(go.Scatter(y=st.session_state.actual, mode='lines', name='Actual'))
            fig.add_trace(go.Scatter(y=st.session_state.pred['Predicted'], mode='lines', name='Predicted'))
            fig.update_layout(title='Actual vs Predicted Values', xaxis_title='Index', yaxis_title=target_column)
            st.plotly_chart(fig)

            # Plotting Train + Actual vs Train + Predicted
            fig_comparison = go.Figure()
            fig_comparison.add_trace(go.Scatter(x=train.index, y=train[target_column], mode='lines', name='Train'))
            fig_comparison.add_trace(go.Scatter(x=st.session_state.actual.index, y=st.session_state.actual, mode='lines', name='Actual'))
            fig_comparison.add_trace(go.Scatter(x=st.session_state.pred.index, y=st.session_state.pred['Predicted'], mode='lines', name='Predicted'))
            fig_comparison.update_layout(title='Train, Actual vs Predicted Values', xaxis_title='Date', yaxis_title=target_column)
            st.plotly_chart(fig_comparison)

            # Enhanced Evaluation Results Display
            st.subheader("Model Evaluation Results")

            col1, col2 = st.columns(2)
            with col1:
                st.metric(label="RMSE", value=f"{st.session_state.evaluation_results['RMSE']:.3f}")
            with col2:
                st.metric(label="MAPE", value=f"{st.session_state.evaluation_results['MAPE']*100:.3f} %")

        # ========================================== Section: Save & Download Model ==========================================
        st.header("Save & Download Model")

        # Refit Model
        if st.button("Refit Model on Entire Dataset"):
            if st.session_state.forecaster is None:
                st.warning("Please train a model before refitting.")
            else:
                forecaster = st.session_state.forecaster
                st.session_state.final_forecaster = refit(forecaster, df, target_column, external_transform)
                st.success("Model refitted on the entire dataset.")
        else:
            st.session_state.final_forecaster = st.session_state.forecaster

        save_method = st.selectbox("Select Save Method", ['SKForecast', 'Joblib', 'Pickle'])
        model_name = st.text_input("Enter Model Name", 'forecaster_model')

        # Save/Download Button
        if st.session_state.final_forecaster is None:
            # Nothing to serialize yet; avoid offering a file that holds no model.
            st.warning("Please train a model before saving or downloading it.")
        else:
            if save_method == 'SKForecast':
                file_name = f'{model_name}.py'
                download_label = "Download Model as SKForecast"
                download_mime = 'text/plain'
                save_forecaster(st.session_state.final_forecaster, file_name=file_name, verbose=False)

            elif save_method == 'Joblib':
                file_name = f'{model_name}.joblib'
                download_label = "Download Model as Joblib"
                download_mime = 'application/octet-stream'
                joblib.dump(st.session_state.final_forecaster, filename=file_name)

            elif save_method == 'Pickle':
                file_name = f'{model_name}.pkl'
                download_label = "Download Model as Pickle"
                download_mime = 'application/octet-stream'
                with open(file_name, 'wb') as file:
                    cloudpickle.dump(st.session_state.final_forecaster, file)

            # Read the serialized model back inside a context manager so the
            # file handle is closed on every rerun.
            with open(file_name, "rb") as model_file:
                model_bytes = model_file.read()
            st.download_button(label=download_label, data=model_bytes, file_name=file_name, mime=download_mime)

    else:
        st.error("Date column and Target column cannot be the same. Please select different columns.")
else:
    st.warning("Please upload an xlsx file to proceed.")