| """ | |
| MMO Build Sprint 3 | |
| date : | |
| changes : capability to tune MixedLM as well as simple LR in the same page | |
| """ | |
import os
import pickle
import re
import sys
import traceback

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sqlite3
import statsmodels.api as sm
import statsmodels.formula.api as smf
import streamlit as st
from sklearn.preprocessing import MaxAbsScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

from constants import (
    NUM_FLAG_COLS_TO_DISPLAY,
    HALF_YEAR_THRESHOLD,
    FULL_YEAR_THRESHOLD,
    TREND_MIN,
    ANNUAL_FREQUENCY,
    QTR_FREQUENCY_FACTOR,
    HALF_YEARLY_FREQUENCY_FACTOR,
)
from data_analysis import format_numbers
from data_prep import *
from log_application import log_message
from post_gres_cred import db_cred
from utilities import (
    set_header,
    load_local_css,
    update_db,
    project_selection,
    retrieve_pkl_object,
)

schema = db_cred["schema"]

st.set_option("deprecation.showPyplotGlobalUse", False)
st.set_page_config(
    page_title="AI Model Tuning",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

load_local_css("styles.css")
set_header()

# Define functions
# Get random effects from the fitted MixedLM model
def get_random_effects(media_data, panel_col, _mdf):
    # Create an empty dataframe to hold one random intercept per panel
    random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])

    # Iterate over all panel values and add each intercept to the dataframe
    for i, market in enumerate(media_data[panel_col].unique()):
        intercept = _mdf.random_effects[market].values[0]
        random_eff_df.loc[i, "random_effect"] = intercept
        random_eff_df.loc[i, panel_col] = market

    return random_eff_df
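
# Note: statsmodels exposes MixedLMResults.random_effects as a dict of
# {group_label: Series}; with a random-intercept-only model, .values[0] is that
# panel's intercept shift, which is all get_random_effects needs to collect.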


# Predict on a dataframe using the MixedLM model
def mdf_predict(X_df, mdf, random_eff_df):
    # Create a copy of the input df and predict with the MixedLM model (fixed effects only)
    X = X_df.copy()
    X["fixed_effect"] = mdf.predict(X)

    # Merge panel-level random effects
    X = pd.merge(X, random_eff_df, on=panel_col, how="left")

    # Final prediction = fixed effect + panel-specific random intercept
    X["pred"] = X["fixed_effect"] + X["random_effect"]

    # Drop intermediate columns
    X.drop(columns=["fixed_effect", "random_effect"], inplace=True)
    return X["pred"]


def format_display(inp):
    # Format display titles: snake_case -> Title Case
    return inp.title().replace("_", " ").strip()


if "username" not in st.session_state:
    st.session_state["username"] = None

if "project_name" not in st.session_state:
    st.session_state["project_name"] = None

if "project_dct" not in st.session_state:
    project_selection()
    st.stop()

if "Flags" not in st.session_state:
    st.session_state["Flags"] = {}

try:
    # Check authentication
    if "username" in st.session_state and st.session_state["username"] is not None:
        if (
            retrieve_pkl_object(
                st.session_state["project_number"], "Model_Build", "best_models", schema
            )
            is None
        ):  # db
            st.error("Please save a model before tuning")
            log_message("warning", "No models saved", "Model Tuning")
            st.stop()

        # Read previous progress (persistence)
        if "session_state_saved" in st.session_state["project_dct"]["model_build"].keys():
            for key in [
                "Model",
                "date",
                "saved_model_names",
                "media_data",
                "X_test_spends",
                "spends_data",
            ]:
                if key not in st.session_state:
                    st.session_state[key] = st.session_state["project_dct"][
                        "model_build"
                    ]["session_state_saved"][key]

            st.session_state["bin_dict"] = st.session_state["project_dct"][
                "model_build"
            ]["session_state_saved"]["bin_dict"]

            if (
                "used_response_metrics" not in st.session_state
                or st.session_state["used_response_metrics"] == []
            ):
                st.session_state["used_response_metrics"] = st.session_state[
                    "project_dct"
                ]["model_build"]["session_state_saved"]["used_response_metrics"]
        else:
            st.error("Please load a session with a built model")
            log_message(
                "error",
                "Session state saved not found in Project Dictionary",
                "Model Tuning",
            )
            st.stop()

        for key in ["select_all_flags_check", "selected_flags", "sel_model"]:
            if key not in st.session_state["project_dct"]["model_tuning"].keys():
                st.session_state["project_dct"]["model_tuning"][key] = {}

        # is_panel = st.session_state['is_panel']
        # panel_col = 'markets'  # set the panel column
        date_col = "date"
        panel_col = "panel"  # set the panel column
        is_panel = st.session_state["media_data"][panel_col].nunique() > 1

        if "Model_Tuned" not in st.session_state:
            st.session_state["Model_Tuned"] = {}

        cols1 = st.columns([2, 1])
        with cols1[0]:
            st.markdown(f"**Welcome {st.session_state['username']}**")
        with cols1[1]:
            st.markdown(f"**Current Project: {st.session_state['project_name']}**")

        st.title("AI Model Tuning")

        # Flag indicating that no tuned model exists yet
        if "is_tuned_model" not in st.session_state:
            st.session_state["is_tuned_model"] = {}

        # Read all saved models
        model_dict = retrieve_pkl_object(
            st.session_state["project_number"], "Model_Build", "best_models", schema
        )
        saved_models = model_dict.keys()

        # Get list of response metrics
        st.session_state["used_response_metrics"] = list(
            set([model.split("__")[1] for model in saved_models])
        )
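
        # Saved-model keys are assumed to follow the "<model_name>__<response_metric>"
        # convention used throughout this page, so splitting on "__" recovers the
        # response metric, e.g. "ridge__total_sales" -> "total_sales" (hypothetical key).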

        # Select previously selected response_metric (persistence)
        default_target_idx = (
            st.session_state["project_dct"]["model_tuning"].get("sel_target_col")
            if st.session_state["project_dct"]["model_tuning"].get("sel_target_col")
            is not None
            else st.session_state["used_response_metrics"][0]
        )

        # Dropdown to select response metric
        sel_target_col = st.selectbox(
            "Select the response metric",
            st.session_state["used_response_metrics"],
            index=st.session_state["used_response_metrics"].index(default_target_idx),
            format_func=format_display,
        )

        # Format the selected response metric (target col)
        target_col = (
            sel_target_col.lower()
            .replace(" ", "_")
            .replace("-", "")
            .replace(":", "")
            .replace("__", "_")
        )
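        # e.g. a display name like "Total Sales - Online" (hypothetical) becomes
        # "total_sales_online", matching the format used in the saved-model keys.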
| st.session_state["project_dct"]["model_tuning"][ | |
| "sel_target_col" | |
| ] = sel_target_col | |
| # Look through all saved models, only show saved models of the selected resp metric (target_col) | |
| # Get a list of models saved for selected response metric | |
| required_saved_models = [ | |
| m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col | |
| ] | |
| # Get previously seelcted model if available (persistence) | |
| default_model_idx = st.session_state["project_dct"]["model_tuning"][ | |
| "sel_model" | |
| ].get(sel_target_col, required_saved_models[0]) | |
| sel_model = st.selectbox( | |
| "Select the model to tune", | |
| required_saved_models, | |
| index=required_saved_models.index(default_model_idx), | |
| ) | |
| st.session_state["project_dct"]["model_tuning"]["sel_model"][ | |
| sel_target_col | |
| ] = default_model_idx | |
| sel_model_dict = model_dict[ | |
| sel_model + "__" + target_col | |
| ] # get the model obj of the selected model | |
| X_train = sel_model_dict["X_train"] | |
| X_test = sel_model_dict["X_test"] | |
| y_train = sel_model_dict["y_train"] | |
| y_test = sel_model_dict["y_test"] | |
| df = st.session_state["media_data"] | |
| st.markdown("### Event Flags") | |
| st.markdown("Helps in quantifying the impact of specific occurrences of events") | |

        try:
            # Dropdown to add event flags
            with st.expander("Apply Event Flags"):
                model = sel_model_dict["Model_object"]
                date = st.session_state["date"]
                date = pd.to_datetime(date)
                X_train = sel_model_dict["X_train"]
                features_set = sel_model_dict["feature_set"]

                col = st.columns(3)

                # Get date range
                min_date = min(date).date()
                max_date = max(date).date()

                # Get previously selected flag start date (persistence); fall back
                # to min_date if it is missing or outside the data's date range
                start_date_default = st.session_state["project_dct"]["model_tuning"].get(
                    "start_date_default"
                )
                if start_date_default is None or not (
                    min_date < start_date_default < max_date
                ):
                    start_date_default = min_date

                # Get previously selected flag end date (persistence); fall back
                # to max_date if it is missing or outside the data's date range
                end_date_default = st.session_state["project_dct"]["model_tuning"].get(
                    "end_date_default"
                )
                if end_date_default is None or not (
                    min_date < end_date_default < max_date
                ):
                    end_date_default = max_date

                # Flag start and end date input boxes
                with col[0]:
                    start_date = st.date_input(
                        "Select Start Date",
                        start_date_default,
                        min_value=min_date,
                        max_value=max_date,
                    )
                    if (start_date < min_date) or (start_date > max_date):
                        st.error("Please select dates within the range of the data")
                        st.stop()

                with col[1]:
                    # Ensure the end date default is not before the selected start date
                    end_date_default = (
                        end_date_default
                        if pd.Timestamp(end_date_default) >= pd.Timestamp(start_date)
                        else start_date
                    )
                    end_date = st.date_input(
                        "Select End Date",
                        end_date_default,
                        min_value=max(pd.to_datetime(min_date), pd.to_datetime(start_date)),
                        max_value=pd.to_datetime(max_date),
                    )
                    if (
                        (start_date < min_date)
                        or (end_date < min_date)
                        or (start_date > max_date)
                        or (end_date > max_date)
                    ):
                        st.error("Please select dates within the range of the data")
                        st.stop()
                    if end_date < start_date:
                        st.error("Please select an end date after the start date")
                        st.stop()

                with col[2]:
                    # Get default value of the repeat checkbox (persistence)
                    repeat_default = (
                        st.session_state["project_dct"]["model_tuning"].get("repeat_default")
                        if st.session_state["project_dct"]["model_tuning"].get("repeat_default")
                        is not None
                        else "No"
                    )
                    repeat_default_idx = 0 if repeat_default.lower() == "yes" else 1
                    repeat = st.selectbox(
                        "Repeat Annually", ["Yes", "No"], index=repeat_default_idx
                    )

                # Update selected values in the session dictionary (persistence)
                st.session_state["project_dct"]["model_tuning"]["start_date_default"] = start_date
                st.session_state["project_dct"]["model_tuning"]["end_date_default"] = end_date
                st.session_state["project_dct"]["model_tuning"]["repeat_default"] = repeat

                repeat = repeat == "Yes"

                if "flags" in st.session_state["project_dct"]["model_tuning"].keys():
                    st.session_state["Flags"] = st.session_state["project_dct"][
                        "model_tuning"
                    ]["flags"]

                if is_panel:
                    # Create flag on train
                    met, line_values, fig_flag = plot_actual_vs_predicted(
                        X_train[date_col],
                        y_train,
                        model.fittedvalues,
                        model,
                        target_column=sel_target_col,
                        flag=(start_date, end_date),
                        repeat_all_years=repeat,
                        is_panel=True,
                    )
                    st.plotly_chart(fig_flag, use_container_width=True)

                    # Create flag on test
                    met, test_line_values, fig_flag = plot_actual_vs_predicted(
                        X_test[date_col],
                        y_test,
                        sel_model_dict["pred_test"],
                        model,
                        target_column=sel_target_col,
                        flag=(start_date, end_date),
                        repeat_all_years=repeat,
                        is_panel=True,
                    )
                else:
                    pred_train = model.predict(X_train[features_set])

                    # Create flag on train
                    met, line_values, fig_flag = plot_actual_vs_predicted(
                        X_train[date_col],
                        y_train,
                        pred_train,
                        model,
                        flag=(start_date, end_date),
                        repeat_all_years=repeat,
                        is_panel=False,
                    )
                    st.plotly_chart(fig_flag, use_container_width=True)

                    # Create flag on test
                    pred_test = model.predict(X_test[features_set])
                    met, test_line_values, fig_flag = plot_actual_vs_predicted(
                        X_test[date_col],
                        y_test,
                        pred_test,
                        model,
                        flag=(start_date, end_date),
                        repeat_all_years=repeat,
                        is_panel=False,
                    )
| flag_name = "f1_flag" | |
| flag_name = st.text_input("Enter Flag Name") | |
| # add selected target col to flag name | |
| # Save the flag name, flag train values, flag test values to session state | |
| if st.button("Save flag"): | |
| st.session_state["Flags"][flag_name + "_flag__" + target_col] = {} | |
| st.session_state["Flags"][flag_name + "_flag__" + target_col][ | |
| "train" | |
| ] = line_values | |
| st.session_state["Flags"][flag_name + "_flag__" + target_col][ | |
| "test" | |
| ] = test_line_values | |
| st.success(f'{flag_name + "_flag__" + target_col} stored') | |
| st.session_state["project_dct"]["model_tuning"]["flags"] = ( | |
| st.session_state["Flags"] | |
| ) | |

                # Only show flags created for the selected target col
                target_model_flags = [
                    f.split("__")[0]
                    for f in st.session_state["Flags"].keys()
                    if f.split("__")[1] == target_col
                ]
                options = list(target_model_flags)
                num_rows = -(-len(options) // NUM_FLAG_COLS_TO_DISPLAY)
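                # -(-a // b) is ceiling division: the number of checkbox rows needed
                # to lay the flags out across NUM_FLAG_COLS_TO_DISPLAY columns.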

                tick = False
                # Select-all flags checkbox
                if st.checkbox(
                    "Select all",
                    value=st.session_state["project_dct"]["model_tuning"][
                        "select_all_flags_check"
                    ].get(sel_target_col, False),
                ):
                    tick = True
                    st.session_state["project_dct"]["model_tuning"][
                        "select_all_flags_check"
                    ][sel_target_col] = True
                else:
                    st.session_state["project_dct"]["model_tuning"][
                        "select_all_flags_check"
                    ][sel_target_col] = False

                # Get previous flag selection (persistence)
                selection_defaults = st.session_state["project_dct"]["model_tuning"][
                    "selected_flags"
                ].get(sel_target_col, [])
                selected_options = selection_defaults

                # Create a checkbox for each available flag of the selected response metric
                for row in range(num_rows):
                    cols = st.columns(NUM_FLAG_COLS_TO_DISPLAY)
                    for col in cols:
                        if options:
                            option = options.pop(0)
                            option_default = option in selection_defaults
                            selected = col.checkbox(option, value=(tick or option_default))
                            if selected:
                                selected_options.append(option)
                            else:
                                if option in selected_options:
                                    selected_options.remove(option)

                selected_options = list(set(selected_options))

                # Check that flag values match the data; this is necessary because
                # different models can have different train/test dates
                remove_flags = []
                for opt in selected_options:
                    train_match = len(
                        st.session_state["Flags"][opt + "__" + target_col]["train"]
                    ) == len(X_train[date_col])
                    test_match = len(
                        st.session_state["Flags"][opt + "__" + target_col]["test"]
                    ) == len(X_test[date_col])
                    if not train_match:
                        st.warning(f"Flag {opt} cannot be used due to a train date mismatch")
                        remove_flags.append(opt)
                    if not test_match:
                        st.warning(f"Flag {opt} cannot be used due to a test date mismatch")
                        remove_flags.append(opt)

                if len(remove_flags) > 0 and len(set(selected_options) & set(remove_flags)) > 0:
                    selected_options = list(set(selected_options) - set(remove_flags))

                st.session_state["project_dct"]["model_tuning"]["selected_flags"][
                    sel_target_col
                ] = selected_options

        except:
            # Capture the error details
            exc_type, exc_value, exc_traceback = sys.exc_info()
            error_message = "".join(
                traceback.format_exception(exc_type, exc_value, exc_traceback)
            )
            log_message(
                "error", f"Error while creating flags: {error_message}", "Model Tuning"
            )
            st.warning("An error occurred, please try again", icon="⚠️")

        try:
            st.markdown("### Trend and Seasonality Calibration")
            parameters = st.columns(3)

            # Trend checkbox
            with parameters[0]:
                Trend = st.checkbox(
                    "**Trend**",
                    value=st.session_state["project_dct"]["model_tuning"].get(
                        "trend_check", False
                    ),
                )
                st.markdown(
                    "Helps account for long-term trends that could influence advertising effectiveness"
                )

            # Day of week checkbox
            with parameters[1]:
                day_of_week = st.checkbox(
                    "**Day of Week**",
                    value=st.session_state["project_dct"]["model_tuning"].get(
                        "week_num_check", False
                    ),
                )
                st.markdown(
                    "Assists in detecting and incorporating weekly patterns or seasonality"
                )

            # Sine and cosine waves checkbox
            with parameters[2]:
                sine_cosine = st.checkbox(
                    "**Sine and Cosine Waves**",
                    value=st.session_state["project_dct"]["model_tuning"].get(
                        "sine_cosine_check", False
                    ),
                )
                st.markdown(
                    "Helps capture long-term cyclical patterns or seasonality in the data"
                )

            if sine_cosine:
                # Dropdown to select the frequency of the waves
                xtrain_time_period_months = (
                    X_train[date_col].max() - X_train[date_col].min()
                ).days / 30

                # With HALF_YEAR_THRESHOLD months of data or fewer, only the
                # quarterly frequency is possible
                if xtrain_time_period_months <= HALF_YEAR_THRESHOLD:
                    available_frequencies = ["Quarter"]
                # With less than FULL_YEAR_THRESHOLD months, quarterly and
                # semi-annual frequencies are available
                elif xtrain_time_period_months < FULL_YEAR_THRESHOLD:
                    available_frequencies = ["Quarter", "Semi-Annual"]
                # With FULL_YEAR_THRESHOLD months or more, all three are available
                else:
                    available_frequencies = ["Quarter", "Semi-Annual", "Annual"]

                wave_freq = st.selectbox("Select Frequency", available_frequencies)

        except:
            # Capture the error details
            exc_type, exc_value, exc_traceback = sys.exc_info()
            error_message = "".join(
                traceback.format_exception(exc_type, exc_value, exc_traceback)
            )
            log_message(
                "error",
                f"Error while selecting tuning parameters: {error_message}",
                "Model Tuning",
            )
            st.warning("An error occurred, please try again", icon="⚠️")

        try:
            # Build tuned model
            if st.button(
                "Build model with Selected Parameters and Flags",
                key="build_tuned_model",
                use_container_width=True,
            ):
                new_features = features_set
                st.header("2.1 Results Summary")
                ss = MaxAbsScaler()

                if is_panel:
                    X_train_tuned = X_train[features_set].copy()
                    X_train_tuned[target_col] = X_train[target_col]
                    X_train_tuned[date_col] = X_train[date_col]
                    X_train_tuned[panel_col] = X_train[panel_col]

                    X_test_tuned = X_test[features_set].copy()
                    X_test_tuned[target_col] = X_test[target_col]
                    X_test_tuned[date_col] = X_test[date_col]
                    X_test_tuned[panel_col] = X_test[panel_col]
                else:
                    X_train_tuned = X_train[features_set].copy()
                    X_test_tuned = X_test[features_set].copy()

                # Get each selected flag's train and test values and add them to the data
                for flag in selected_options:
                    X_train_tuned[flag] = st.session_state["Flags"][
                        flag + "__" + target_col
                    ]["train"]
                    X_test_tuned[flag] = st.session_state["Flags"][
                        flag + "__" + target_col
                    ]["test"]

                if Trend:
                    st.session_state["project_dct"]["model_tuning"]["trend_check"] = True

                    # Group by panel and calculate the trend of each panel separately,
                    # then add "Trend" to the new feature set
                    if is_panel:
                        newdata = pd.DataFrame()
                        panel_wise_end_point_train = {}
                        for panel, groupdf in X_train_tuned.groupby(panel_col):
                            groupdf.sort_values(date_col, inplace=True)
                            # Trend is a straight line starting at TREND_MIN
                            groupdf["Trend"] = np.arange(
                                TREND_MIN, len(groupdf) + TREND_MIN, 1
                            )
                            newdata = pd.concat([newdata, groupdf])
                            panel_wise_end_point_train[panel] = len(groupdf) + TREND_MIN
                        X_train_tuned = newdata.copy()

                        test_newdata = pd.DataFrame()
                        for panel, test_groupdf in X_test_tuned.groupby(panel_col):
                            test_groupdf.sort_values(date_col, inplace=True)
                            start = panel_wise_end_point_train[panel]
                            end = start + len(test_groupdf)
                            test_groupdf["Trend"] = np.arange(start, end, 1)
                            test_newdata = pd.concat([test_newdata, test_groupdf])
                        X_test_tuned = test_newdata.copy()

                        new_features = new_features + ["Trend"]
                    else:
                        # Trend is a straight line starting at TREND_MIN
                        X_train_tuned["Trend"] = np.arange(
                            TREND_MIN, len(X_train_tuned) + TREND_MIN, 1
                        )
                        X_test_tuned["Trend"] = np.arange(
                            len(X_train_tuned) + TREND_MIN,
                            len(X_train_tuned) + len(X_test_tuned) + TREND_MIN,
                            1,
                        )
                        new_features = new_features + ["Trend"]
                else:
                    st.session_state["project_dct"]["model_tuning"][
                        "trend_check"
                    ] = False  # persistence

                # Add day of week (week_num) to test & train
                if day_of_week:
                    st.session_state["project_dct"]["model_tuning"]["week_num_check"] = True

                    if is_panel:
                        X_train_tuned[date_col] = pd.to_datetime(X_train_tuned[date_col])
                        X_train_tuned["day_of_week"] = X_train_tuned[date_col].dt.day_of_week

                        # If all dates in the data have the same day-of-week number,
                        # this feature can't be used
                        if X_train_tuned["day_of_week"].nunique() == 1:
                            st.error(
                                "All dates in the data fall on the same weekday, so Day of Week can't be used."
                            )
                        else:
                            X_test_tuned[date_col] = pd.to_datetime(X_test_tuned[date_col])
                            X_test_tuned["day_of_week"] = X_test_tuned[date_col].dt.day_of_week
                            new_features = new_features + ["day_of_week"]
                    else:
                        date = pd.to_datetime(date.values)
                        X_train_tuned["day_of_week"] = pd.to_datetime(
                            X_train[date_col]
                        ).dt.day_of_week
                        X_test_tuned["day_of_week"] = pd.to_datetime(
                            X_test[date_col]
                        ).dt.day_of_week

                        # If all dates in the data have the same day-of-week number,
                        # this feature can't be used
                        if X_train_tuned["day_of_week"].nunique() == 1:
                            st.error(
                                "All dates in the data fall on the same weekday, so Day of Week can't be used."
                            )
                        else:
                            new_features = new_features + ["day_of_week"]
                else:
                    st.session_state["project_dct"]["model_tuning"]["week_num_check"] = False

                # Create sine and cosine waves and add them to the data
                if sine_cosine:
                    st.session_state["project_dct"]["model_tuning"]["sine_cosine_check"] = True

                    frequency = ANNUAL_FREQUENCY  # annual frequency
                    if wave_freq == "Quarter":
                        frequency = frequency * QTR_FREQUENCY_FACTOR
                    elif wave_freq == "Semi-Annual":
                        frequency = frequency * HALF_YEARLY_FREQUENCY_FACTOR
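
                    # For observation t (periods since the series start) the waves are
                    #   sine_wave[t] = sin(frequency * t), cosine_wave[t] = cos(frequency * t)
                    # so one full cycle spans 2*pi/frequency observations; the quarterly
                    # and semi-annual factors scale the annual base frequency accordingly.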

                    # Create panel-wise sine and cosine waves in X_train_tuned
                    # and add them to the new feature set
                    if is_panel:
                        new_features = new_features + ["sine_wave", "cosine_wave"]

                        newdata = pd.DataFrame()
                        newdata_test = pd.DataFrame()
                        groups = X_train_tuned.groupby(panel_col)
                        train_panel_wise_end_point = {}
                        for panel, groupdf in groups:
                            num_samples = len(groupdf)
                            train_panel_wise_end_point[panel] = num_samples
                            days_since_start = np.arange(num_samples)
                            sine_wave = np.sin(frequency * days_since_start)
                            cosine_wave = np.cos(frequency * days_since_start)
                            sine_cosine_df = pd.DataFrame(
                                {"sine_wave": sine_wave, "cosine_wave": cosine_wave}
                            )
                            assert len(sine_cosine_df) == len(groupdf)
                            groupdf["sine_wave"] = sine_wave
                            groupdf["cosine_wave"] = cosine_wave
                            newdata = pd.concat([newdata, groupdf])
                        X_train_tuned = newdata.copy()

                        # Test waves continue from each panel's train end point
                        test_groups = X_test_tuned.groupby(panel_col)
                        for panel, test_groupdf in test_groups:
                            num_samples = len(test_groupdf)
                            start = train_panel_wise_end_point[panel]
                            days_since_start = np.arange(start, start + num_samples, 1)
                            sine_wave = np.sin(frequency * days_since_start)
                            cosine_wave = np.cos(frequency * days_since_start)
                            sine_cosine_df = pd.DataFrame(
                                {"sine_wave": sine_wave, "cosine_wave": cosine_wave}
                            )
                            assert len(sine_cosine_df) == len(test_groupdf)
                            test_groupdf["sine_wave"] = sine_wave
                            test_groupdf["cosine_wave"] = cosine_wave
                            newdata_test = pd.concat([newdata_test, test_groupdf])
                        X_test_tuned = newdata_test.copy()
                    else:
                        new_features = new_features + ["sine_wave", "cosine_wave"]

                        num_samples = len(X_train_tuned)
                        days_since_start = np.arange(num_samples)
                        sine_wave = np.sin(frequency * days_since_start)
                        cosine_wave = np.cos(frequency * days_since_start)
                        sine_cosine_df = pd.DataFrame(
                            {"sine_wave": sine_wave, "cosine_wave": cosine_wave}
                        )
                        # Concatenate the sine and cosine waves with the X DataFrame
                        X_train_tuned = pd.concat([X_train_tuned, sine_cosine_df], axis=1)

                        test_num_samples = len(X_test_tuned)
                        start = num_samples
                        days_since_start = np.arange(start, start + test_num_samples, 1)
                        sine_wave = np.sin(frequency * days_since_start)
                        cosine_wave = np.cos(frequency * days_since_start)
                        sine_cosine_df = pd.DataFrame(
                            {"sine_wave": sine_wave, "cosine_wave": cosine_wave}
                        )
                        # Concatenate the sine and cosine waves with the X DataFrame
                        X_test_tuned = pd.concat([X_test_tuned, sine_cosine_df], axis=1)
                else:
                    st.session_state["project_dct"]["model_tuning"]["sine_cosine_check"] = False

                # Build model
                # Scale only the newly added parameters; the original features were
                # already scaled during model build
                added_params = list(set(new_features) - set(features_set))
                if len(added_params) > 0:
                    concat_df = pd.concat([X_train_tuned, X_test_tuned]).reset_index(drop=True)
                    if is_panel:
                        train_max_date = X_train_tuned[date_col].max()
                    train_idx = X_train_tuned.index[-1]
                    concat_df[added_params] = ss.fit_transform(concat_df[added_params])
                    if is_panel:
                        X_train_tuned[added_params] = concat_df[
                            concat_df[date_col] <= train_max_date
                        ][added_params].reset_index(drop=True)
                        X_test_tuned[added_params] = concat_df[
                            concat_df[date_col] > train_max_date
                        ][added_params].reset_index(drop=True)
                    else:
                        added_params_df = concat_df[added_params]
                        X_train_tuned[added_params] = added_params_df[: train_idx + 1]
                        X_test_tuned[added_params] = added_params_df.loc[
                            train_idx + 1 :
                        ].reset_index(drop=True)

                # Add flags (flag values are 0/1 only, so there is no need to scale them)
                if selected_options:
                    new_features = new_features + selected_options

                # Build a MixedLM model for panel-level data
                if is_panel:
                    X_train_tuned = X_train_tuned.sort_values(
                        [date_col, panel_col]
                    ).reset_index(drop=True)

                    new_features = list(set(new_features))
                    inp_vars_str = " + ".join(new_features)
                    md_str = target_col + " ~ " + inp_vars_str
                    md_tuned = smf.mixedlm(
                        md_str,
                        data=X_train_tuned[[target_col] + new_features],
                        groups=X_train_tuned[panel_col],
                    )
                    model_tuned = md_tuned.fit()
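
                    # md_str is a Patsy-style formula, e.g. "total_sales ~ tv_spend + Trend"
                    # (hypothetical names); the random intercept per panel comes from `groups`.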

                    # Plot actual vs. predicted for the original and the tuned model
                    metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
                        X_train[date_col],
                        y_train,
                        model.fittedvalues,
                        model,
                        target_column=sel_target_col,
                        is_panel=True,
                    )
                    metrics_table_tuned, line, actual_vs_predicted_plot_tuned = (
                        plot_actual_vs_predicted(
                            X_train_tuned[date_col],
                            X_train_tuned[target_col],
                            model_tuned.fittedvalues,
                            model_tuned,
                            target_column=sel_target_col,
                            is_panel=True,
                        )
                    )

                # Build an OLS model for non-panel data
                else:
                    new_features = list(set(new_features))
                    model_tuned = sm.OLS(y_train, X_train_tuned[new_features]).fit()

                    metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
                        X_train[date_col],
                        y_train,
                        model.predict(X_train[features_set]),
                        model,
                        target_column=sel_target_col,
                    )
                    metrics_table_tuned, line, actual_vs_predicted_plot_tuned = (
                        plot_actual_vs_predicted(
                            X_train[date_col],
                            y_train,
                            model_tuned.predict(X_train_tuned[new_features]),
                            model_tuned,
                            target_column=sel_target_col,
                        )
                    )

                # ----------------------------------- TESTING -----------------------------------
                # Plot sine & cosine wave to test
                # sine_cosine_plot = plot_actual_vs_predicted(
                #     X_train[date_col],
                #     y_train,
                #     X_train_tuned['sine_wave'],
                #     model_tuned,
                #     target_column=sel_target_col,
                #     is_panel=True,
                # )
                # st.plotly_chart(sine_cosine_plot, use_container_width=True)

                # Plot trend line to test
                # trend_plot = plot_tuned_params(
                #     X_train[date_col],
                #     y_train,
                #     X_train_tuned['Trend'],
                #     model_tuned,
                #     target_column=sel_target_col,
                #     is_panel=True,
                # )
                # st.plotly_chart(trend_plot, use_container_width=True)

                # Plot week number to test
                # week_num_plot = plot_tuned_params(
                #     X_train[date_col],
                #     y_train,
                #     X_train_tuned['day_of_week'],
                #     model_tuned,
                #     target_column=sel_target_col,
                #     is_panel=True,
                # )
                # st.plotly_chart(week_num_plot, use_container_width=True)

                # Get model metrics from the metrics table and display them
                mape = np.round(metrics_table.iloc[0, 1], 2)
                r2 = np.round(metrics_table.iloc[1, 1], 2)
                adjr2 = np.round(metrics_table.iloc[2, 1], 2)
                mape_tuned = np.round(metrics_table_tuned.iloc[0, 1], 2)
                r2_tuned = np.round(metrics_table_tuned.iloc[1, 1], 2)
                adjr2_tuned = np.round(metrics_table_tuned.iloc[2, 1], 2)

                parameters_ = st.columns(3)
                with parameters_[0]:
                    st.metric("R-squared", r2_tuned, np.round(r2_tuned - r2, 2))
                with parameters_[1]:
                    st.metric("Adj. R-squared", adjr2_tuned, np.round(adjr2_tuned - adjr2, 2))
                with parameters_[2]:
                    # Lower MAPE is better, hence the "inverse" delta color
                    st.metric("MAPE", mape_tuned, np.round(mape_tuned - mape, 2), "inverse")

                st.write(model_tuned.summary())

                X_train_tuned[date_col] = X_train[date_col]
                X_train_tuned[target_col] = y_train
                X_test_tuned[date_col] = X_test[date_col]
                X_test_tuned[target_col] = y_test

                st.header("2.2 Actual vs. Predicted Plot (Train)")
                if is_panel:
                    metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
                        X_train_tuned[date_col],
                        X_train_tuned[target_col],
                        model_tuned.fittedvalues,
                        model_tuned,
                        target_column=sel_target_col,
                        is_panel=True,
                    )
                else:
                    metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
                        X_train_tuned[date_col],
                        X_train_tuned[target_col],
                        model_tuned.predict(X_train_tuned[new_features]),
                        model_tuned,
                        target_column=sel_target_col,
                        is_panel=False,
                    )
                st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

                st.markdown("## 2.3 Residual Analysis (Train)")
                if is_panel:
                    columns = st.columns(2)
                    with columns[0]:
                        fig = plot_residual_predicted(
                            y_train, model_tuned.fittedvalues, X_train_tuned
                        )
                        st.plotly_chart(fig)
                    with columns[1]:
                        st.empty()
                        fig = qqplot(y_train, model_tuned.fittedvalues)
                        st.plotly_chart(fig)
                    with columns[0]:
                        fig = residual_distribution(y_train, model_tuned.fittedvalues)
                        st.pyplot(fig)
                else:
                    columns = st.columns(2)
                    with columns[0]:
                        fig = plot_residual_predicted(
                            y_train,
                            model_tuned.predict(X_train_tuned[new_features]),
                            X_train,
                        )
                        st.plotly_chart(fig)
                    with columns[1]:
                        st.empty()
                        fig = qqplot(y_train, model_tuned.predict(X_train_tuned[new_features]))
                        st.plotly_chart(fig)
                    with columns[0]:
                        fig = residual_distribution(
                            y_train, model_tuned.predict(X_train_tuned[new_features])
                        )
                        st.pyplot(fig)

                # st.session_state['is_tuned_model'][target_col] = True
                # Save the tuned model in a dict
                st.session_state["Model_Tuned"][sel_model + "__" + target_col] = {
                    "Model_object": model_tuned,
                    "feature_set": new_features,
                    "X_train_tuned": X_train_tuned,
                    "X_test_tuned": X_test_tuned,
                }
| with st.expander("Results Summary Test data"): | |
| if is_panel: | |
| random_eff_df = get_random_effects( | |
| st.session_state.media_data.copy(), panel_col, model_tuned | |
| ) | |
| test_pred = mdf_predict( | |
| X_test_tuned, model_tuned, random_eff_df | |
| ) | |
| else: | |
| test_pred = model_tuned.predict(X_test_tuned[new_features]) | |
| st.header("2.2 Actual vs. Predicted Plot (Test)") | |
| metrics_table, line, actual_vs_predicted_plot = ( | |
| plot_actual_vs_predicted( | |
| X_test_tuned[date_col], | |
| y_test, | |
| test_pred, | |
| model, | |
| target_column=sel_target_col, | |
| is_panel=is_panel, | |
| ) | |
| ) | |
| st.plotly_chart(actual_vs_predicted_plot, use_container_width=True) | |
| st.markdown("## 2.3 Residual Analysis (Test)") | |
| columns = st.columns(2) | |
| with columns[0]: | |
| fig = plot_residual_predicted(y_test, test_pred, X_test_tuned) | |
| st.plotly_chart(fig) | |
| with columns[1]: | |
| st.empty() | |
| fig = qqplot(y_test, test_pred) | |
| st.plotly_chart(fig) | |
| with columns[0]: | |
| fig = residual_distribution(y_test, test_pred) | |
| st.pyplot(fig) | |

        except:
            # Capture the error details
            exc_type, exc_value, exc_traceback = sys.exc_info()
            error_message = "".join(
                traceback.format_exception(exc_type, exc_value, exc_traceback)
            )
            log_message(
                "error",
                f"Error while building tuned model: {error_message}",
                "Model Tuning",
            )
            st.warning("An error occurred, please try again", icon="⚠️")

        if (
            st.session_state["Model_Tuned"] is not None
            and len(list(st.session_state["Model_Tuned"].keys())) > 0
        ):
            if st.button("Use This model for Media Planning", use_container_width=True):
                # Remove previously tuned models saved for this target col
                _remove = [
                    m
                    for m in st.session_state["Model_Tuned"].keys()
                    if m.split("__")[1] == target_col and m.split("__")[0] != sel_model
                ]
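                # e.g. if "lasso__total_sales" was tuned earlier and "ridge__total_sales"
                # is being saved now, the stale lasso entry is dropped (hypothetical keys).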
                if len(_remove) > 0:
                    for m in _remove:
                        del st.session_state["Model_Tuned"][m]
                    log_message(
                        "info",
                        f"Tuned model {' '.join(_remove)} removed due to overwrite",
                        "Model Tuning",
                    )

                # Flag indicating a tuned model exists for the selected response metric
                st.session_state["is_tuned_model"][target_col] = True

                tuned_model_pkl = pickle.dumps(st.session_state["Model_Tuned"])
                update_db(
                    st.session_state["project_number"],
                    "Model_Tuning",
                    "tuned_model",
                    tuned_model_pkl,
                    schema,
                )  # db

                # Save session state variables (persistence)
                st.session_state["project_dct"]["model_tuning"]["session_state_saved"] = {}
                for key in [
                    "bin_dict",
                    "used_response_metrics",
                    "is_tuned_model",
                    "media_data",
                    "X_test_spends",
                    "spends_data",
                ]:
                    st.session_state["project_dct"]["model_tuning"][
                        "session_state_saved"
                    ][key] = st.session_state[key]

                project_dct_pkl = pickle.dumps(st.session_state["project_dct"])
                update_db(
                    st.session_state["project_number"],
                    "Model_Tuning",
                    "project_dct",
                    project_dct_pkl,
                    schema,
                )  # db

                log_message(
                    "info",
                    f'Tuned Model {sel_model + "__" + target_col} Saved',
                    "Model Tuning",
                )

                # Clear page metadata
                st.session_state["project_dct"]["scenario_planner"][
                    "modified_metadata_file"
                ] = None
                st.session_state["project_dct"]["response_curves"][
                    "modified_metadata_file"
                ] = None

                st.success(sel_model + " tuned for " + target_col + " and saved!")

except:
    # Capture the error details
    exc_type, exc_value, exc_traceback = sys.exc_info()
    error_message = "".join(
        traceback.format_exception(exc_type, exc_value, exc_traceback)
    )
    log_message("error", f"An error has occurred: {error_message}", "Model Tuning")
    st.warning("An error occurred, please try again", icon="⚠️")
    # st.write(error_message)