Spaces:
Sleeping
Sleeping
| ''' | |
| MMO Build Sprint 3 | |
| date : | |
| changes : capability to tune MixedLM as well as simple LR in the same page | |
| ''' | |
| import streamlit as st | |
| import pandas as pd | |
| from Eda_functions import format_numbers | |
| import pickle | |
| from utilities import set_header,load_local_css | |
| import statsmodels.api as sm | |
| import re | |
| from sklearn.preprocessing import MinMaxScaler | |
| import matplotlib.pyplot as plt | |
| from statsmodels.stats.outliers_influence import variance_inflation_factor | |
| st.set_option('deprecation.showPyplotGlobalUse', False) | |
| import statsmodels.formula.api as smf | |
| from Data_prep_functions import * | |
# Make sure every tuned-model slot exists in the session so later reads
# see None instead of a missing key.
for state_key in ("model_tuned", "X_train_tuned", "X_test_tuned", "tuned_model_features"):
    if state_key not in st.session_state:
        st.session_state[state_key] = None

# Page chrome: wide layout with the sidebar collapsed, then the project's
# stylesheet and header helpers.
st.set_page_config(
    page_title="Model Tuning",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed',
)
load_local_css('styles.css')
set_header()
# Sprint3: page-level switches and column names used by every section below.
is_panel = True
panel_col = 'markets'  # set the panel column
date_col = 'date'
target_col = 'total_approved_accounts_revenue'

st.title('1. Model Tuning')

_NO_MODEL_MESSAGE = (
    "Oops! It seems there are no saved models available. "
    "Please build and save a model from the previous page to proceed."
)

# Every key read below must be present; checking only "X_train" would let a
# partially populated session fall through to a raw KeyError.
_REQUIRED_STATE_KEYS = ('X_train', 'X_test', 'y_train', 'y_test', 'media_data')
if any(state_key not in st.session_state for state_key in _REQUIRED_STATE_KEYS):
    st.error(_NO_MODEL_MESSAGE)
    st.stop()

X_train = st.session_state['X_train']
X_test = st.session_state['X_test']
y_train = st.session_state['y_train']
y_test = st.session_state['y_test']
df = st.session_state['media_data']

# NOTE(review): pickle.load executes arbitrary code from the file; this is
# only safe while best_models.pkl is written by this application itself.
try:
    with open("best_models.pkl", 'rb') as model_file:
        model_dict = pickle.load(model_file)
except FileNotFoundError:
    # A missing pickle means the same thing to the user as a missing
    # session: no model has been built and saved yet.
    st.error(_NO_MODEL_MESSAGE)
    st.stop()

if 'selected_model' not in st.session_state:
    st.session_state['selected_model'] = 0
st.markdown('### 1.1 Event Flags')
st.markdown('Helps in quantifying the impact of specific occurrences of events')

with st.expander('Apply Event Flags'):
    # The chosen model entry supplies the fitted model, its training frame
    # and its feature list; these names are reused by the sections below.
    st.session_state["selected_model"] = st.selectbox('Select Model to apply flags', model_dict.keys())
    selected_entry = model_dict[st.session_state["selected_model"]]
    model = selected_entry['Model_object']
    date = pd.to_datetime(st.session_state['date'])
    X_train = selected_entry['X_train']
    features_set = selected_entry['feature_set']

    min_date = min(date)
    max_date = max(date)
    start_slot, end_slot, repeat_slot = st.columns(3)
    with start_slot:
        start_date = st.date_input('Select Start Date', min_date, min_value=min_date, max_value=max_date)
    with end_slot:
        end_date = st.date_input('Select End Date', max_date, min_value=min_date, max_value=max_date)
    with repeat_slot:
        repeat = st.selectbox('Repeat Annually', ['Yes', 'No'], index=1) == 'Yes'

    if 'Flags' not in st.session_state:
        st.session_state['Flags'] = {}

    # plot_actual_vs_predicted returns (metrics, flag line values, figure);
    # only the train figure is displayed, the test call is made to obtain
    # the flag values for the test period.
    if is_panel:  # Sprint3
        met, line_values, fig_flag = plot_actual_vs_predicted(
            X_train[date_col], y_train,
            model.fittedvalues, model,
            target_column='Revenue',
            flag=(start_date, end_date),
            repeat_all_years=repeat, is_panel=True)
        st.plotly_chart(fig_flag, use_container_width=True)

        # create flag on test
        met, test_line_values, fig_flag = plot_actual_vs_predicted(
            X_test[date_col], y_test,
            st.session_state['pred_test'], model,
            target_column='Revenue',
            flag=(start_date, end_date),
            repeat_all_years=repeat, is_panel=True)
    else:
        # Split the date axis at the actual training length rather than a
        # hard-coded 150 so the dates always line up with y_train / y_test.
        n_train = len(y_train)
        met, line_values, fig_flag = plot_actual_vs_predicted(
            date[:n_train], y_train, model.predict(X_train), model,
            flag=(start_date, end_date), repeat_all_years=repeat)
        st.plotly_chart(fig_flag, use_container_width=True)

        met, test_line_values, fig_flag = plot_actual_vs_predicted(
            date[n_train:], y_test, model.predict(X_test), model,
            flag=(start_date, end_date), repeat_all_years=repeat)

    flag_name = st.text_input('Enter Flag Name').strip()
    if st.button('Update flag'):
        if not flag_name:
            # An empty name would be stored under '' and later produce an
            # unusable column / formula term.
            st.warning('Please enter a flag name before updating.')
        elif is_panel and not flag_name.isidentifier():
            # In panel mode the flag name is inserted verbatim into the
            # model formula, so it has to be a plain identifier.
            st.warning('Flag name must contain only letters, digits and underscores, '
                       'and must not start with a digit.')
        else:
            st.session_state['Flags'][flag_name] = {
                'train': line_values,
                'test': test_line_values,
            }
            st.success(f'{flag_name} stored')

    # Render one checkbox per stored flag in a 4-wide grid; "Select all"
    # flips the default value of every flag checkbox.
    flag_options = list(st.session_state['Flags'].keys())
    num_columns = 4
    tick = st.checkbox('Select all')

    selected_options = []
    for row_start in range(0, len(flag_options), num_columns):
        grid_slots = st.columns(num_columns)
        row_options = flag_options[row_start:row_start + num_columns]
        for grid_slot, option in zip(grid_slots, row_options):
            if grid_slot.checkbox(option, value=tick):
                selected_options.append(option)
st.markdown('### 1.2 Select Parameters to Apply')

# One toggle per optional time feature, each with a short explanation
# rendered directly under its checkbox.
trend_slot, week_slot, wave_slot = st.columns(3)

with trend_slot:
    Trend = st.checkbox("**Trend**")
    st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')

with week_slot:
    week_number = st.checkbox('**Week_number**')
    st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')

with wave_slot:
    sine_cosine = st.checkbox('**Sine and Cosine Waves**')
    st.markdown('Helps in capturing cyclical patterns or seasonality in the data')
def _scale_panel_frame(scaler, frame, feature_cols, fit):
    """Scale the feature columns of *frame* and re-attach target/date/panel.

    The scaled frame gets a fresh 0..n-1 index, so the three pass-through
    columns are copied by position (``to_numpy``) rather than by index
    alignment, which would produce NaNs whenever *frame* is not indexed
    0..n-1.

    Args:
        scaler: the shared MinMaxScaler instance.
        frame: source DataFrame holding features, target, date and panel.
        feature_cols: list of feature column names to scale.
        fit: True to fit the scaler on this frame, False to only transform.
    """
    features = frame[feature_cols]
    scaled = scaler.fit_transform(features) if fit else scaler.transform(features)
    scaled_frame = pd.DataFrame(scaled, columns=features.columns)
    for passthrough_col in (target_col, date_col, panel_col):
        scaled_frame[passthrough_col] = frame[passthrough_col].to_numpy()
    return scaled_frame


def _stack_panels(parts, fallback):
    """Concatenate per-panel frames, or return a copy of *fallback* if none."""
    return pd.concat(parts) if parts else fallback.copy()


def _add_panel_trend(train_df, test_df):
    """Add a per-panel 'Trend' counter to train and test.

    Within each panel rows are sorted by date; train counts 1..n and the
    test counter continues at n+1 for the same panel. A panel that appears
    only in test starts at 1.
    """
    train_parts, train_lengths = [], {}
    for panel, group in train_df.groupby(panel_col):
        group = group.sort_values(date_col)
        group['Trend'] = np.arange(1, len(group) + 1, 1)
        train_lengths[panel] = len(group)
        train_parts.append(group)

    test_parts = []
    for panel, group in test_df.groupby(panel_col):
        group = group.sort_values(date_col)
        start = train_lengths.get(panel, 0) + 1
        group['Trend'] = np.arange(start, start + len(group), 1)
        test_parts.append(group)

    return _stack_panels(train_parts, train_df), _stack_panels(test_parts, test_df)


def _add_panel_seasonality(train_df, test_df):
    """Add per-panel 'sine_wave' / 'cosine_wave' columns to train and test.

    The wave position is the row's chronological position inside its panel;
    test positions continue where the panel's train rows stopped. Train and
    test are returned as separate frames so test rows never end up in the
    training data.
    """
    frequency = 2 * np.pi / 365  # Adjust the frequency as needed

    train_parts, train_lengths = [], {}
    for panel, group in train_df.groupby(panel_col):
        # Sort so that "position" means chronological position, matching
        # how the Trend feature is built.
        group = group.sort_values(date_col)
        steps = np.arange(len(group))
        group['sine_wave'] = np.sin(frequency * steps)
        group['cosine_wave'] = np.cos(frequency * steps)
        train_lengths[panel] = len(group)
        train_parts.append(group)

    test_parts = []
    for panel, group in test_df.groupby(panel_col):
        group = group.sort_values(date_col)
        start = train_lengths.get(panel, 0)
        steps = np.arange(start, start + len(group), 1)
        group['sine_wave'] = np.sin(frequency * steps)
        group['cosine_wave'] = np.cos(frequency * steps)
        test_parts.append(group)

    return _stack_panels(train_parts, train_df), _stack_panels(test_parts, test_df)


def _metric_value(table, row):
    """Read the value in column 1 of *table* at *row*, rounded to 2 decimals."""
    return np.round(table.iloc[row, 1], 2)


# A plain st.button is True only for the single rerun triggered by the click.
# The "Use this model" checkbox below lives inside this section, so ticking it
# causes a rerun in which the button is False again and the save code would
# never execute. Remember the click in session state to keep the section alive.
if st.button('Build model with Selected Parameters and Flags'):
    st.session_state['tuned_model_build_requested'] = True

if st.session_state.get('tuned_model_build_requested'):
    st.header('2.1 Results Summary')

    ss = MinMaxScaler()
    if is_panel:
        X_train_tuned = _scale_panel_frame(ss, X_train, features_set, fit=True)
        X_test_tuned = _scale_panel_frame(ss, X_test, features_set, fit=False)
    else:
        X_train_tuned = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
        X_train_tuned = sm.add_constant(X_train_tuned)
        X_test_tuned = pd.DataFrame(ss.transform(X_test), columns=X_test.columns)
        X_test_tuned = sm.add_constant(X_test_tuned)

    # Attach the selected event flags before any per-panel re-ordering so the
    # flag values stay on the rows they were computed for.
    for flag in selected_options:
        X_train_tuned[flag] = st.session_state['Flags'][flag]['train']
        X_test_tuned[flag] = st.session_state['Flags'][flag]['test']

    new_features = list(features_set)
    # Row counts of the simple-regression frames; used instead of a
    # hard-coded 150 wherever train and test positions are split.
    n_train = len(X_train_tuned)
    n_test = len(X_test_tuned)

    if Trend:
        # Sprint3 - trend is calculated for each panel separately.
        if is_panel:
            X_train_tuned, X_test_tuned = _add_panel_trend(X_train_tuned, X_test_tuned)
            new_features = new_features + ['Trend']

            # test
            # NOTE(review): these debug dumps require an existing "Test"
            # directory and raise otherwise - confirm they are still wanted.
            X_test_tuned.to_csv("Test/X_test_tuned_trend.csv", index=False)
            X_train_tuned.to_csv("Test/X_train_tuned_trend.csv", index=False)
            pd.concat([X_train_tuned, X_test_tuned]).sort_values([panel_col, date_col]).to_csv(
                "Test/X_train_test_tuned_trend.csv", index=False)
        else:
            X_train_tuned['Trend'] = np.arange(1, n_train + 1, 1)
            # The stop value is exclusive, so "+ 1" is needed to produce
            # exactly n_test values continuing after the train counter.
            X_test_tuned['Trend'] = np.arange(n_train + 1, n_train + n_test + 1, 1)

    if week_number:
        # Sprint3 - the feature is the day of the week taken from the date
        # column; it is only usable when the dates fall on different weekdays.
        if is_panel:
            X_train_tuned[date_col] = pd.to_datetime(X_train_tuned[date_col])
            X_train_tuned['Week_number'] = X_train_tuned[date_col].dt.day_of_week
            if X_train_tuned['Week_number'].nunique() == 1:
                st.write("All dates in the data are of the same week day. Hence Week number can't be used.")
            else:
                X_test_tuned[date_col] = pd.to_datetime(X_test_tuned[date_col])
                X_test_tuned['Week_number'] = X_test_tuned[date_col].dt.day_of_week
                new_features = new_features + ['Week_number']
        else:
            # pd.to_datetime on an array yields a DatetimeIndex, which exposes
            # day_of_week directly (the .dt accessor exists only on Series).
            day_of_week = np.asarray(pd.to_datetime(date.values).day_of_week)
            X_train_tuned['Week_number'] = day_of_week[:n_train]
            X_test_tuned['Week_number'] = day_of_week[n_train:]

    if sine_cosine:
        # Sprint3 - panel-wise sine / cosine waves.
        if is_panel:
            new_features = new_features + ['sine_wave', 'cosine_wave']
            X_train_tuned, X_test_tuned = _add_panel_seasonality(X_train_tuned, X_test_tuned)
        else:
            frequency = 2 * np.pi / 365  # Adjust the frequency as needed
            train_steps = np.arange(n_train)
            X_train_tuned['sine_wave'] = np.sin(frequency * train_steps)
            X_train_tuned['cosine_wave'] = np.cos(frequency * train_steps)

            # Test positions continue right after the last train position.
            test_steps = np.arange(n_train, n_train + n_test, 1)
            X_test_tuned['sine_wave'] = np.sin(frequency * test_steps)
            X_test_tuned['cosine_wave'] = np.cos(frequency * test_steps)

    # model
    if is_panel:
        if selected_options:
            new_features = new_features + selected_options

        formula = "{} ~ {}".format(target_col, " + ".join(new_features))
        md_tuned = smf.mixedlm(formula,
                               data=X_train_tuned[[target_col] + new_features],
                               groups=X_train_tuned[panel_col])
        model_tuned = md_tuned.fit()

        # plot act v pred for original model and tuned model
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
            X_train[date_col], y_train,
            model.fittedvalues, model,
            target_column='Revenue',
            is_panel=True)
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(
            X_train_tuned[date_col],
            X_train_tuned[target_col],
            model_tuned.fittedvalues,
            model_tuned,
            target_column='Revenue',
            is_panel=True)
    else:
        model_tuned = sm.OLS(y_train, X_train_tuned).fit()

        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
            date[:n_train], y_train,
            model.predict(X_train), model,
            target_column='Revenue')
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(
            date[:n_train], y_train,
            model_tuned.predict(X_train_tuned),
            model_tuned,
            target_column='Revenue')

    # Rows 0 / 1 / 2 of the metrics tables are read as MAPE / R2 / adjusted R2
    # (presumably the order produced by plot_actual_vs_predicted - verify).
    mape = _metric_value(metrics_table, 0)
    r2 = _metric_value(metrics_table, 1)
    adjr2 = _metric_value(metrics_table, 2)

    mape_tuned = _metric_value(metrics_table_tuned, 0)
    r2_tuned = _metric_value(metrics_table_tuned, 1)
    adjr2_tuned = _metric_value(metrics_table_tuned, 2)

    parameters_ = st.columns(3)
    with parameters_[0]:
        st.metric('R2', r2_tuned, np.round(r2_tuned - r2, 2))
    with parameters_[1]:
        st.metric('Adjusted R2', adjr2_tuned, np.round(adjr2_tuned - adjr2, 2))
    with parameters_[2]:
        st.metric('MAPE', mape_tuned, np.round(mape_tuned - mape, 2), 'inverse')

    st.header('2.2 Actual vs. Predicted Plot')
    # The tuned-model figure was already produced above for the active mode;
    # reuse it instead of recomputing it with is_panel hard-coded to True.
    st.plotly_chart(actual_vs_predicted_plot_tuned, use_container_width=True)

    st.markdown('## 2.3 Residual Analysis')
    # NOTE(review): these diagnostics are computed from the ORIGINAL model's
    # predictions, not from model_tuned - confirm that this is intended.
    baseline_predictions = model.predict(X_train)
    columns = st.columns(2)
    with columns[0]:
        fig = plot_residual_predicted(y_train, baseline_predictions, X_train)
        st.plotly_chart(fig)

    with columns[1]:
        st.empty()
        fig = qqplot(y_train, baseline_predictions)
        st.plotly_chart(fig)

    with columns[0]:
        fig = residual_distribution(y_train, baseline_predictions)
        st.pyplot(fig)

    if st.checkbox('Use this model to build response curves', key='123'):
        # NOTE(review): the model is stored under "tuned_model", while the
        # slot initialised at the top of the page is "model_tuned" - confirm
        # which key downstream pages read.
        st.session_state["tuned_model"] = model_tuned
        st.session_state["X_train_tuned"] = X_train_tuned
        st.session_state["X_test_tuned"] = X_test_tuned
        if is_panel:
            st.session_state["tuned_model_features"] = new_features
        with open("tuned_model.pkl", "wb") as f:
            pickle.dump(st.session_state['tuned_model'], f)
        st.success('Model saved!')
| # raw_data=df[features_set] | |
| # columns_raw=[re.split(r"(_lag|_adst)",col)[0] for col in raw_data.columns] | |
| # raw_data.columns=columns_raw | |
| # columns_media=[col for col in columns_raw if Categorised_data[col]['BB']=='Media'] | |
| # raw_data=raw_data[columns_media] | |
| # raw_data['Date']=list(df.index) | |
| # spends_var=[col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()] | |
| # spends_df=df[spends_var] | |
| # spends_df['Week']=list(df.index) | |
| # j=0 | |
| # X1=X.copy() | |
| # col=X1.columns | |
| # for i in model.params.values: | |
| # X1[col[j]]=X1.iloc[:,j]*i | |
| # j+=1 | |
| # contribution_df=X1 | |
| # contribution_df['Date']=list(df.index) | |
| # excel_file='Overview_data.xlsx' | |
| # with pd.ExcelWriter(excel_file,engine='xlsxwriter') as writer: | |
| # raw_data.to_excel(writer,sheet_name='RAW DATA MMM',index=False) | |
| # spends_df.to_excel(writer,sheet_name='SPEND INPUT',index=False) | |
| # contribution_df.to_excel(writer,sheet_name='CONTRIBUTION MMM') | |