import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import streamlit as st
import pandas as pd
import statsmodels.api as sm

# from sklearn.metrics import mean_absolute_percentage_error
import sys
import os
import seaborn as sns
import matplotlib.pyplot as plt
import tempfile
from sklearn.preprocessing import MinMaxScaler

# from st_aggrid import AgGrid
# from st_aggrid import GridOptionsBuilder, GridUpdateMode
# from st_aggrid import GridOptionsBuilder
import re
import pickle
import traceback
from sklearn.metrics import r2_score
from data_prep import plot_actual_vs_predicted
import sqlite3
from utilities import (
    set_header,
    load_local_css,
    update_db,
    project_selection,
    retrieve_pkl_object,
)
from post_gres_cred import db_cred
from log_application import log_message
schema = db_cred["schema"]

sys.setrecursionlimit(10**6)

original_stdout = sys.stdout
sys.stdout = open("temp_stdout.txt", "w")
sys.stdout.close()
sys.stdout = original_stdout

st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()


## DEFINE ALL FUNCTIONS
def plot_residual_predicted(actual, predicted, df_):
    df_["Residuals"] = actual - pd.Series(predicted)
    df_["StdResidual"] = (df_["Residuals"] - df_["Residuals"].mean()) / df_[
        "Residuals"
    ].std()

    # Create a Plotly scatter plot
    fig = px.scatter(
        df_,
        x=predicted,
        y="StdResidual",
        opacity=0.5,
        color_discrete_sequence=["#11B6BD"],
    )

    # Add horizontal lines
    fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
    fig.add_hline(y=2, line_color="red")
    fig.add_hline(y=-2, line_color="red")

    fig.update_xaxes(title="Predicted")
    fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")

    # Set the same width and height for both figures
    fig.update_layout(
        title="Residuals over Predicted Values",
        autosize=False,
        width=600,
        height=400,
    )
    return fig
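# Note: the +/-2 bands drawn by plot_residual_predicted are a common rule of thumb:
# under an approximately normal error distribution, roughly 95% of standardized
# residuals fall within +/-2, so points outside the red lines are candidate outliers.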
def residual_distribution(actual, predicted):
    Residuals = actual - pd.Series(predicted)

    # Create a Seaborn distribution plot
    sns.set(style="whitegrid")
    plt.figure(figsize=(6, 4))
    sns.histplot(Residuals, kde=True, color="#11B6BD")

    plt.title("Distribution of Residuals")
    plt.xlabel("Residuals")
    plt.ylabel("Probability Density")

    return plt
def qqplot(actual, predicted):
    Residuals = actual - pd.Series(predicted)
    Residuals = pd.Series(Residuals)
    Resud_std = (Residuals - Residuals.mean()) / Residuals.std()

    # Create a QQ plot using Plotly with custom colors
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=sm.ProbPlot(Resud_std).theoretical_quantiles,
            y=sm.ProbPlot(Resud_std).sample_quantiles,
            mode="markers",
            marker=dict(size=5, color="#11B6BD"),
            name="QQ Plot",
        )
    )

    # Add the 45-degree reference line
    diagonal_line = go.Scatter(
        x=[-2, 2],  # Adjust the x values as needed to fit the range of your data
        y=[-2, 2],  # Adjust the y values accordingly
        mode="lines",
        line=dict(color="red"),  # Customize the line color and style
        name=" ",
    )
    fig.add_trace(diagonal_line)

    # Customize the layout
    fig.update_layout(
        title="QQ Plot of Residuals",
        title_x=0.5,
        autosize=False,
        width=600,
        height=400,
        xaxis_title="Theoretical Quantiles",
        yaxis_title="Sample Quantiles",
    )
    return fig
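# Reading the QQ plot: if the residuals are approximately normal, the points should
# track the red 45-degree reference line; systematic curvature away from the line
# indicates skewed or heavy-tailed residuals.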
def get_random_effects(media_data, panel_col, mdf):
    random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])

    for i, market in enumerate(media_data[panel_col].unique()):
        print(i, end="\r")
        intercept = mdf.random_effects[market].values[0]
        random_eff_df.loc[i, "random_effect"] = intercept
        random_eff_df.loc[i, panel_col] = market

    return random_eff_df
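# For the statsmodels mixed-effects results used here, mdf.random_effects maps each
# panel (market) to its estimated random intercept. mdf_predict() below adds that
# panel-specific intercept back onto the fixed-effects prediction:
#     pred = fixed-effects prediction + random intercept of the row's panel.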
def mdf_predict(X_df, mdf, random_eff_df):
    X = X_df.copy()
    X = pd.merge(
        X,
        random_eff_df[[panel_col, "random_effect"]],
        on=panel_col,
        how="left",
    )
    X["pred_fixed_effect"] = mdf.predict(X)

    X["pred"] = X["pred_fixed_effect"] + X["random_effect"]
    X.drop(columns=["pred_fixed_effect", "random_effect"], inplace=True)

    return X
def metrics_df_panel(model_dict, is_panel):
    def wmape(actual, forecast):
        # Weighted MAPE (WMAPE) avoids two shortcomings of MAPE & SMAPE:
        #   1. MAPE becomes extremely large when the actual value is close to 0
        #   2. MAPE is more favourable to under-forecasting than over-forecasting
        return np.sum(np.abs(actual - forecast)) / np.sum(np.abs(actual))
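    # Illustrative example (not executed): with actual = [100, 0, 50] and
    # forecast = [110, 5, 45], WMAPE = (10 + 5 + 5) / (100 + 0 + 50) ~= 0.13,
    # whereas plain MAPE would blow up on the zero actual.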
    metrics_df = pd.DataFrame(
        columns=[
            "Model",
            "R2",
            "ADJR2",
            "Train Mape",
            "Test Mape",
            "Summary",
            "Model_object",
        ]
    )

    i = 0
    for key in model_dict.keys():
        target = key.split("__")[1]
        metrics_df.at[i, "Model"] = target

        y = model_dict[key]["X_train_tuned"][target]
        feature_set = model_dict[key]["feature_set"]

        if is_panel:
            random_df = get_random_effects(
                media_data, panel_col, model_dict[key]["Model_object"]
            )
            pred = mdf_predict(
                model_dict[key]["X_train_tuned"],
                model_dict[key]["Model_object"],
                random_df,
            )["pred"]
        else:
            pred = model_dict[key]["Model_object"].predict(
                model_dict[key]["X_train_tuned"][feature_set]
            )

        ytest = model_dict[key]["X_test_tuned"][target]
        if is_panel:
            predtest = mdf_predict(
                model_dict[key]["X_test_tuned"],
                model_dict[key]["Model_object"],
                random_df,
            )["pred"]
        else:
            predtest = model_dict[key]["Model_object"].predict(
                model_dict[key]["X_test_tuned"][feature_set]
            )

        metrics_df.at[i, "R2"] = r2_score(y, pred)
| metrics_df.at[i, "ADJR2"] = 1 - (1 - metrics_df.loc[i, "R2"]) * (len(y) - 1) / ( | |
| len(y) - len(model_dict[key]["feature_set"]) - 1 | |
| ) | |
| # metrics_df.at[i, "Train Mape"] = mean_absolute_percentage_error(y, pred) | |
| # metrics_df.at[i, "Test Mape"] = mean_absolute_percentage_error( | |
| # ytest, predtest | |
| # ) | |
| metrics_df.at[i, "Train Mape"] = wmape(y, pred) | |
| metrics_df.at[i, "Test Mape"] = wmape(ytest, predtest) | |
| metrics_df.at[i, "Summary"] = model_dict[key]["Model_object"].summary() | |
| metrics_df.at[i, "Model_object"] = model_dict[key]["Model_object"] | |
| i += 1 | |
| metrics_df = np.round(metrics_df, 2) | |
| metrics_df.rename( | |
| columns={"R2": "R-squared", "ADJR2": "Adj. R-squared"}, inplace=True | |
| ) | |
| return metrics_df | |
def map_channel(transformed_var, channel_dict):
    for key, value_list in channel_dict.items():
        if any(raw_var in transformed_var for raw_var in value_list):
            return key
    return transformed_var  # Return the original value if no match is found
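# Hypothetical example: map_channel("tv_spend_adstock_contr", {"TV": ["tv_spend"]})
# returns "TV" because the raw variable name "tv_spend" appears inside the
# transformed/contribution column name; columns with no match pass through unchanged.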
def contributions_nonpanel(model_dict):
    # with open(os.path.join(st.session_state["project_path"], "channel_groups.pkl"), "rb") as f:
    #     channels = pickle.load(f)
    channels = st.session_state["project_dct"]["data_import"]["group_dict"]  # db

    media_data = st.session_state["media_data"]
    contribution_df = pd.DataFrame(columns=["Channel"])

    for key in model_dict.keys():
        best_feature_set = model_dict[key]["feature_set"]
        model = model_dict[key]["Model_object"]
        target = key.split("__")[1]
        X_train = model_dict[key]["X_train_tuned"]
        contri_df = pd.DataFrame()

        y = []
        y_pred = []

        coef_df = pd.DataFrame(model.params)
        coef_df.reset_index(inplace=True)
        coef_df.columns = ["feature", "coef"]

        x_train_contribution = X_train.copy()
        x_train_contribution["pred"] = model.predict(X_train[best_feature_set])
        for i in range(len(coef_df)):
            coef = coef_df.loc[i, "coef"]
            col = coef_df.loc[i, "feature"]
            if col != "const":
                x_train_contribution[str(col) + "_contr"] = (
                    coef * x_train_contribution[col]
                )
            else:
                x_train_contribution["const"] = coef

        tuning_cols = [
            c
            for c in x_train_contribution.filter(regex="contr").columns
            if c
            in [
                "day_of_week_contr",
                "Trend_contr",
                "sine_wave_contr",
                "cosine_wave_contr",
            ]
        ]
        flag_cols = [
            c
            for c in x_train_contribution.filter(regex="contr").columns
            if "_flag" in c
        ]

        # add exogenous contribution to base
        all_exog_vars = st.session_state["bin_dict"]["Exogenous"]
        all_exog_vars = [
            var.lower()
            .replace(".", "_")
            .replace("@", "_")
            .replace(" ", "_")
            .replace("-", "")
            .replace(":", "")
            .replace("__", "_")
            for var in all_exog_vars
        ]
        exog_cols = []
        if len(all_exog_vars) > 0:
            for col in x_train_contribution.filter(regex="contr").columns:
                if len([exog_var for exog_var in all_exog_vars if exog_var in col]) > 0:
                    exog_cols.append(col)

        base_cols = ["const"] + flag_cols + tuning_cols + exog_cols
        x_train_contribution["base_contr"] = x_train_contribution[base_cols].sum(axis=1)
        x_train_contribution.drop(columns=base_cols, inplace=True)

        contri_df = pd.DataFrame(x_train_contribution.filter(regex="contr").sum(axis=0))
        contri_df.reset_index(inplace=True)
        contri_df.columns = ["Channel", target]
        contri_df["Channel"] = contri_df["Channel"].apply(
            lambda x: map_channel(x, channels)
        )
        contri_df[target] = 100 * contri_df[target] / contri_df[target].sum()
        contri_df["Channel"].replace("base_contr", "base", inplace=True)

        contribution_df = pd.merge(
            contribution_df, contri_df, on="Channel", how="outer"
        )

    return contribution_df
def contributions_panel(model_dict):
    channels = st.session_state["project_dct"]["data_import"]["group_dict"]  # db
    media_data = st.session_state["media_data"]
    contribution_df = pd.DataFrame(columns=["Channel"])

    for key in model_dict.keys():
        best_feature_set = model_dict[key]["feature_set"]
        model = model_dict[key]["Model_object"]
        target = key.split("__")[1]
        X_train = model_dict[key]["X_train_tuned"]
        contri_df = pd.DataFrame()

        y = []
        y_pred = []

        random_eff_df = get_random_effects(media_data, panel_col, model)
        random_eff_df["fixed_effect"] = model.fe_params["Intercept"]
        random_eff_df["panel_effect"] = (
            random_eff_df["random_effect"] + random_eff_df["fixed_effect"]
        )
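        # Each panel's baseline ("panel_effect") is the shared fixed intercept plus
        # that panel's random intercept; it plays the role of the constant term from
        # the non-panel case and is rolled into the "base" contribution further down.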
        coef_df = pd.DataFrame(model.fe_params)
        coef_df.reset_index(inplace=True)
        coef_df.columns = ["feature", "coef"]

        x_train_contribution = X_train.copy()
        x_train_contribution = mdf_predict(x_train_contribution, model, random_eff_df)
        x_train_contribution = pd.merge(
            x_train_contribution,
            random_eff_df[[panel_col, "panel_effect"]],
            on=panel_col,
            how="left",
        )

        for i in range(len(coef_df)):
            coef = coef_df.loc[i, "coef"]
            col = coef_df.loc[i, "feature"]
            if col.lower() != "intercept":
                x_train_contribution[str(col) + "_contr"] = (
                    coef * x_train_contribution[col]
                )

        # x_train_contribution['sum_contributions'] = x_train_contribution.filter(regex="contr").sum(axis=1)
        # x_train_contribution['sum_contributions'] = x_train_contribution['sum_contributions'] + x_train_contribution[
        #     'panel_effect']

        # base_cols = ["panel_effect"] + [
        #     c
        #     for c in x_train_contribution.filter(regex="contr").columns
        #     if c
        #     in [
        #         "day_of_week_contr",
        #         "Trend_contr",
        #         "sine_wave_contr",
        #         "cosine_wave_contr",
        #     ]
        # ]

        tuning_cols = [
            c
            for c in x_train_contribution.filter(regex="contr").columns
            if c
            in [
                "day_of_week_contr",
                "Trend_contr",
                "sine_wave_contr",
                "cosine_wave_contr",
            ]
        ]
        flag_cols = [
            c
            for c in x_train_contribution.filter(regex="contr").columns
            if "_flag" in c
        ]

        # add exogenous contribution to base
        all_exog_vars = st.session_state["bin_dict"]["Exogenous"]
        all_exog_vars = [
            var.lower()
            .replace(".", "_")
            .replace("@", "_")
            .replace(" ", "_")
            .replace("-", "")
            .replace(":", "")
            .replace("__", "_")
            for var in all_exog_vars
        ]
        exog_cols = []
        if len(all_exog_vars) > 0:
            for col in x_train_contribution.filter(regex="contr").columns:
                if len([exog_var for exog_var in all_exog_vars if exog_var in col]) > 0:
                    exog_cols.append(col)

        base_cols = ["panel_effect"] + flag_cols + tuning_cols + exog_cols
        x_train_contribution["base_contr"] = x_train_contribution[base_cols].sum(axis=1)
        x_train_contribution.drop(columns=base_cols, inplace=True)

        contri_df = pd.DataFrame(x_train_contribution.filter(regex="contr").sum(axis=0))
        contri_df.reset_index(inplace=True)
        contri_df.columns = ["Channel", target]
        contri_df[target] = 100 * contri_df[target] / contri_df[target].sum()
        contri_df["Channel"] = contri_df["Channel"].apply(
            lambda x: map_channel(x, channels)
        )
        contri_df["Channel"].replace("base_contr", "base", inplace=True)

        contribution_df = pd.merge(
            contribution_df, contri_df, on="Channel", how="outer"
        )

    # st.session_state["contribution_df"] = contributions_panel(tuned_model_dict)
    return contribution_df
def create_grouped_bar_plot(contribution_df, contribution_selections):
    # Extract the 'Channel' names
    channel_names = contribution_df["Channel"].tolist()

    # Dictionary to store all contributions except 'const' and 'base'
    all_contributions = {
        name: [] for name in channel_names if name not in ["const", "base"]
    }

    # Dictionary to store base sales for each selection
    base_sales_dict = {}

    # Accumulate contributions for each channel from each selection
    for selection in contribution_selections:
        contributions = contribution_df[selection].values.astype(float)
        base_sales = 0  # Initialize base sales for the current selection
        for channel_name, contribution in zip(channel_names, contributions):
            if channel_name in all_contributions:
                all_contributions[channel_name].append(contribution)
            elif channel_name == "base":
                base_sales = contribution  # Capture base sales for the current selection

        # Store base sales for each selection
        base_sales_dict[selection] = base_sales

    # Calculate the average of contributions and sort by this average
    sorted_channels = sorted(all_contributions.items(), key=lambda x: -np.mean(x[1]))
    sorted_channel_names = [name for name, _ in sorted_channels]
    sorted_channel_names = ["Base Sales"] + sorted_channel_names  # Adding 'Base Sales' at the start

    trace_data = []
    max_value = 0  # Initialize max_value to find the highest bar for y-axis adjustment

    # Create traces for the grouped bar chart
    for i, selection in enumerate(contribution_selections):
        display_name = sorted_channel_names
        display_contribution = [base_sales_dict[selection]] + [
            all_contributions[name][i] for name in sorted_channel_names[1:]
        ]  # Start with base sales for the current selection

        # Generating text labels for each bar
        text_values = [
            f"{val}%" for val in np.round(display_contribution, 0).astype(int)
        ]

        # Find the max value for y-axis calculation
        max_contribution = max(display_contribution)
        if max_contribution > max_value:
            max_value = max_contribution

        # Create a bar trace for each selection
        trace = go.Bar(
            x=display_name,
            y=display_contribution,
            name=selection,
            text=text_values,
            textposition="outside",
        )
        trace_data.append(trace)

    # Define layout for the bar chart
    layout = go.Layout(
        title="Metrics Contribution by Channel (Train)",
        xaxis=dict(title="Channel Name"),
        yaxis=dict(
            title="Metrics Contribution", range=[0, max_value * 1.2]
        ),  # Set y-axis 20% higher than the max bar
        barmode="group",
        plot_bgcolor="white",
    )

    # Create the figure with trace data and layout
    fig = go.Figure(data=trace_data, layout=layout)
    return fig
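# Hypothetical usage sketch (these response-metric column names are assumptions):
#     fig = create_grouped_bar_plot(contribution_df, ["revenue", "app_installs"])
# draws one bar group per channel with a bar per selected response metric, each bar
# showing that channel's percentage contribution on the training data.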
def preprocess_and_plot(contribution_df, contribution_selections):
    # Extract the 'Channel' names
    channel_names = contribution_df["Channel"].tolist()

    # Dictionary to store all contributions except 'const' and 'base'
    all_contributions = {
        name: [] for name in channel_names if name not in ["const", "base"]
    }

    # Dictionary to store base sales for each selection
    base_sales_dict = {}

    # Accumulate contributions for each channel from each selection
    for selection in contribution_selections:
        contributions = contribution_df[selection].values.astype(float)
        base_sales = 0  # Initialize base sales for the current selection
        for channel_name, contribution in zip(channel_names, contributions):
            if channel_name in all_contributions:
                all_contributions[channel_name].append(contribution)
            elif channel_name == "base":
                base_sales = contribution  # Capture base sales for the current selection

        # Store base sales for each selection
        base_sales_dict[selection] = base_sales

    # Calculate the average of contributions and sort by this average
    sorted_channels = sorted(all_contributions.items(), key=lambda x: -np.mean(x[1]))
    sorted_channel_names = [name for name, _ in sorted_channels]
    sorted_channel_names = ["Base Sales"] + sorted_channel_names  # Adding 'Base Sales' at the start

    # Initialize a Plotly figure
    fig = go.Figure()

    for i, selection in enumerate(contribution_selections):
        display_name = ["Base Sales"] + sorted_channel_names[1:]  # Channel names for the plot
        display_contribution = [base_sales_dict[selection]]  # Start with base sales for the current selection

        # Append average contributions for other channels
        for name in sorted_channel_names[1:]:
            display_contribution.append(all_contributions[name][i])

        # Generating text labels for each bar
        text_values = [
            f"{val}%" for val in np.round(display_contribution, 0).astype(int)
        ]

        # Add a waterfall trace for each selection
        fig.add_trace(
            go.Waterfall(
                orientation="v",
                measure=["relative"] * len(display_contribution),
                x=display_name,
                text=text_values,
                textposition="outside",
                y=display_contribution,
                increasing={"marker": {"color": "green"}},
                decreasing={"marker": {"color": "red"}},
                totals={"marker": {"color": "blue"}},
                name=selection,
            )
        )

    # Update layout of the figure
    fig.update_layout(
        title="Metrics Contribution by Channel (Train)",
        xaxis={"title": "Channel Name"},
        yaxis=dict(title="Metrics Contribution", range=[0, 100 * 1.2]),
    )
    return fig
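# Because every waterfall step uses the "relative" measure and channel contributions
# are percentages that sum to 100 per response metric, the bars build up cumulatively
# from Base Sales towards 100%; the y-axis is capped at 120 to leave headroom for labels.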
def selection_change():
    edited_rows: dict = st.session_state.project_selection["edited_rows"]
    st.session_state["selected_row_index_gd_table"] = next(iter(edited_rows))
    st.session_state["gd_table"] = st.session_state["gd_table"].assign(selected=False)

    update_dict = {idx: values for idx, values in edited_rows.items()}

    st.session_state["gd_table"].update(
        pd.DataFrame.from_dict(update_dict, orient="index")
    )
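# Note: selection_change is written as the on_change callback for the st.data_editor
# keyed "project_selection" further down; it reads the editor's "edited_rows" state and
# marks the most recently edited row as selected. The callback is currently commented
# out where the editor is created.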
| if "username" not in st.session_state: | |
| st.session_state["username"] = None | |
| if "project_name" not in st.session_state: | |
| st.session_state["project_name"] = None | |
| if "project_dct" not in st.session_state: | |
| project_selection() | |
| st.stop() | |
| try: | |
| st.session_state["bin_dict"] = st.session_state["project_dct"]["data_import"][ | |
| "category_dict" | |
| ] # db | |
| except Exception as e: | |
| st.warning("Save atleast one tuned model to proceed") | |
| log_message("warning", "No tuned models available", "AI Model Results") | |
| st.stop() | |
| if "gd_table" not in st.session_state: | |
| st.session_state["gd_table"] = pd.DataFrame() | |
try:
    if "username" in st.session_state and st.session_state["username"] is not None:
        if (
            retrieve_pkl_object(
                st.session_state["project_number"],
                "Model_Tuning",
                "tuned_model",
                schema,
            )
            is None
        ):
            st.error("Please save a tuned model")
            st.stop()

        if (
            "session_state_saved"
            in st.session_state["project_dct"]["model_tuning"].keys()
            and st.session_state["project_dct"]["model_tuning"]["session_state_saved"]
            != []
        ):
            for key in ["used_response_metrics", "media_data", "bin_dict"]:
                if key not in st.session_state:
                    st.session_state[key] = st.session_state["project_dct"][
                        "model_tuning"
                    ]["session_state_saved"][key]

            # st.session_state["bin_dict"] = st.session_state["project_dct"][
            #     "model_build"
            # ]["session_state_saved"]["bin_dict"]

        media_data = st.session_state["media_data"]
        # st.write(media_data.columns)

        # set the panel column
        panel_col = "panel"
        is_panel = st.session_state["media_data"][panel_col].nunique() > 1
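        # The page runs in "panel" mode when the data contains more than one panel
        # (e.g. market): models are then mixed-effects with per-panel random
        # intercepts and predictions go through mdf_predict(); with a single panel,
        # the fitted model's .predict() is used directly.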
        # st.write(is_panel)

        date_col = "date"

        transformed_data = st.session_state["project_dct"]["transformations"][
            "final_df"
        ]  # db

        tuned_model_dict = retrieve_pkl_object(
            st.session_state["project_number"], "Model_Tuning", "tuned_model", schema
        )  # db

        feature_set_dct = {
            key.split("__")[1]: key_dict["feature_set"]
            for key, key_dict in tuned_model_dict.items()
        }
        # TODO: fetch the feature set from the saved model instead of rebuilding it here

        if "contribution_df" not in st.session_state:
            st.session_state["contribution_df"] = None
        metrics_table = metrics_df_panel(tuned_model_dict, is_panel)

        cols1 = st.columns([2, 1])
        with cols1[0]:
            st.markdown(f"**Welcome {st.session_state['username']}**")
        with cols1[1]:
            st.markdown(f"**Current Project: {st.session_state['project_name']}**")

        st.title("AI Model Validation")
        st.header("Contribution Overview")

        # Get the list of response metrics
        st.session_state["used_response_metrics"] = list(
            set([model.split("__")[1] for model in tuned_model_dict.keys()])
        )
        options = st.session_state["used_response_metrics"]
        if len(options) == 0:
            st.error("Please save and tune a model")
            st.stop()

        options = [
            opt.lower()
            .replace(" ", "_")
            .replace("-", "")
            .replace(":", "")
            .replace("__", "_")
            for opt in options
        ]

        default_options = (
            st.session_state["project_dct"]["saved_model_results"].get(
                "selected_options"
            )
            if st.session_state["project_dct"]["saved_model_results"].get(
                "selected_options"
            )
            is not None
            else [options[-1]]
        )

        # Keep only the saved defaults that are still valid options (building a new
        # list avoids mutating default_options while iterating over it)
        default_options = [i for i in default_options if i in options]

        def remove_response_metric(name):
            # Convert the name to a lowercase string and remove any leading or trailing spaces
            name_str = str(name).lower().strip()

            # Check if the name starts with "response metric" or "response_metric"
            if name_str.startswith("response metric"):
                return name[len("response metric") :].replace("_", " ").strip().title()
            elif name_str.startswith("response_metric"):
                return name[len("response_metric") :].replace("_", " ").strip().title()
            else:
                return name

        contribution_selections = st.multiselect(
            "Select the Response Metrics to compare contributions",
            options,
            default=default_options,
            format_func=remove_response_metric,
        )
        trace_data = []

        if is_panel:
            st.session_state["contribution_df"] = contributions_panel(tuned_model_dict)
        else:
            st.session_state["contribution_df"] = contributions_nonpanel(
                tuned_model_dict
            )
        # st.write(st.session_state["contribution_df"].columns)

        # for selection in contribution_selections:
        #     trace = go.Bar(
        #         x=st.session_state["contribution_df"]["Channel"],
        #         y=st.session_state["contribution_df"][selection],
        #         name=selection,
        #         text=np.round(st.session_state["contribution_df"][selection], 0)
        #         .astype(int)
        #         .astype(str)
        #         + "%",
        #         textposition="outside",
        #     )
        #     trace_data.append(trace)

        # layout = go.Layout(
        #     title="Metrics Contribution by Channel",
        #     xaxis=dict(title="Channel Name"),
        #     yaxis=dict(title="Metrics Contribution"),
        #     barmode="group",
        # )
        # fig = go.Figure(data=trace_data, layout=layout)
        # st.plotly_chart(fig, use_container_width=True)

        # Display the chart in Streamlit
        st.plotly_chart(
            create_grouped_bar_plot(
                st.session_state["contribution_df"], contribution_selections
            ),
            use_container_width=True,
        )

        ############################################ Waterfall Chart ############################################
        st.plotly_chart(
            preprocess_and_plot(
                st.session_state["contribution_df"], contribution_selections
            ),
            use_container_width=True,
        )
        ############################################ Waterfall Chart ############################################
| st.header("Analysis of Models Result") | |
| gd_table = metrics_table.iloc[:, :-2] | |
| target_column = gd_table.at[0, "Model"] # sprint8 | |
| st.session_state["gd_table"] = gd_table | |
| with st.container(): | |
| table = st.data_editor( | |
| st.session_state["gd_table"], | |
| hide_index=True, | |
| # on_change=selection_change, | |
| key="project_selection", | |
| use_container_width=True, | |
| ) | |
| target_column = st.selectbox( | |
| "Select a Model to analyse its results", | |
| options=st.session_state.used_response_metrics, | |
| placeholder=options[0], | |
| ) | |
| feature_set = feature_set_dct[target_column] | |
| model = metrics_table[metrics_table["Model"] == target_column][ | |
| "Model_object" | |
| ].iloc[0] | |
| target = metrics_table[metrics_table["Model"] == target_column]["Model"].iloc[0] | |
| st.header("Model Summary") | |
| st.write(model.summary()) | |
| sel_dict = tuned_model_dict[ | |
| [k for k in tuned_model_dict.keys() if k.split("__")[1] == target][0] | |
| ] | |
| feature_set = sel_dict["feature_set"] | |
| X_train = sel_dict["X_train_tuned"] | |
| y_train = X_train[target] | |
| if is_panel: | |
| random_effects = get_random_effects(media_data, panel_col, model) | |
| pred = mdf_predict(X_train, model, random_effects)["pred"] | |
| else: | |
| pred = model.predict(X_train[feature_set]) | |
| X_test = sel_dict["X_test_tuned"] | |
| y_test = X_test[target] | |
| if is_panel: | |
| predtest = mdf_predict(X_test, model, random_effects)["pred"] | |
| else: | |
| predtest = model.predict(X_test[feature_set]) | |
        metrics_table_train, _, fig_train = plot_actual_vs_predicted(
            X_train[date_col],
            y_train,
            pred,
            model,
            target_column=target,
            flag=None,
            repeat_all_years=False,
            is_panel=is_panel,
        )
        metrics_table_test, _, fig_test = plot_actual_vs_predicted(
            X_test[date_col],
            y_test,
            predtest,
            model,
            target_column=target,
            flag=None,
            repeat_all_years=False,
            is_panel=is_panel,
        )

        metrics_table_train = metrics_table_train.set_index("Metric").transpose()
        metrics_table_train.index = ["Train"]
        metrics_table_test = metrics_table_test.set_index("Metric").transpose()
        metrics_table_test.index = ["Test"]
        metrics_table = np.round(
            pd.concat([metrics_table_train, metrics_table_test]), 2
        )

        st.markdown("Result Overview")
        st.dataframe(np.round(metrics_table, 2), use_container_width=True)

        st.header("Model Accuracy")
        st.subheader("Actual vs Predicted Plot (Train)")
        st.plotly_chart(fig_train, use_container_width=True)

        st.subheader("Actual vs Predicted Plot (Test)")
        st.plotly_chart(fig_test, use_container_width=True)

        st.markdown("## Residual Analysis (Train)")
        columns = st.columns(2)

        Xtrain1 = X_train.copy()
        with columns[0]:
            fig = plot_residual_predicted(y_train, pred, Xtrain1)
            st.plotly_chart(fig)

        with columns[1]:
            st.empty()
            fig = qqplot(y_train, pred)
            st.plotly_chart(fig)

        with columns[0]:
            fig = residual_distribution(y_train, pred)
            st.pyplot(fig)
| if st.button("Save this session", use_container_width=True): | |
| project_dct_pkl = pickle.dumps(st.session_state["project_dct"]) | |
| update_db( | |
| st.session_state["project_number"], | |
| "AI_Model_Results", | |
| "project_dct", | |
| project_dct_pkl, | |
| schema, | |
| # resp_mtrc=None, | |
| ) # db | |
| log_message("info", "Session saved!", "AI Model Results") | |
| st.success("Session Saved!") | |
except Exception:
    exc_type, exc_value, exc_traceback = sys.exc_info()
    error_message = "".join(
        traceback.format_exception(exc_type, exc_value, exc_traceback)
    )
    log_message("error", f"Error: {error_message}", "AI Model Results")
    st.warning("An error occurred, please try again", icon="⚠️")