Spaces:

BlendMMM
/

Mastercard

Sleeping

App Files Files Community

BlendMMM commited on Apr 8, 2024

Commit

d8c57ad

verified ·

1 Parent(s): 07855e7

Upload Data_Import.py

Browse files

Files changed (1) hide show

Data_Import.py +1019 -0

Data_Import.py ADDED Viewed

	@@ -0,0 +1,1019 @@

+# Importing necessary libraries
+import streamlit as st
+st.set_page_config(
+    page_title="Data Import",
+    page_icon=":shark:",
+    layout="wide",
+    initial_sidebar_state="collapsed",
+)
+import pickle
+import pandas as pd
+from utilities import set_header, load_local_css
+import streamlit_authenticator as stauth
+import yaml
+from yaml import SafeLoader
+load_local_css("styles.css")
+set_header()
+for k, v in st.session_state.items():
+    if k not in ["logout", "login", "config"] and not k.startswith(
+        "FormSubmitter"
+    ):
+        st.session_state[k] = v
+with open("config.yaml") as file:
+    config = yaml.load(file, Loader=SafeLoader)
+    st.session_state["config"] = config
+authenticator = stauth.Authenticate(
+    config["credentials"],
+    config["cookie"]["name"],
+    config["cookie"]["key"],
+    config["cookie"]["expiry_days"],
+    config["preauthorized"],
+)
+st.session_state["authenticator"] = authenticator
+name, authentication_status, username = authenticator.login("Login", "main")
+auth_status = st.session_state.get("authentication_status")
+if auth_status == True:
+    authenticator.logout("Logout", "main")
+    is_state_initiaized = st.session_state.get("initialized", False)
+    if not is_state_initiaized:
+        if 'session_name' not in st.session_state:
+            st.session_state['session_name']=None
+# Function to validate date column in dataframe
+    def validate_date_column(df):
+        try:
+            # Attempt to convert the 'Date' column to datetime
+            df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
+            return True
+        except:
+            return False
+    # Function to determine data interval
+    def determine_data_interval(common_freq):
+        if common_freq == 1:
+            return "daily"
+        elif common_freq == 7:
+            return "weekly"
+        elif 28 <= common_freq <= 31:
+            return "monthly"
+        else:
+            return "irregular"
+    # Function to read each uploaded Excel file into a pandas DataFrame and stores them in a dictionary
+    st.cache_resource(show_spinner=False)
+    def files_to_dataframes(uploaded_files):
+        df_dict = {}
+        for uploaded_file in uploaded_files:
+            # Extract file name without extension
+            file_name = uploaded_file.name.rsplit(".", 1)[0]
+            # Check for duplicate file names
+            if file_name in df_dict:
+                st.warning(
+                    f"Duplicate File: {file_name}. This file will be skipped.",
+                    icon="⚠️",
+                )
+                continue
+            # Read the file into a DataFrame
+            df = pd.read_excel(uploaded_file)
+            # Convert all column names to lowercase
+            df.columns = df.columns.str.lower().str.strip()
+            # Separate numeric and non-numeric columns
+            numeric_cols = list(df.select_dtypes(include=["number"]).columns)
+            non_numeric_cols = [
+                col
+                for col in df.select_dtypes(exclude=["number"]).columns
+                if col.lower() != "date"
+            ]
+            # Check for 'Date' column
+            if not (validate_date_column(df) and len(numeric_cols) > 0):
+                st.warning(
+                    f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
+                    icon="⚠️",
+                )
+                continue
+            # Check for interval
+            common_freq = common_freq = (
+                pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
+            )
+            # Calculate the data interval (daily, weekly, monthly or irregular)
+            interval = determine_data_interval(common_freq)
+            if interval == "irregular":
+                st.warning(
+                    f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
+                    icon="⚠️",
+                )
+                continue
+            # Store both DataFrames in the dictionary under their respective keys
+            df_dict[file_name] = {
+                "numeric": numeric_cols,
+                "non_numeric": non_numeric_cols,
+                "interval": interval,
+                "df": df,
+            }
+        return df_dict
+    # Function to adjust dataframe granularity
+    def adjust_dataframe_granularity(df, current_granularity, target_granularity):
+        # Set index
+        df.set_index("date", inplace=True)
+        # Define aggregation rules for resampling
+        aggregation_rules = {
+            col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
+            for col in df.columns
+        }
+        # Initialize resampled_df
+        resampled_df = df
+        if current_granularity == "daily" and target_granularity == "weekly":
+            resampled_df = df.resample("W-MON", closed="left", label="left").agg(
+                aggregation_rules
+            )
+        elif current_granularity == "daily" and target_granularity == "monthly":
+            resampled_df = df.resample("MS", closed="left", label="left").agg(
+                aggregation_rules
+            )
+        elif current_granularity == "daily" and target_granularity == "daily":
+            resampled_df = df.resample("D").agg(aggregation_rules)
+        elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
+            # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
+            expanded_data = []
+            for _, row in df.iterrows():
+                if current_granularity == "weekly":
+                    period_range = pd.date_range(start=row.name, periods=7)
+                elif current_granularity == "monthly":
+                    period_range = pd.date_range(
+                        start=row.name, periods=row.name.days_in_month
+                    )
+                for date in period_range:
+                    new_row = {}
+                    for col in df.columns:
+                        if pd.api.types.is_numeric_dtype(df[col]):
+                            if current_granularity == "weekly":
+                                new_row[col] = row[col] / 7
+                            elif current_granularity == "monthly":
+                                new_row[col] = row[col] / row.name.days_in_month
+                        else:
+                            new_row[col] = row[col]
+                    expanded_data.append((date, new_row))
+            resampled_df = pd.DataFrame(
+                [data for _, data in expanded_data],
+                index=[date for date, _ in expanded_data],
+            )
+        # Reset index
+        resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
+        return resampled_df
+    # Function to clean and extract unique values of Panel_1 and Panel_2
+    st.cache_resource(show_spinner=False)
+    def clean_and_extract_unique_values(files_dict, selections):
+        all_panel1_values = set()
+        all_panel2_values = set()
+        for file_name, file_data in files_dict.items():
+            df = file_data["df"]
+            # 'Panel_1' and 'Panel_2' selections
+            selected_panel1 = selections[file_name].get("Panel_1")
+            selected_panel2 = selections[file_name].get("Panel_2")
+            # Clean and standardize Panel_1 column if it exists and is selected
+            if (
+                selected_panel1
+                and selected_panel1 != "N/A"
+                and selected_panel1 in df.columns
+            ):
+                df[selected_panel1] = (
+                    df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
+                )
+                all_panel1_values.update(df[selected_panel1].dropna().unique())
+            # Clean and standardize Panel_2 column if it exists and is selected
+            if (
+                selected_panel2
+                and selected_panel2 != "N/A"
+                and selected_panel2 in df.columns
+            ):
+                df[selected_panel2] = (
+                    df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
+                )
+                all_panel2_values.update(df[selected_panel2].dropna().unique())
+            # Update the processed DataFrame back in the dictionary
+            files_dict[file_name]["df"] = df
+        return all_panel1_values, all_panel2_values
+    # Function to format values for display
+    st.cache_resource(show_spinner=False)
+    def format_values_for_display(values_list):
+        # Capitalize the first letter of each word and replace underscores with spaces
+        formatted_list = [value.replace("_", " ").title() for value in values_list]
+        # Join values with commas and 'and' before the last value
+        if len(formatted_list) > 1:
+            return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
+        elif formatted_list:
+            return formatted_list[0]
+        return "No values available"
+    # Function to normalizes all data within files_dict to a daily granularity
+    st.cache(show_spinner=False, allow_output_mutation=True)
+    def standardize_data_to_daily(files_dict, selections):
+        # Normalize all data to a daily granularity using a provided function
+        files_dict = apply_granularity_to_all(files_dict, "daily", selections)
+        # Update the "interval" attribute for each dataset to indicate the new granularity
+        for files_name, files_data in files_dict.items():
+            files_data["interval"] = "daily"
+        return files_dict
+    # Function to apply granularity transformation to all DataFrames in files_dict
+    st.cache_resource(show_spinner=False)
+    def apply_granularity_to_all(files_dict, granularity_selection, selections):
+        for file_name, file_data in files_dict.items():
+            df = file_data["df"].copy()
+            # Handling when Panel_1 or Panel_2 might be 'N/A'
+            selected_panel1 = selections[file_name].get("Panel_1")
+            selected_panel2 = selections[file_name].get("Panel_2")
+            # Correcting the segment selection logic & handling 'N/A'
+            if selected_panel1 != "N/A" and selected_panel2 != "N/A":
+                unique_combinations = df[
+                    [selected_panel1, selected_panel2]
+                ].drop_duplicates()
+            elif selected_panel1 != "N/A":
+                unique_combinations = df[[selected_panel1]].drop_duplicates()
+                selected_panel2 = None  # Ensure Panel_2 is ignored if N/A
+            elif selected_panel2 != "N/A":
+                unique_combinations = df[[selected_panel2]].drop_duplicates()
+                selected_panel1 = None  # Ensure Panel_1 is ignored if N/A
+            else:
+                # If both are 'N/A', process the entire dataframe as is
+                df = adjust_dataframe_granularity(
+                    df, file_data["interval"], granularity_selection
+                )
+                files_dict[file_name]["df"] = df
+                continue  # Skip to the next file
+            transformed_segments = []
+            for _, combo in unique_combinations.iterrows():
+                if selected_panel1 and selected_panel2:
+                    segment = df[
+                        (df[selected_panel1] == combo[selected_panel1])
+                        & (df[selected_panel2] == combo[selected_panel2])
+                    ]
+                elif selected_panel1:
+                    segment = df[df[selected_panel1] == combo[selected_panel1]]
+                elif selected_panel2:
+                    segment = df[df[selected_panel2] == combo[selected_panel2]]
+                # Adjust granularity of the segment
+                transformed_segment = adjust_dataframe_granularity(
+                    segment, file_data["interval"], granularity_selection
+                )
+                transformed_segments.append(transformed_segment)
+            # Combine all transformed segments into a single DataFrame for this file
+            transformed_df = pd.concat(transformed_segments, ignore_index=True)
+            files_dict[file_name]["df"] = transformed_df
+        return files_dict
+    # Function to create main dataframe structure
+    st.cache_resource(show_spinner=False)
+    def create_main_dataframe(
+        files_dict, all_panel1_values, all_panel2_values, granularity_selection
+    ):
+        # Determine the global start and end dates across all DataFrames
+        global_start = min(df["df"]["date"].min() for df in files_dict.values())
+        global_end = max(df["df"]["date"].max() for df in files_dict.values())
+        # Adjust the date_range generation based on the granularity_selection
+        if granularity_selection == "weekly":
+            # Generate a weekly range, with weeks starting on Monday
+            date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
+        elif granularity_selection == "monthly":
+            # Generate a monthly range, starting from the first day of each month
+            date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
+        else:  # Default to daily if not weekly or monthly
+            date_range = pd.date_range(start=global_start, end=global_end, freq="D")
+        # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
+        all_panel1s = all_panel1_values
+        all_panel2s = all_panel2_values
+        # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
+        dimensions, merge_keys = [], []
+        if all_panel1s:
+            dimensions.append(all_panel1s)
+            merge_keys.append("Panel_1")
+        if all_panel2s:
+            dimensions.append(all_panel2s)
+            merge_keys.append("Panel_2")
+        dimensions.append(date_range)  # Date range is always included
+        merge_keys.append("date")  # Date range is always included
+        # Create a main DataFrame template with the dimensions
+        main_df = pd.MultiIndex.from_product(
+            dimensions,
+            names=[name for name, _ in zip(merge_keys, dimensions)],
+        ).to_frame(index=False)
+        return main_df.reset_index(drop=True)
+    # Function to prepare and merge dataFrames
+    st.cache_resource(show_spinner=False)
+    def merge_into_main_df(main_df, files_dict, selections):
+        for file_name, file_data in files_dict.items():
+            df = file_data["df"].copy()
+            # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
+            selected_panel1 = selections[file_name].get("Panel_1", "N/A")
+            selected_panel2 = selections[file_name].get("Panel_2", "N/A")
+            if selected_panel1 != "N/A":
+                df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
+            if selected_panel2 != "N/A":
+                df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)
+            # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
+            merge_keys = ["date"]
+            if "Panel_1" in df.columns:
+                merge_keys.append("Panel_1")
+            if "Panel_2" in df.columns:
+                merge_keys.append("Panel_2")
+            main_df = pd.merge(main_df, df, on=merge_keys, how="left")
+        # After all merges, sort by 'date' and reset index for cleanliness
+        sort_by = ["date"]
+        if "Panel_1" in main_df.columns:
+            sort_by.append("Panel_1")
+        if "Panel_2" in main_df.columns:
+            sort_by.append("Panel_2")
+        main_df.sort_values(by=sort_by, inplace=True)
+        main_df.reset_index(drop=True, inplace=True)
+        return main_df
+    # Function to categorize column
+    def categorize_column(column_name):
+        # Define keywords for each category
+        internal_keywords = [
+            "Price",
+            "Discount",
+            "product_price",
+            "cost",
+            "margin",
+            "inventory",
+            "sales",
+            "revenue",
+            "turnover",
+            "expense",
+        ]
+        exogenous_keywords = [
+            "GDP",
+            "Tax",
+            "Inflation",
+            "interest_rate",
+            "employment_rate",
+            "exchange_rate",
+            "consumer_spending",
+            "retail_sales",
+            "oil_prices",
+            "weather",
+        ]
+        # Check if the column name matches any of the keywords for Internal or Exogenous categories
+        for keyword in internal_keywords:
+            if keyword.lower() in column_name.lower():
+                return "Internal"
+        for keyword in exogenous_keywords:
+            if keyword.lower() in column_name.lower():
+                return "Exogenous"
+        # Default to Media if no match found
+        return "Media"
+    # Function to calculate missing stats and prepare for editable DataFrame
+    st.cache_resource(show_spinner=False)
+    def prepare_missing_stats_df(df):
+        missing_stats = []
+        for column in df.columns:
+            if (
+                column == "date" or column == "Panel_2" or column == "Panel_1"
+            ):  # Skip Date, Panel_1 and Panel_2 column
+                continue
+            missing = df[column].isnull().sum()
+            pct_missing = round((missing / len(df)) * 100, 2)
+            # Dynamically assign category based on column name
+            category = categorize_column(column)
+            # category = "Media"  # Keep default bin as Media
+            missing_stats.append(
+                {
+                    "Column": column,
+                    "Missing Values": missing,
+                    "Missing Percentage": pct_missing,
+                    "Impute Method": "Fill with 0",  # Default value
+                    "Category": category,
+                }
+            )
+        stats_df = pd.DataFrame(missing_stats)
+        return stats_df
+    # Function to add API DataFrame details to the files dictionary
+    st.cache_resource(show_spinner=False)
+    def add_api_dataframe_to_dict(main_df, files_dict):
+        files_dict["API"] = {
+            "numeric": list(main_df.select_dtypes(include=["number"]).columns),
+            "non_numeric": [
+                col
+                for col in main_df.select_dtypes(exclude=["number"]).columns
+                if col.lower() != "date"
+            ],
+            "interval": determine_data_interval(
+                pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
+            ),
+            "df": main_df,
+        }
+        return files_dict
+    # Function to reads an API into a DataFrame, parsing specified columns as datetime
+    @st.cache_resource(show_spinner=False)
+    def read_API_data():
+        return pd.read_excel(r".\upf_data_converted_randomized_resp_metrics.xlsx", parse_dates=["Date"])
+    # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
+    def set_Panel_1_Panel_2_Selected_false():
+        st.session_state["Panel_1_Panel_2_Selected"] = False
+    # Function to serialize and save the objects into a pickle file
+    @st.cache_resource(show_spinner=False)
+    def save_to_pickle(file_path, final_df, bin_dict):
+        # Open the file in write-binary mode and dump the objects
+        with open(file_path, "wb") as f:
+            pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
+            # Data is now saved to file
+    # Function to processes the merged_df DataFrame based on operations defined in edited_df
+    @st.cache_resource(show_spinner=False)
+    def process_dataframes(merged_df, edited_df, edited_stats_df):
+        # Ensure there are operations defined by the user
+        if edited_df.empty:
+            return merged_df, edited_stats_df  # No operations to apply
+        # Perform operations as defined by the user
+        for index, row in edited_df.iterrows():
+            result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
+            col1 = row["Column 1"]
+            col2 = row["Column 2"]
+            op = row["Operator"]
+            # Apply the specified operation
+            if op == "+":
+                merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
+            elif op == "-":
+                merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
+            elif op == "*":
+                merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
+            elif op == "/":
+                merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
+                    0, 1e-9
+                )
+            # Add summary of operation to edited_stats_df
+            new_row = {
+                "Column": result_column_name,
+                "Missing Values": None,
+                "Missing Percentage": None,
+                "Impute Method": None,
+                "Category": row["Category"],
+            }
+            new_row_df = pd.DataFrame([new_row])
+            # Use pd.concat to add the new_row_df to edited_stats_df
+            edited_stats_df = pd.concat(
+                [edited_stats_df, new_row_df], ignore_index=True, axis=0
+            )
+        # Combine column names from edited_df for cleanup
+        combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))
+        # Filter out rows in edited_stats_df and drop columns from merged_df
+        edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
+        merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)
+        return merged_df, edited_stats_df
+    # Function to prepare a list of numeric column names and initialize an empty DataFrame with predefined structure
+    st.cache_resource(show_spinner=False)
+    def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
+        # Get columns categorized as 'Response Metrics'
+        columns_response_metrics = edited_stats_df[
+            edited_stats_df["Category"] == "Response Metrics"
+        ]["Column"].tolist()
+        # Filter numeric columns, excluding those categorized as 'Response Metrics'
+        numeric_columns = [
+            col
+            for col in merged_df.select_dtypes(include=["number"]).columns
+            if col not in columns_response_metrics
+        ]
+        # Define the structure of the empty DataFrame
+        data = {
+            "Column 1": pd.Series([], dtype="str"),
+            "Operator": pd.Series([], dtype="str"),
+            "Column 2": pd.Series([], dtype="str"),
+            "Category": pd.Series([], dtype="str"),
+        }
+        default_df = pd.DataFrame(data)
+        return numeric_columns, default_df
+    # Initialize 'final_df' in session state
+    if "final_df" not in st.session_state:
+        st.session_state["final_df"] = pd.DataFrame()
+    # Initialize 'bin_dict' in session state
+    if "bin_dict" not in st.session_state:
+        st.session_state["bin_dict"] = {}
+    # Initialize 'Panel_1_Panel_2_Selected' in session state
+    if "Panel_1_Panel_2_Selected" not in st.session_state:
+        st.session_state["Panel_1_Panel_2_Selected"] = False
+    # Page Title
+    st.write("")  # Top padding
+    st.title("Data Import")
+    #########################################################################################################################################################
+    # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
+    #########################################################################################################################################################
+    # Read the Excel file, parsing 'Date' column as datetime
+    main_df = read_API_data()
+    # Convert all column names to lowercase
+    main_df.columns = main_df.columns.str.lower().str.strip()
+    # File uploader
+    uploaded_files = st.file_uploader(
+        "Upload additional data",
+        type=["xlsx"],
+        accept_multiple_files=True,
+        on_change=set_Panel_1_Panel_2_Selected_false,
+    )
+    # Custom HTML for upload instructions
+    recommendation_html = f"""
+    <div style="text-align: justify;">
+    <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including panel, media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values.
+    </div>
+    """
+    st.markdown(recommendation_html, unsafe_allow_html=True)
+    # Choose Desired Granularity
+    st.markdown("#### Choose Desired Granularity")
+    # Granularity Selection
+    granularity_selection = st.selectbox(
+        "Choose Date Granularity",
+        ["Daily", "Weekly", "Monthly"],
+        label_visibility="collapsed",
+        on_change=set_Panel_1_Panel_2_Selected_false,
+    )
+    granularity_selection = str(granularity_selection).lower()
+    # Convert files to dataframes
+    files_dict = files_to_dataframes(uploaded_files)
+    # Add API Dataframe
+    if main_df is not None:
+        files_dict = add_api_dataframe_to_dict(main_df, files_dict)
+    # Display a warning message if no files have been uploaded and halt further execution
+    if not files_dict:
+        st.warning(
+            "Please upload at least one file to proceed.",
+            icon="⚠️",
+        )
+        st.stop()  # Halts further execution until file is uploaded
+    # Select Panel_1 and Panel_2 columns
+    st.markdown("#### Select Panel columns")
+    selections = {}
+    with st.expander("Select Panel columns", expanded=False):
+        count = 0  # Initialize counter to manage the visibility of labels and keys
+        for file_name, file_data in files_dict.items():
+            # Determine visibility of the label based on the count
+            if count == 0:
+                label_visibility = "visible"
+            else:
+                label_visibility = "collapsed"
+            # Extract non-numeric columns
+            non_numeric_cols = file_data["non_numeric"]
+            # Prepare Panel_1 and Panel_2 values for dropdown, adding "N/A" as an option
+            panel1_values = non_numeric_cols + ["N/A"]
+            panel2_values = non_numeric_cols + ["N/A"]
+            # Skip if only one option is available
+            if len(panel1_values) == 1 and len(panel2_values) == 1:
+                selected_panel1, selected_panel2 = "N/A", "N/A"
+                # Update the selections for Panel_1 and Panel_2 for the current file
+                selections[file_name] = {
+                    "Panel_1": selected_panel1,
+                    "Panel_2": selected_panel2,
+                }
+                continue
+            # Create layout columns for File Name, Panel_2, and Panel_1 selections
+            file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])
+            with file_name_col:
+                # Display "File Name" label only for the first file
+                if count == 0:
+                    st.write("File Name")
+                else:
+                    st.write("")
+                st.write(file_name)  # Display the file name
+            with Panel_1_col:
+                # Display a selectbox for Panel_1 values
+                selected_panel1 = st.selectbox(
+                    "Select Panel Level 1",
+                    panel2_values,
+                    on_change=set_Panel_1_Panel_2_Selected_false,
+                    label_visibility=label_visibility,  # Control visibility of the label
+                    key=f"Panel_1_selectbox{count}",  # Ensure unique key for each selectbox
+                )
+            with Panel_2_col:
+                # Display a selectbox for Panel_2 values
+                selected_panel2 = st.selectbox(
+                    "Select Panel Level 2",
+                    panel1_values,
+                    on_change=set_Panel_1_Panel_2_Selected_false,
+                    label_visibility=label_visibility,  # Control visibility of the label
+                    key=f"Panel_2_selectbox{count}",  # Ensure unique key for each selectbox
+                )
+            # Skip processing if the same column is selected for both Panel_1 and Panel_2 due to potential data integrity issues
+            if selected_panel2 == selected_panel1 and not (
+                selected_panel2 == "N/A" and selected_panel1 == "N/A"
+            ):
+                st.warning(
+                    f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
+                )
+                selected_panel1, selected_panel2 = "N/A", "N/A"
+                st.stop()
+            # Update the selections for Panel_1 and Panel_2 for the current file
+            selections[file_name] = {
+                "Panel_1": selected_panel1,
+                "Panel_2": selected_panel2,
+            }
+            count += 1  # Increment the counter after processing each file
+        # Accept Panel_1 and Panel_2 selection
+        if st.button("Accept and Process", use_container_width=True):
+            # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
+            with st.spinner("Processing..."):
+                files_dict = standardize_data_to_daily(files_dict, selections)
+                # Convert all data to daily level granularity
+                files_dict = apply_granularity_to_all(
+                    files_dict, granularity_selection, selections
+                )
+            # Update the 'files_dict' in the session state
+            st.session_state["files_dict"] = files_dict
+            # Set a flag in the session state to indicate that selection has been made
+            st.session_state["Panel_1_Panel_2_Selected"] = True
+    #########################################################################################################################################################
+    # Display unique Panel_1 and Panel_2 values
+    #########################################################################################################################################################
+    # Halts further execution until Panel_1 and Panel_2 columns are selected
+    if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
+        files_dict = st.session_state["files_dict"]
+    else:
+        st.stop()
+    # Set to store unique values of Panel_1 and Panel_2
+    with st.spinner("Fetching Panel values..."):
+        all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
+            files_dict, selections
+        )
+        # List of Panel_1 and Panel_2 columns unique values
+        list_of_all_panel1_values = list(all_panel1_values)
+        list_of_all_panel2_values = list(all_panel2_values)
+        # Format Panel_1 and Panel_2 values for display
+        formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
+        formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)
+    # Unique Panel_1 and Panel_2 values
+    st.markdown("#### Unique Panel values")
+    # Display Panel_1 and Panel_2 values
+    with st.expander("Unique Panel values"):
+        st.write("")
+        st.markdown(
+            f"""
+        <style>
+        .justify-text {{
+        text-align: justify;
+        }}
+        </style>
+        <div class="justify-text">
+        <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
+        <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
+        </div>
+        """,
+            unsafe_allow_html=True,
+        )
+        # Display total Panel_1 and Panel_2
+        st.write("")
+        st.markdown(
+            f"""
+        <div style="text-align: justify;">
+            <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
+            <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
+        </div>
+        """,
+            unsafe_allow_html=True,
+        )
+        st.write("")
+    #########################################################################################################################################################
+    # Merge all DataFrames
+    #########################################################################################################################################################
+    # Merge all DataFrames selected
+    main_df = create_main_dataframe(
+        files_dict, all_panel1_values, all_panel2_values, granularity_selection
+    )
+    merged_df = merge_into_main_df(main_df, files_dict, selections)
+    #########################################################################################################################################################
+    # Categorize Variables and Impute Missing Values
+    #########################################################################################################################################################
+    # Create an editable DataFrame in Streamlit
+    st.markdown("#### Select Variables Category & Impute Missing Values")
+    # Prepare missing stats DataFrame for editing
+    missing_stats_df = prepare_missing_stats_df(merged_df)
+    edited_stats_df = st.data_editor(
+        missing_stats_df,
+        column_config={
+            "Impute Method": st.column_config.SelectboxColumn(
+                options=[
+                    "Drop Column",
+                    "Fill with Mean",
+                    "Fill with Median",
+                    "Fill with 0",
+                ],
+                required=True,
+                default="Fill with 0",
+            ),
+            "Category": st.column_config.SelectboxColumn(
+                options=[
+                    "Media",
+                    "Exogenous",
+                    "Internal",
+                    "Response Metrics",
+                ],
+                required=True,
+                default="Media",
+            ),
+        },
+        disabled=["Column", "Missing Values", "Missing Percentage"],
+        hide_index=True,
+        use_container_width=True,
+    )
+    # Apply changes based on edited DataFrame
+    for i, row in edited_stats_df.iterrows():
+        column = row["Column"]
+        if row["Impute Method"] == "Drop Column":
+            merged_df.drop(columns=[column], inplace=True)
+        elif row["Impute Method"] == "Fill with Mean":
+            merged_df[column].fillna(merged_df[column].mean(), inplace=True)
+        elif row["Impute Method"] == "Fill with Median":
+            merged_df[column].fillna(merged_df[column].median(), inplace=True)
+        elif row["Impute Method"] == "Fill with 0":
+            merged_df[column].fillna(0, inplace=True)
+    #########################################################################################################################################################
+    # Group columns
+    #########################################################################################################################################################
+    # Display Group columns header
+    st.markdown("#### Feature engineering")
+    # Prepare the numeric columns and an empty DataFrame for user input
+    numeric_columns, default_df = prepare_numeric_columns_and_default_df(
+        merged_df, edited_stats_df
+    )
+    # Display editable Dataframe
+    edited_df = st.data_editor(
+        default_df,
+        column_config={
+            "Column 1": st.column_config.SelectboxColumn(
+                options=numeric_columns,
+                required=True,
+                default=numeric_columns[0],
+                width=400,
+            ),
+            "Operator": st.column_config.SelectboxColumn(
+                options=["+", "-", "*", "/"],
+                required=True,
+                default="+",
+                width=100,
+            ),
+            "Column 2": st.column_config.SelectboxColumn(
+                options=numeric_columns,
+                required=True,
+                default=numeric_columns[0],
+                width=400,
+            ),
+            "Category": st.column_config.SelectboxColumn(
+                options=[
+                    "Media",
+                    "Exogenous",
+                    "Internal",
+                    "Response Metrics",
+                ],
+                required=True,
+                default="Media",
+                width=200,
+            ),
+        },
+        num_rows="dynamic",
+    )
+    # Process the DataFrame based on user inputs and operations specified in edited_df
+    final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)
+    #########################################################################################################################################################
+    # Display the Final DataFrame and variables
+    #########################################################################################################################################################
+    # Display the Final DataFrame and variables
+    st.markdown("#### Final DataFrame")
+    st.dataframe(final_df, hide_index=True)
+    # Initialize an empty dictionary to hold categories and their variables
+    category_dict = {}
+    # Iterate over each row in the edited DataFrame to populate the dictionary
+    for i, row in edited_stats_df.iterrows():
+        column = row["Column"]
+        category = row["Category"]  # The category chosen by the user for this variable
+        # Check if the category already exists in the dictionary
+        if category not in category_dict:
+            # If not, initialize it with the current column as its first element
+            category_dict[category] = [column]
+        else:
+            # If it exists, append the current column to the list of variables under this category
+            category_dict[category].append(column)
+    # Add Date, Panel_1 and Panel_12 in category dictionary
+    category_dict.update({"Date": ["date"]})
+    if "Panel_1" in final_df.columns:
+        category_dict["Panel Level 1"] = ["Panel_1"]
+    if "Panel_2" in final_df.columns:
+        category_dict["Panel Level 2"] = ["Panel_2"]
+    # Display the dictionary
+    st.markdown("#### Variable Category")
+    for category, variables in category_dict.items():
+        # Check if there are multiple variables to handle "and" insertion correctly
+        if len(variables) > 1:
+            # Join all but the last variable with ", ", then add " and " before the last variable
+            variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
+        else:
+            # If there's only one variable, no need for "and"
+            variables_str = variables[0]
+        # Display the category and its variables in the desired format
+        st.markdown(
+            f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
+            unsafe_allow_html=True,
+        )
+    # Function to check if Response Metrics is selected
+    st.write("")
+    response_metrics_col = category_dict.get("Response Metrics", [])
+    if len(response_metrics_col) == 0:
+        st.warning("Please select Response Metrics column", icon="⚠️")
+        st.stop()
+    # elif len(response_metrics_col) > 1:
+    #     st.warning("Please select only one Response Metrics column", icon="⚠️")
+    #     st.stop()
+    # Store final dataframe and bin dictionary into session state
+    st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
+    # Save the DataFrame and dictionary from the session state to the pickle file
+    if st.button("Accept and Save", use_container_width=True):
+        save_to_pickle(
+            "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
+        )
+        st.toast("💾 Saved Successfully!")