Spaces:

seanpedrickcase
/

document_redaction_vlm

Running on Zero

File size: 39,353 Bytes

c584b78

"""Helper functions for the quickstart walkthrough in the redaction app."""

import os

import gradio as gr
import pandas as pd

from tools.config import (
    AWS_LLM_PII_OPTION,
    AWS_PII_OPTION,
    CHOSEN_COMPREHEND_ENTITIES,
    CHOSEN_LLM_ENTITIES,
    CHOSEN_REDACT_ENTITIES,
    DEFAULT_PII_DETECTION_MODEL,
    INFERENCE_SERVER_PII_OPTION,
    LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
    LOCAL_PII_OPTION,
    LOCAL_TRANSFORMERS_LLM_PII_OPTION,
    NO_REDACTION_PII_OPTION,
    SHOW_AWS_TEXT_EXTRACTION_OPTIONS,
    SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS,
    SHOW_OCR_GUI_OPTIONS,
    SHOW_PII_IDENTIFICATION_OPTIONS,
    TEXTRACT_TEXT_EXTRACT_OPTION,
)
from tools.helper_functions import put_columns_in_df


def is_data_file_type_walkthrough(files):
    """Check if files are data file types (xlsx, xls, csv, parquet, docx)."""
    if not files:
        return False
    data_file_extensions = {".xlsx", ".xls", ".csv", ".parquet", ".docx"}
    for file in files:
        if file:
            file_path = file.name if hasattr(file, "name") else str(file)
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext in data_file_extensions:
                return True
    return False


def route_walkthrough_files(files):
    """Route files from walkthrough to appropriate component and determine if data file.

    Also returns visibility updates for step 2 text extraction components: when the
    upload is CSV/Excel (data file), those are hidden; when it is a document, they
    follow SHOW_OCR_GUI_OPTIONS (radio) and accordions are left unchanged.
    """
    if not files:
        show_text_extract = SHOW_OCR_GUI_OPTIONS
        return (
            None,
            None,
            False,
            gr.Walkthrough(selected=2),
            gr.update(visible=show_text_extract),
            gr.update(),
            gr.update(),
        )

    is_data = is_data_file_type_walkthrough(files)
    doc_files = []
    data_files = []

    data_file_extensions = {".xlsx", ".xls", ".csv", ".parquet", ".docx"}

    for file in files:
        if file:
            file_path = file.name if hasattr(file, "name") else str(file)
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext in data_file_extensions:
                data_files.append(file)
            else:
                doc_files.append(file)

    # Hide text extraction options on step 2 when CSV/Excel (data file) was uploaded
    show_text_extract = (not is_data) and SHOW_OCR_GUI_OPTIONS
    if is_data:
        text_extract_updates = (
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )
    else:
        # Document: show radio if enabled; leave accordions unchanged (they follow radio selection)
        text_extract_updates = (
            gr.update(visible=show_text_extract),
            gr.update(),
            gr.update(),
        )

    if is_data:
        return None, data_files, True, gr.Walkthrough(selected=2), *text_extract_updates
    else:
        return doc_files, None, False, gr.Walkthrough(selected=2), *text_extract_updates


def handle_step_2_next(
    files,
    is_data_file,
    walkthrough_colnames_val,
    walkthrough_excel_sheets_val,
    text_extract_method_val,
):
    """Handle step 2 next button - populate dropdowns if data files and sync with main components."""
    # Show text extraction method radio in Step 2 if SHOW_OCR_GUI_OPTIONS is True
    show_text_extract_method = SHOW_OCR_GUI_OPTIONS

    if is_data_file and files:
        # Use put_columns_in_df to populate dropdowns
        colnames_dropdown, excel_sheets_dropdown = put_columns_in_df(files)
        # Use the selected values from walkthrough if available, otherwise use the populated values
        if (
            walkthrough_colnames_val
            and len(walkthrough_colnames_val) > 0
            and walkthrough_colnames_val[0] != "Choose columns to anonymise"
        ):
            main_colnames_update = gr.Dropdown(value=walkthrough_colnames_val)
        else:
            main_colnames_update = colnames_dropdown

        if (
            walkthrough_excel_sheets_val
            and len(walkthrough_excel_sheets_val) > 0
            and walkthrough_excel_sheets_val[0] != "Choose Excel sheets to anonymise"
        ):
            main_excel_sheets_update = gr.Dropdown(
                value=walkthrough_excel_sheets_val, visible=True
            )
        else:
            main_excel_sheets_update = excel_sheets_dropdown

        # Preserve user's column selection in walkthrough_colnames; do not overwrite with colnames_dropdown
        # (colnames_dropdown has value=all columns, which would reset a one-column selection to all four)
        if (
            walkthrough_colnames_val
            and len(walkthrough_colnames_val) > 0
            and walkthrough_colnames_val[0] != "Choose columns to anonymise"
        ):
            walkthrough_colnames_update = gr.update(
                value=walkthrough_colnames_val, visible=True
            )
        else:
            walkthrough_colnames_update = colnames_dropdown

        # Preserve user's sheet selection in walkthrough_excel_sheets when they have made one
        if (
            walkthrough_excel_sheets_val
            and len(walkthrough_excel_sheets_val) > 0
            and walkthrough_excel_sheets_val[0] != "Choose Excel sheets to anonymise"
        ):
            walkthrough_excel_sheets_update = gr.update(
                value=walkthrough_excel_sheets_val, visible=True
            )
        else:
            walkthrough_excel_sheets_update = excel_sheets_dropdown

        # Return updates for both walkthrough and main components, and advance walkthrough
        # Note: walkthrough_local_ocr_method_radio and walkthrough_handwrite_signature_checkbox visibility
        # are controlled by event handler on walkthrough_text_extract_method_radio
        # Note: Step 3 PII components visibility is controlled by event handler on walkthrough_redaction_method_dropdown
        return (
            walkthrough_colnames_update,  # walkthrough_colnames
            walkthrough_excel_sheets_update,  # walkthrough_excel_sheets
            main_colnames_update,  # in_colnames
            main_excel_sheets_update,  # in_excel_sheets (defined in "Word or Excel/CSV files" tab)
            gr.Radio(
                visible=show_text_extract_method
            ),  # walkthrough_text_extract_method_radio
            gr.Walkthrough(selected=3),  # walkthrough
        )
    else:
        # Return unchanged dropdowns and advance
        # Note: walkthrough_local_ocr_method_radio and walkthrough_handwrite_signature_checkbox visibility
        # are controlled by event handler on walkthrough_text_extract_method_radio
        # Note: Step 3 PII components visibility is controlled by event handler on walkthrough_redaction_method_dropdown
        return (
            gr.Dropdown(visible=False),  # walkthrough_colnames
            gr.Dropdown(visible=False),  # walkthrough_excel_sheets
            gr.Dropdown(),  # in_colnames (no change)
            gr.Dropdown(visible=False),  # in_excel_sheets (no change)
            gr.Radio(
                visible=show_text_extract_method
            ),  # walkthrough_text_extract_method_radio
            gr.Walkthrough(selected=3),  # walkthrough
        )


def _data_files_fingerprint(files):
    """Return a stable key for the current file list to detect redundant updates."""
    if not files:
        return ()
    return tuple(getattr(f, "name", str(f)) for f in files if f is not None)


def update_step_2_on_data_file_upload(files, is_data_file, last_processed_keys=None):
    """Update Step 2 components when data files are uploaded.

    When last_processed_keys is provided (from gr.State), returns (colnames, sheets, new_keys)
    and skips recomputation if files are unchanged to avoid Gradio re-firing change and
    causing an infinite loading loop on the column dropdown.
    """
    keys = _data_files_fingerprint(files)
    if last_processed_keys is not None and keys == last_processed_keys:
        return gr.update(), gr.update(), last_processed_keys

    if is_data_file and files:
        colnames_dropdown, excel_sheets_dropdown = put_columns_in_df(files)
        if last_processed_keys is not None:
            return colnames_dropdown, excel_sheets_dropdown, keys
        return colnames_dropdown, excel_sheets_dropdown, keys
    else:
        no_op = gr.Dropdown(visible=False), gr.Dropdown(visible=False), keys
        if last_processed_keys is not None:
            return *no_op, () if not keys else keys
        return no_op


def handle_text_extract_method_selection(text_extract_method: str):
    """Handle text extraction method selection - show local OCR radio only if Local OCR model is selected,
    and show AWS Textract settings only if AWS Textract is selected.

    Args:
        text_extract_method: Selected text extraction method

    Returns:
        Tuple of visibility updates for local OCR radio, and AWS Textract accordion
    """
    # Normalize (Gradio can send None when .change() fires before sync); default so something stays visible
    if isinstance(text_extract_method, str):
        text_extract_method = text_extract_method.strip()
    if text_extract_method is None or text_extract_method == "":
        text_extract_method = LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION

    # Show local OCR method radio only if "Local OCR model - PDFs without selectable text" is selected
    # When "AWS Bedrock VLM OCR" is selected, the local OCR method is automatically set to "bedrock-vlm" but the component is hidden
    show_local_ocr = text_extract_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
    # Show AWS Textract settings accordion only if "AWS Textract service - all PDF types" is selected
    show_aws_textract = (
        text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION
        and SHOW_AWS_TEXT_EXTRACTION_OPTIONS
    )

    return (
        gr.update(visible=show_local_ocr),  # walkthrough_local_ocr_method_radio
        gr.update(
            visible=show_aws_textract
        ),  # walkthrough_handwrite_signature_checkbox
    )


def _is_llm_pii_method_shown_for_redaction(redaction_method, pii_method):
    """Whether LLM PII controls (entity dropdown + custom instructions accordion) should be visible.

    Mirrors the logic in handle_redaction_method_selection for is_llm_method_init.
    """
    if isinstance(redaction_method, str):
        redaction_method = redaction_method.strip()
    if redaction_method is None or redaction_method == "":
        redaction_method = "Redact all PII"
    if isinstance(pii_method, str):
        pii_method = pii_method.strip()
    if pii_method is None or pii_method == "":
        pii_method = DEFAULT_PII_DETECTION_MODEL

    is_redact_all_pii = redaction_method == "Redact all PII"
    is_redact_selected_terms = redaction_method == "Redact selected terms"
    is_redact_all_pii_or_selected_terms = is_redact_all_pii or is_redact_selected_terms
    show_pii_method = (
        is_redact_all_pii_or_selected_terms
    ) and SHOW_PII_IDENTIFICATION_OPTIONS

    pii_method_for_visibility = pii_method
    if is_redact_all_pii_or_selected_terms and pii_method == NO_REDACTION_PII_OPTION:
        pii_method_for_visibility = DEFAULT_PII_DETECTION_MODEL

    return show_pii_method and (
        pii_method_for_visibility == LOCAL_TRANSFORMERS_LLM_PII_OPTION
        or pii_method_for_visibility == INFERENCE_SERVER_PII_OPTION
        or pii_method_for_visibility == AWS_LLM_PII_OPTION
    )


def handle_redaction_method_selection(redaction_method: str, pii_method: str):
    """Handle redaction method selection in Step 3 - show appropriate components based on selection."""
    # Normalize inputs (Gradio can send whitespace or None when .change() fires before sync)
    if isinstance(redaction_method, str):
        redaction_method = redaction_method.strip()
    if redaction_method is None or redaction_method == "":
        redaction_method = "Redact all PII"
    if isinstance(pii_method, str):
        pii_method = pii_method.strip()
    if pii_method is None or pii_method == "":
        pii_method = DEFAULT_PII_DETECTION_MODEL

    # Check which redaction method is selected
    is_redact_all_pii = redaction_method == "Redact all PII"
    is_redact_selected_terms = redaction_method == "Redact selected terms"
    is_redact_all_pii_or_selected_terms = is_redact_all_pii or is_redact_selected_terms
    is_extract_text_only = (
        isinstance(redaction_method, str)
        and redaction_method.strip() == "Extract text only"
    )

    # When switching from "Extract text only", the PII dropdown may still be
    # NO_REDACTION_PII_OPTION; use DEFAULT_PII_DETECTION_MODEL so exactly one
    # entity dropdown is visible and the UI doesn’t show all three or none.
    pii_method_for_visibility = pii_method
    if is_redact_all_pii_or_selected_terms and pii_method == NO_REDACTION_PII_OPTION:
        pii_method_for_visibility = DEFAULT_PII_DETECTION_MODEL

    # Show PII detection settings if "Redact all PII" OR "Redact selected terms" is selected
    # Both options need PII detection method to determine what to redact
    show_pii_method = (
        is_redact_all_pii_or_selected_terms
    ) and SHOW_PII_IDENTIFICATION_OPTIONS

    # Determine visibility of entity dropdowns based on PII method
    show_local_entities_init = show_pii_method and (
        pii_method_for_visibility == LOCAL_PII_OPTION
    )
    show_comprehend_entities_init = show_pii_method and (
        pii_method_for_visibility == AWS_PII_OPTION
    )
    is_llm_method_init = _is_llm_pii_method_shown_for_redaction(
        redaction_method, pii_method
    )

    # For "Extract text only", hide all components
    # For "Redact all PII", show PII detection components
    # For "Redact selected terms", show both PII detection components AND deny/allow/fully redacted list components

    # When we overrode pii_method for visibility, also update the PII dropdown value
    pii_drop_value = None
    if is_redact_all_pii_or_selected_terms and pii_method == NO_REDACTION_PII_OPTION:
        pii_drop_value = DEFAULT_PII_DETECTION_MODEL

    # Set entity values based on redaction method
    if is_redact_selected_terms:
        # For "Redact selected terms", only show CUSTOM entity
        local_entities_update = gr.Dropdown(
            visible=show_local_entities_init, value=["CUSTOM"]
        )
        comprehend_entities_update = gr.Dropdown(
            visible=show_comprehend_entities_init, value=["CUSTOM"]
        )
        llm_entities_update = gr.Dropdown(visible=is_llm_method_init, value=["CUSTOM"])
        walkthrough_pii_identification_method_drop_update = (
            gr.update(visible=show_pii_method, value=pii_drop_value)
            if pii_drop_value is not None
            else gr.update(visible=show_pii_method)
        )

    elif is_redact_all_pii:
        # For "Redact all PII", use default entities
        # Ensure entities are lists (they should already be parsed in config.py)
        local_entities_val = (
            CHOSEN_REDACT_ENTITIES
            if isinstance(CHOSEN_REDACT_ENTITIES, list)
            else ["CUSTOM"]
        )
        comprehend_entities_val = (
            CHOSEN_COMPREHEND_ENTITIES
            if isinstance(CHOSEN_COMPREHEND_ENTITIES, list)
            else ["CUSTOM"]
        )
        llm_entities_val = (
            CHOSEN_LLM_ENTITIES if isinstance(CHOSEN_LLM_ENTITIES, list) else ["CUSTOM"]
        )
        local_entities_update = gr.Dropdown(
            visible=show_local_entities_init, value=local_entities_val
        )
        comprehend_entities_update = gr.Dropdown(
            visible=show_comprehend_entities_init, value=comprehend_entities_val
        )
        llm_entities_update = gr.Dropdown(
            visible=is_llm_method_init, value=llm_entities_val
        )
        walkthrough_pii_identification_method_drop_update = (
            gr.update(visible=show_pii_method, value=pii_drop_value)
            if pii_drop_value is not None
            else gr.update(visible=show_pii_method)
        )
    elif is_extract_text_only:
        # For "Extract text only", just update visibility without changing value
        local_entities_update = gr.Dropdown(visible=show_local_entities_init)
        comprehend_entities_update = gr.Dropdown(visible=show_comprehend_entities_init)
        llm_entities_update = gr.Dropdown(visible=is_llm_method_init)
        walkthrough_pii_identification_method_drop_update = gr.update(
            visible=show_pii_method, value=NO_REDACTION_PII_OPTION
        )

    return (
        walkthrough_pii_identification_method_drop_update,  # walkthrough_pii_identification_method_drop
        local_entities_update,  # walkthrough_in_redact_entities
        comprehend_entities_update,  # walkthrough_in_redact_comprehend_entities
        gr.update(visible=is_llm_method_init),  # walkthrough_llm_entities_accordion
        llm_entities_update,  # walkthrough_in_redact_llm_entities
        gr.update(
            visible=is_redact_all_pii_or_selected_terms
        ),  # walkthrough_list_accordion
        gr.update(
            visible=is_redact_all_pii_or_selected_terms
        ),  # walkthrough_max_fuzzy_spelling_mistakes_num
    )


# Update visibility of PII-related components and accordions when general redaction method is selected
def handle_main_redaction_method_selection(redaction_method, pii_method):
    """Wrapper that applies handle_redaction_method_selection and updates accordion visibility.

    handle_redaction_method_selection returns (for walkthrough): pii_drop, local_entities,
    comprehend_entities, llm_accordion_visible, llm_entities, list_accordion, checkbox, num.
    The main app expects: pii_drop, local_entities, comprehend_entities, llm_entities,
    custom_llm_entities_accordion, list_accordion, checkbox, num, entity_accordion, terms_accordion.
    So we remap: use inner[4] for in_redact_llm_entities and set custom_llm_entities_accordion
    visibility (same rule as handle_main_pii_method_selection; inner gr.update() would hide the box).
    """
    raw = list(handle_redaction_method_selection(redaction_method, pii_method))
    is_redact_all_pii = redaction_method == "Redact all PII"
    is_redact_selected_terms = redaction_method == "Redact selected terms"
    is_extract_text_only = (
        isinstance(redaction_method, str)
        and redaction_method.strip() == "Extract text only"
    )
    show_pii_method = (
        is_redact_all_pii or is_redact_selected_terms
    ) and SHOW_PII_IDENTIFICATION_OPTIONS
    show_selected_terms_lists = is_redact_selected_terms
    is_llm_method_for_custom_instructions = _is_llm_pii_method_shown_for_redaction(
        redaction_method, pii_method
    )
    # Map to main app outputs: pii_drop, local_entities, comprehend_entities, llm_entities,
    # custom_llm_entities_accordion, list_accordion, checkbox, num,
    # then entity/terms accordions, then only_extract_text_radio.
    # raw[3] is walkthrough llm accordion visibility (unused here); raw[4] is llm_entities.
    # When "Extract text only" is selected, force "Only extract text (no redaction)" checkbox to True.
    results = [
        raw[0],  # pii_identification_method_drop
        raw[1],  # in_redact_entities
        raw[2],  # in_redact_comprehend_entities
        raw[
            4
        ],  # in_redact_llm_entities (was wrongly going to textbox as str(["CUSTOM"]) -> "['CUSTOM']")
        gr.update(
            visible=is_llm_method_for_custom_instructions
        ),  # custom_llm_entities_accordion
        raw[5],  # walkthrough_list_accordion
        raw[6],  # max_fuzzy_spelling_mistakes_num
        gr.update(visible=show_pii_method),  # entity_types_to_redact_accordion
        gr.update(visible=show_selected_terms_lists),  # terms_accordion
        gr.update(value=is_extract_text_only),  # only_extract_text_radio
    ]
    return results


def handle_pii_method_selection(pii_method: str):
    """Handle PII method selection - show appropriate entity dropdowns."""
    # When value is None/empty (e.g. first .change() after loading an example sets the
    # component programmatically), avoid hiding all entity selectors by defaulting to Local.
    if pii_method is None or (isinstance(pii_method, str) and not pii_method.strip()):
        show_local_entities = True
        show_comprehend_entities = False
        is_llm_method = False
    else:
        # Check if method is Local
        show_local_entities = pii_method == LOCAL_PII_OPTION
        # Check if method is AWS Comprehend
        show_comprehend_entities = pii_method == AWS_PII_OPTION
        # Check if method is an LLM option
        is_llm_method = (
            pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
            or pii_method == INFERENCE_SERVER_PII_OPTION
            or pii_method == AWS_LLM_PII_OPTION
        )

    # Use gr.update(visible=...) only to avoid value resets that can trigger change
    # events on the target components and cause loading loops (e.g. tabular PII -> LLM).
    # Only return to the two entity dropdowns and the accordion; components inside the
    # accordion (walkthrough_in_redact_llm_entities, walkthrough_custom_llm_instructions_textbox)
    # are shown/hidden by the accordion visibility.
    return (
        gr.update(visible=show_local_entities),  # walkthrough_in_redact_entities
        gr.update(
            visible=show_comprehend_entities
        ),  # walkthrough_in_redact_comprehend_entities
        gr.update(visible=is_llm_method),  # walkthrough_llm_entities_accordion
    )


def handle_pii_method_selection_tabular(pii_method: str):
    """Handle tabular PII method selection. Updates only accordion visibility for the
    LLM block; leaves walkthrough_in_redact_llm_entities and
    walkthrough_custom_llm_instructions_textbox as no-ops to avoid loading spinners
    hanging on those nested components when switching to LLM (AWS Bedrock).
    """
    if pii_method is None or (isinstance(pii_method, str) and not pii_method.strip()):
        show_local_entities = True
        show_comprehend_entities = False
        is_llm_method = False
    else:
        show_local_entities = pii_method == LOCAL_PII_OPTION
        show_comprehend_entities = pii_method == AWS_PII_OPTION
        is_llm_method = (
            pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
            or pii_method == INFERENCE_SERVER_PII_OPTION
            or pii_method == AWS_LLM_PII_OPTION
        )
    return (
        gr.update(visible=show_local_entities),
        gr.update(visible=show_comprehend_entities),
        gr.update(visible=is_llm_method),  # accordion controls visibility of LLM block
    )


def handle_step_3_next(
    text_extract_method_val,
    local_ocr_method_val,
    handwrite_signature_val,
    pii_method_val,
    redact_entities_val,
    redact_comprehend_entities_val,
    redact_llm_entities_val,
    custom_llm_instructions_val,
    deny_list_val,
    allow_list_val,
    fully_redacted_list_val,
    pii_method_tabular_val,
    anon_strategy_val,
    do_initial_clean_val,
    redact_duplicate_pages_val,
    max_fuzzy_spelling_mistakes_num_val,
):
    """Handle step 3 next button - write values to main components."""
    # Update text extraction method with walkthrough value
    text_extract_method_update = (
        gr.Radio(value=text_extract_method_val)
        if text_extract_method_val
        else gr.Radio()
    )

    # Update OCR components with walkthrough values
    local_ocr_update = (
        gr.Radio(value=local_ocr_method_val) if local_ocr_method_val else gr.Radio()
    )
    handwrite_signature_update = (
        gr.CheckboxGroup(value=handwrite_signature_val)
        if handwrite_signature_val
        else gr.CheckboxGroup()
    )

    # Update PII components with walkthrough values
    pii_method_update = gr.Radio(value=pii_method_val) if pii_method_val else gr.Radio()
    # Always update dropdowns with the value, even if it's an empty list
    # This ensures that empty selections are correctly written to main components
    redact_entities_update = (
        gr.Dropdown(value=redact_entities_val)
        if redact_entities_val is not None
        else gr.Dropdown()
    )
    redact_comprehend_entities_update = (
        gr.Dropdown(value=redact_comprehend_entities_val)
        if redact_comprehend_entities_val is not None
        else gr.Dropdown()
    )
    redact_llm_entities_update = (
        gr.Dropdown(value=redact_llm_entities_val)
        if redact_llm_entities_val is not None
        else gr.Dropdown()
    )
    custom_llm_instructions_update = (
        gr.Textbox(value=custom_llm_instructions_val)
        if custom_llm_instructions_val is not None
        else gr.Textbox()
    )

    # Update deny/allow/fully redacted list components with walkthrough values
    # Convert DataFrame to list if needed (for backward compatibility)
    # Ensure all items are strings for Dropdown components
    if deny_list_val is not None:
        if isinstance(deny_list_val, pd.DataFrame):
            deny_list_val = (
                deny_list_val.iloc[:, 0].tolist() if not deny_list_val.empty else []
            )
        # Ensure all items are strings
        if isinstance(deny_list_val, list):
            deny_list_val = (
                [str(item) for item in deny_list_val if item] if deny_list_val else []
            )
        deny_list_update = (
            gr.Dropdown(value=deny_list_val) if deny_list_val else gr.Dropdown()
        )
    else:
        deny_list_update = gr.Dropdown()

    if allow_list_val is not None:
        if isinstance(allow_list_val, pd.DataFrame):
            allow_list_val = (
                allow_list_val.iloc[:, 0].tolist() if not allow_list_val.empty else []
            )
        # Ensure all items are strings
        if isinstance(allow_list_val, list):
            allow_list_val = (
                [str(item) for item in allow_list_val if item] if allow_list_val else []
            )
        allow_list_update = (
            gr.Dropdown(value=allow_list_val) if allow_list_val else gr.Dropdown()
        )
    else:
        allow_list_update = gr.Dropdown()

    if fully_redacted_list_val is not None:
        if isinstance(fully_redacted_list_val, pd.DataFrame):
            fully_redacted_list_val = (
                fully_redacted_list_val.iloc[:, 0].tolist()
                if not fully_redacted_list_val.empty
                else []
            )
        # Ensure all items are strings
        if isinstance(fully_redacted_list_val, list):
            fully_redacted_list_val = (
                [str(item) for item in fully_redacted_list_val if item]
                if fully_redacted_list_val
                else []
            )
        fully_redacted_list_update = (
            gr.Dropdown(value=fully_redacted_list_val)
            if fully_redacted_list_val
            else gr.Dropdown()
        )
    else:
        fully_redacted_list_update = gr.Dropdown()

    # Update tabular data components with walkthrough values
    pii_method_tabular_update = (
        gr.Radio(value=pii_method_tabular_val)
        if pii_method_tabular_val is not None
        else gr.Radio()
    )
    anon_strategy_update = (
        gr.Radio(value=anon_strategy_val)
        if anon_strategy_val is not None
        else gr.Radio()
    )
    do_initial_clean_update = (
        gr.Checkbox(value=do_initial_clean_val)
        if do_initial_clean_val is not None
        else gr.Checkbox()
    )

    # Update redact duplicate pages checkbox with walkthrough value
    redact_duplicate_pages_update = (
        gr.Checkbox(value=redact_duplicate_pages_val)
        if redact_duplicate_pages_val is not None
        else gr.Checkbox()
    )

    # Update max fuzzy spelling mistakes number with walkthrough value
    max_fuzzy_spelling_mistakes_num_update = (
        gr.Number(value=max_fuzzy_spelling_mistakes_num_val)
        if max_fuzzy_spelling_mistakes_num_val is not None
        else gr.Number()
    )

    return (
        text_extract_method_update,  # text_extract_method_radio
        local_ocr_update,  # local_ocr_method_radio
        handwrite_signature_update,  # handwrite_signature_checkbox
        pii_method_update,  # pii_identification_method_drop
        redact_entities_update,  # in_redact_entities
        redact_comprehend_entities_update,  # in_redact_comprehend_entities
        redact_llm_entities_update,  # in_redact_llm_entities
        custom_llm_instructions_update,  # custom_llm_instructions_textbox
        deny_list_update,  # in_deny_list_state
        allow_list_update,  # in_allow_list_state
        fully_redacted_list_update,  # in_fully_redacted_list_state
        pii_method_tabular_update,  # pii_identification_method_drop_tabular
        anon_strategy_update,  # anon_strategy
        do_initial_clean_update,  # do_initial_clean
        redact_duplicate_pages_update,  # redact_duplicate_pages_checkbox
        gr.Walkthrough(selected=4),  # walkthrough
        max_fuzzy_spelling_mistakes_num_update,  # max_fuzzy_spelling_mistakes_num
    )


def handle_step_4_next(
    page_min_val,
    page_max_val,
    textract_output_found_val,
    relevant_ocr_output_with_words_found_val,
    total_pdf_page_count_val,
    estimated_aws_costs_val,
    estimated_time_taken_val,
    cost_code_dataframe_val,
    cost_code_choice_val,
):
    """Handle step 4 next button - write values to main components."""
    # Update page selection components
    page_min_update = (
        gr.Number(value=page_min_val) if page_min_val is not None else gr.Number()
    )
    page_max_update = (
        gr.Number(value=page_max_val) if page_max_val is not None else gr.Number()
    )

    # Update cost-related components (if SHOW_COSTS is True)
    textract_output_found_update = (
        gr.Checkbox(value=textract_output_found_val)
        if textract_output_found_val is not None
        else gr.Checkbox()
    )
    relevant_ocr_output_with_words_found_update = (
        gr.Checkbox(value=relevant_ocr_output_with_words_found_val)
        if relevant_ocr_output_with_words_found_val is not None
        else gr.Checkbox()
    )
    total_pdf_page_count_update = (
        gr.Number(value=total_pdf_page_count_val)
        if total_pdf_page_count_val is not None
        else gr.Number()
    )
    estimated_aws_costs_update = (
        gr.Number(value=estimated_aws_costs_val)
        if estimated_aws_costs_val is not None
        else gr.Number()
    )
    estimated_time_taken_update = (
        gr.Number(value=estimated_time_taken_val)
        if estimated_time_taken_val is not None
        else gr.Number()
    )

    # Update cost code components (if GET_COST_CODES or ENFORCE_COST_CODES is True)
    cost_code_dataframe_update = (
        gr.Dataframe(value=cost_code_dataframe_val)
        if cost_code_dataframe_val is not None
        else gr.Dataframe()
    )
    cost_code_choice_update = (
        gr.Dropdown(value=cost_code_choice_val)
        if cost_code_choice_val is not None
        else gr.Dropdown()
    )

    return (
        page_min_update,  # page_min
        page_max_update,  # page_max
        textract_output_found_update,  # textract_output_found_checkbox
        relevant_ocr_output_with_words_found_update,  # relevant_ocr_output_with_words_found_checkbox
        total_pdf_page_count_update,  # total_pdf_page_count
        estimated_aws_costs_update,  # estimated_aws_costs_number
        estimated_time_taken_update,  # estimated_time_taken_number
        cost_code_dataframe_update,  # cost_code_dataframe
        cost_code_choice_update,  # cost_code_choice_drop
        gr.Walkthrough(selected=5),  # walkthrough
    )


def sync_walkthrough_outputs_to_original(summary_text, output_file_value):
    """Sync walkthrough output components to original components.

    This function takes the outputs from the redaction process and duplicates
    them to both walkthrough and original output components.

    Args:
        summary_text: The output summary text
        output_file_value: The output file value

    Returns:
        Tuple of (walkthrough_summary, walkthrough_file, original_summary, original_file)
    """
    return (
        summary_text,  # walkthrough_redaction_output_summary_textbox
        output_file_value,  # walkthrough_output_file
        summary_text,  # redaction_output_summary_textbox (original)
        output_file_value,  # output_file (original)
    )


def sync_walkthrough_tabular_outputs_to_original(summary_text, output_file_value):
    """Sync walkthrough tabular output components to original components.

    This function takes the outputs from the tabular redaction process and duplicates
    them to both walkthrough and original output components.

    Args:
        summary_text: The output summary text
        output_file_value: The output file value

    Returns:
        Tuple of (walkthrough_summary, walkthrough_file, original_summary, original_file)
    """
    return (
        summary_text,  # walkthrough_text_output_summary
        output_file_value,  # walkthrough_text_output_file
        summary_text,  # text_output_summary (original)
        output_file_value,  # text_output_file (original)
    )


def update_step_3_tabular_visibility(is_data_file):
    """Update visibility of Step 3 components based on file type.

    When a data file (CSV/Excel) is chosen: show tabular options, hide document-only options.
    When a document is chosen: show document options (PII method, duplicate pages, etc.), hide tabular options.

    Args:
        is_data_file: Boolean indicating if uploaded file is a data file

    Returns:
        Tuple of visibility updates for document-only and tabular components
    """
    show_doc = not is_data_file
    return (
        gr.update(visible=show_doc),  # walkthrough_local_ocr_method_radio
        gr.update(visible=show_doc),  # walkthrough_pii_identification_method_drop
        gr.update(visible=show_doc),  # walkthrough_fully_redacted_list_state
        gr.update(visible=show_doc),  # walkthrough_redact_duplicate_pages_checkbox
        gr.update(
            visible=is_data_file
        ),  # walkthrough_pii_identification_method_drop_tabular
        gr.update(visible=is_data_file),  # walkthrough_anon_strategy
        gr.update(visible=is_data_file),  # walkthrough_do_initial_clean
    )


def update_step_4_visibility(is_data_file):
    """Update visibility of Step 4 components based on file type.

    Args:
        is_data_file: Boolean indicating if uploaded file is a data file

    Returns:
        Tuple of visibility updates for document and tabular components
    """
    # For Row components, we need to update visibility of children
    # Return updates for button and both output components in each row
    return (
        gr.update(visible=not is_data_file),  # step_4_next_document_redact_btn
        gr.update(visible=is_data_file),  # step_4_next_tabular_redact_btn
    )


def handle_main_text_extract_method_selection(text_extract_method: str):
    """Handle text extraction method selection for main components - show local OCR options only if Local OCR model is selected,
    and show AWS Textract settings only if AWS Textract is selected.

    Args:
        text_extract_method: Selected text extraction method

    Returns:
        Tuple of visibility updates for local OCR accordion, inference server accordion, and AWS Textract accordion
    """
    # Normalize (Gradio can send None when .change() fires before sync); default so something stays visible
    if isinstance(text_extract_method, str):
        text_extract_method = text_extract_method.strip()
    if text_extract_method is None or text_extract_method == "":
        text_extract_method = LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION

    # Show local OCR method accordion only if "Local OCR model - PDFs without selectable text" is selected
    # When "AWS Bedrock VLM OCR" is selected, the local OCR method is automatically set to "bedrock-vlm" but the component is hidden
    show_local_ocr = text_extract_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
    # Show AWS Textract settings accordion only if "AWS Textract service - all PDF types" is selected
    show_aws_textract = (
        text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION
        and SHOW_AWS_TEXT_EXTRACTION_OPTIONS
    )
    # Show inference server VLM model accordion only if local OCR is selected (not Bedrock VLM) and the option is enabled
    show_inference_server = (
        text_extract_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
        and SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS
    )

    return (
        gr.update(visible=show_local_ocr),  # local_ocr_method_accordion
        gr.update(
            visible=show_inference_server
        ),  # inference_server_vlm_model_accordion
        gr.update(visible=show_aws_textract),  # aws_textract_signature_accordion
    )


def handle_main_pii_method_selection(pii_method):
    """Handle PII method selection for main components - show appropriate entity dropdowns and hide all if No PII redaction is selected.

    Args:
        pii_method: Selected PII detection method

    Returns:
        Tuple of visibility updates for PII method dropdown, local entities accordion, comprehend entities accordion,
        LLM entities accordion, and LLM custom instructions accordion
    """
    # Normalize string (Gradio can send whitespace)
    if isinstance(pii_method, str):
        pii_method = pii_method.strip()
    # When value is None/empty (e.g. .change() fired before component synced), default to Local so at least one section is visible (e.g. when user clicked Local)
    if pii_method is None or pii_method == "":
        return (
            gr.update(visible=True),  #  local_entities
            gr.update(visible=False),  # comprehend_entities
            gr.update(visible=False),  # llm_entities
            gr.update(visible=False),  # llm_custom_instructions
        )

    # Check if "No PII redaction" is selected
    is_no_redaction = pii_method == NO_REDACTION_PII_OPTION

    # If no redaction, hide all PII-related components
    if is_no_redaction:
        return (
            gr.update(visible=False),  # local_entities
            gr.update(visible=False),  # comprehend_entities
            gr.update(visible=False),  # llm_entities
            gr.update(visible=False),  # llm_custom_instructions
        )

    # Check if method is Local
    show_local_entities = pii_method == LOCAL_PII_OPTION
    # Check if method is AWS Comprehend
    show_comprehend_entities = pii_method == AWS_PII_OPTION
    # Check if method is an LLM option
    is_llm_method = (
        pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
        or pii_method == INFERENCE_SERVER_PII_OPTION
        or pii_method == AWS_LLM_PII_OPTION
    )

    return (
        gr.update(visible=show_local_entities),  # local_entities
        gr.update(visible=show_comprehend_entities),  # comprehend_entities
        gr.update(visible=is_llm_method),  # llm_entities
        gr.update(visible=is_llm_method),  # llm_custom_instructions
    )