Spaces:
Runtime error
Runtime error
| """Helper functions for the quickstart walkthrough in the redaction app.""" | |
| import os | |
| import gradio as gr | |
| import pandas as pd | |
| from tools.config import ( | |
| AWS_LLM_PII_OPTION, | |
| AWS_PII_OPTION, | |
| CHOSEN_COMPREHEND_ENTITIES, | |
| CHOSEN_LLM_ENTITIES, | |
| CHOSEN_REDACT_ENTITIES, | |
| DEFAULT_PII_DETECTION_MODEL, | |
| INFERENCE_SERVER_PII_OPTION, | |
| LOCAL_PII_OPTION, | |
| LOCAL_TRANSFORMERS_LLM_PII_OPTION, | |
| NO_REDACTION_PII_OPTION, | |
| SHOW_AWS_TEXT_EXTRACTION_OPTIONS, | |
| SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS, | |
| SHOW_OCR_GUI_OPTIONS, | |
| SHOW_PII_IDENTIFICATION_OPTIONS, | |
| TESSERACT_TEXT_EXTRACT_OPTION, | |
| TEXTRACT_TEXT_EXTRACT_OPTION, | |
| ) | |
| from tools.helper_functions import put_columns_in_df | |
def is_data_file_type_walkthrough(files):
    """Report whether any uploaded file has a data-file extension.

    The recognised extensions are .xlsx, .xls, .csv, .parquet and .docx,
    compared case-insensitively.

    Args:
        files: Iterable of uploads; each entry is either a path-like value or
            an object exposing a ``name`` attribute. Falsy entries are skipped.

    Returns:
        True if at least one entry has a recognised extension, else False
        (including when ``files`` is empty or None).
    """
    if not files:
        return False

    tabular_suffixes = {".xlsx", ".xls", ".csv", ".parquet", ".docx"}
    # Upload objects carry their path in ``.name``; plain values are stringified.
    candidate_paths = (
        entry.name if hasattr(entry, "name") else str(entry)
        for entry in files
        if entry
    )
    return any(
        os.path.splitext(path)[1].lower() in tabular_suffixes
        for path in candidate_paths
    )
def route_walkthrough_files(files):
    """Split walkthrough uploads into document and data files and move to step 2.

    A file is treated as a data file when its extension is one of .xlsx, .xls,
    .csv, .parquet or .docx. If the upload contains any data file, only the
    data files are passed on and the three step 2 text-extraction components
    are hidden. Otherwise the document files are passed on, the text-extraction
    radio follows SHOW_OCR_GUI_OPTIONS, and the two accordions are left as is
    (they follow the radio selection).

    Args:
        files: Uploaded files; each entry is a path-like value or an object
            with a ``name`` attribute. Falsy entries are ignored.

    Returns:
        A 7-tuple: (document files or None, data files or None, is-data flag,
        ``gr.Walkthrough(selected=2)``, text-extraction radio update, and two
        accordion updates).
    """
    if not files:
        # Nothing uploaded: still advance, with the radio following the config flag.
        return (
            None,
            None,
            False,
            gr.Walkthrough(selected=2),
            gr.update(visible=SHOW_OCR_GUI_OPTIONS),
            gr.update(),
            gr.update(),
        )

    contains_data = is_data_file_type_walkthrough(files)

    tabular_suffixes = {".xlsx", ".xls", ".csv", ".parquet", ".docx"}
    documents = []
    tabular = []
    for entry in files:
        if not entry:
            continue
        path = entry.name if hasattr(entry, "name") else str(entry)
        suffix = os.path.splitext(path)[1].lower()
        bucket = tabular if suffix in tabular_suffixes else documents
        bucket.append(entry)

    if contains_data:
        # Data upload: text extraction does not apply, so hide all three components.
        return (
            None,
            tabular,
            True,
            gr.Walkthrough(selected=2),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    # Document upload: show the radio if enabled; leave the accordions unchanged.
    radio_visible = (not contains_data) and SHOW_OCR_GUI_OPTIONS
    return (
        documents,
        None,
        False,
        gr.Walkthrough(selected=2),
        gr.update(visible=radio_visible),
        gr.update(),
        gr.update(),
    )
def handle_step_2_next(
    files,
    is_data_file,
    walkthrough_colnames_val,
    walkthrough_excel_sheets_val,
    text_extract_method_val,
):
    """Handle the step 2 "next" button: sync column/sheet dropdowns and go to step 3.

    For data files the dropdowns are populated through ``put_columns_in_df``.
    A selection the user has already made in the walkthrough (anything other
    than the placeholder entry) is kept and mirrored to the main components
    instead of being overwritten by the freshly populated dropdown.

    Args:
        files: Uploaded files.
        is_data_file: Whether the upload was classified as a data file.
        walkthrough_colnames_val: Current walkthrough column selection.
        walkthrough_excel_sheets_val: Current walkthrough sheet selection.
        text_extract_method_val: Not used by this handler.

    Returns:
        A 6-tuple of updates for: walkthrough_colnames,
        walkthrough_excel_sheets, in_colnames, in_excel_sheets,
        walkthrough_text_extract_method_radio, and the walkthrough itself
        (advanced to step 3).
    """

    def _user_has_chosen(selection, placeholder):
        # A real selection is non-empty and does not start with the placeholder entry.
        return bool(
            selection and len(selection) > 0 and selection[0] != placeholder
        )

    # The text extraction radio is shown only when the OCR GUI options are enabled.
    radio_update_kwargs = {"visible": SHOW_OCR_GUI_OPTIONS}

    if not (is_data_file and files):
        # No data files: hide both walkthrough dropdowns and the main sheet
        # dropdown, leave the main column dropdown untouched, and advance.
        return (
            gr.Dropdown(visible=False),  # walkthrough_colnames
            gr.Dropdown(visible=False),  # walkthrough_excel_sheets
            gr.Dropdown(),  # in_colnames (no change)
            gr.Dropdown(visible=False),  # in_excel_sheets (hidden)
            gr.Radio(**radio_update_kwargs),  # walkthrough_text_extract_method_radio
            gr.Walkthrough(selected=3),  # walkthrough
        )

    populated_columns, populated_sheets = put_columns_in_df(files)

    columns_chosen = _user_has_chosen(
        walkthrough_colnames_val, "Choose columns to anonymise"
    )
    sheets_chosen = _user_has_chosen(
        walkthrough_excel_sheets_val, "Choose Excel sheets to anonymise"
    )

    if columns_chosen:
        # Keep the user's columns; the populated dropdown would select every
        # column and undo a narrower selection.
        main_columns = gr.Dropdown(value=walkthrough_colnames_val)
        walkthrough_columns = gr.update(
            value=walkthrough_colnames_val, visible=True
        )
    else:
        main_columns = populated_columns
        walkthrough_columns = populated_columns

    if sheets_chosen:
        main_sheets = gr.Dropdown(value=walkthrough_excel_sheets_val, visible=True)
        walkthrough_sheets = gr.update(
            value=walkthrough_excel_sheets_val, visible=True
        )
    else:
        main_sheets = populated_sheets
        walkthrough_sheets = populated_sheets

    return (
        walkthrough_columns,  # walkthrough_colnames
        walkthrough_sheets,  # walkthrough_excel_sheets
        main_columns,  # in_colnames
        main_sheets,  # in_excel_sheets
        gr.Radio(**radio_update_kwargs),  # walkthrough_text_extract_method_radio
        gr.Walkthrough(selected=3),  # walkthrough
    )
| def _data_files_fingerprint(files): | |
| """Return a stable key for the current file list to detect redundant updates.""" | |
| if not files: | |
| return () | |
| return tuple(getattr(f, "name", str(f)) for f in files if f is not None) | |
def update_step_2_on_data_file_upload(files, is_data_file, last_processed_keys=None):
    """Update the step 2 column/sheet dropdowns when data files are uploaded.

    The current file list is reduced to a fingerprint. When that fingerprint
    equals ``last_processed_keys`` (typically held in a gr.State), no-op
    updates are returned so Gradio does not re-fire ``change`` and put the
    column dropdown into an endless loading loop.

    Args:
        files: Uploaded files.
        is_data_file: Whether the upload was classified as a data file.
        last_processed_keys: Fingerprint from the previous call, or None to
            disable the unchanged-files shortcut.

    Returns:
        Always a 3-tuple ``(colnames_update, excel_sheets_update, keys)``,
        where ``keys`` is the fingerprint to store for the next call.
    """
    keys = _data_files_fingerprint(files)

    # Same files as last time: leave both dropdowns untouched.
    if last_processed_keys is not None and keys == last_processed_keys:
        return gr.update(), gr.update(), last_processed_keys

    if is_data_file and files:
        colnames_dropdown, excel_sheets_dropdown = put_columns_in_df(files)
        return colnames_dropdown, excel_sheets_dropdown, keys

    # Not a data upload: hide both dropdowns. This must stay a 3-tuple like
    # every other path; the previous code appended the keys a second time
    # when last_processed_keys was provided, returning four values.
    return gr.Dropdown(visible=False), gr.Dropdown(visible=False), keys
def handle_text_extract_method_selection(text_extract_method: str):
    """Toggle the walkthrough OCR sub-options for the chosen text extraction method.

    The local OCR method radio is shown only for TESSERACT_TEXT_EXTRACT_OPTION.
    The AWS Textract settings are shown only for TEXTRACT_TEXT_EXTRACT_OPTION
    and only when SHOW_AWS_TEXT_EXTRACTION_OPTIONS is enabled.

    Args:
        text_extract_method: Selected text extraction method. None or a blank
            string falls back to TESSERACT_TEXT_EXTRACT_OPTION, because Gradio
            can fire ``.change()`` before the value has synced.

    Returns:
        A 2-tuple of visibility updates for walkthrough_local_ocr_method_radio
        and walkthrough_handwrite_signature_checkbox.
    """
    chosen = text_extract_method
    if isinstance(chosen, str):
        chosen = chosen.strip()
    if chosen is None or chosen == "":
        # Default so that something stays visible.
        chosen = TESSERACT_TEXT_EXTRACT_OPTION

    local_ocr_visible = chosen == TESSERACT_TEXT_EXTRACT_OPTION
    textract_visible = (
        chosen == TEXTRACT_TEXT_EXTRACT_OPTION and SHOW_AWS_TEXT_EXTRACTION_OPTIONS
    )

    local_ocr_update = gr.update(visible=local_ocr_visible)
    textract_update = gr.update(visible=textract_visible)
    return local_ocr_update, textract_update
def handle_redaction_method_selection(redaction_method: str, pii_method: str):
    """Decide which step 3 PII components to show for the chosen redaction method.

    Behaviour per redaction method:
      * "Redact all PII": PII detection components are shown and the entity
        dropdowns are set to the configured default entities.
      * "Redact selected terms": same visibility, but the entity dropdowns are
        set to ``["CUSTOM"]``.
      * "Extract text only": PII components are hidden and the PII method
        dropdown is set to NO_REDACTION_PII_OPTION.
      * Any other value: treated as a visibility-only update (everything PII
        related hidden, no values changed) instead of raising.

    Args:
        redaction_method: Selected redaction method. Whitespace is stripped;
            None or empty defaults to "Redact all PII".
        pii_method: Selected PII detection method. Whitespace is stripped;
            None or empty defaults to DEFAULT_PII_DETECTION_MODEL.

    Returns:
        A 7-tuple of updates, in order, for:
        walkthrough_pii_identification_method_drop,
        walkthrough_in_redact_entities,
        walkthrough_in_redact_comprehend_entities,
        walkthrough_llm_entities_accordion,
        walkthrough_in_redact_llm_entities,
        walkthrough_list_accordion,
        walkthrough_max_fuzzy_spelling_mistakes_num.
    """
    # Normalise inputs (Gradio can send whitespace or None when .change() fires before sync).
    if isinstance(redaction_method, str):
        redaction_method = redaction_method.strip()
    if redaction_method is None or redaction_method == "":
        redaction_method = "Redact all PII"
    if isinstance(pii_method, str):
        pii_method = pii_method.strip()
    if pii_method is None or pii_method == "":
        pii_method = DEFAULT_PII_DETECTION_MODEL

    is_redact_all_pii = redaction_method == "Redact all PII"
    is_redact_selected_terms = redaction_method == "Redact selected terms"
    is_redact_all_pii_or_selected_terms = is_redact_all_pii or is_redact_selected_terms
    is_extract_text_only = redaction_method == "Extract text only"

    # When switching away from "Extract text only" the PII dropdown may still
    # hold NO_REDACTION_PII_OPTION. Fall back to the default model so exactly
    # one entity dropdown is visible, and push that value to the dropdown too.
    reset_pii_method = (
        is_redact_all_pii_or_selected_terms and pii_method == NO_REDACTION_PII_OPTION
    )
    pii_method_for_visibility = (
        DEFAULT_PII_DETECTION_MODEL if reset_pii_method else pii_method
    )

    # Both redaction modes need a PII detection method to decide what to redact.
    show_pii_method = (
        is_redact_all_pii_or_selected_terms
    ) and SHOW_PII_IDENTIFICATION_OPTIONS

    # Exactly one family of entity selectors follows the PII method.
    show_local_entities_init = show_pii_method and (
        pii_method_for_visibility == LOCAL_PII_OPTION
    )
    show_comprehend_entities_init = show_pii_method and (
        pii_method_for_visibility == AWS_PII_OPTION
    )
    is_llm_method_init = show_pii_method and (
        pii_method_for_visibility == LOCAL_TRANSFORMERS_LLM_PII_OPTION
        or pii_method_for_visibility == INFERENCE_SERVER_PII_OPTION
        or pii_method_for_visibility == AWS_LLM_PII_OPTION
    )

    if is_redact_all_pii_or_selected_terms:
        if is_redact_selected_terms:
            # Only the CUSTOM entity is relevant when redacting selected terms.
            local_entities_val = ["CUSTOM"]
            comprehend_entities_val = ["CUSTOM"]
            llm_entities_val = ["CUSTOM"]
        else:
            # Use the configured defaults; fall back to CUSTOM if config did
            # not yield a list.
            local_entities_val = (
                CHOSEN_REDACT_ENTITIES
                if isinstance(CHOSEN_REDACT_ENTITIES, list)
                else ["CUSTOM"]
            )
            comprehend_entities_val = (
                CHOSEN_COMPREHEND_ENTITIES
                if isinstance(CHOSEN_COMPREHEND_ENTITIES, list)
                else ["CUSTOM"]
            )
            llm_entities_val = (
                CHOSEN_LLM_ENTITIES
                if isinstance(CHOSEN_LLM_ENTITIES, list)
                else ["CUSTOM"]
            )
        local_entities_update = gr.Dropdown(
            visible=show_local_entities_init, value=local_entities_val
        )
        comprehend_entities_update = gr.Dropdown(
            visible=show_comprehend_entities_init, value=comprehend_entities_val
        )
        llm_entities_update = gr.Dropdown(
            visible=is_llm_method_init, value=llm_entities_val
        )
        walkthrough_pii_identification_method_drop_update = (
            gr.update(visible=show_pii_method, value=DEFAULT_PII_DETECTION_MODEL)
            if reset_pii_method
            else gr.update(visible=show_pii_method)
        )
    else:
        # "Extract text only" or an unrecognised method: change visibility
        # only, never the entity values. Without this fallback an unexpected
        # method left the update variables unbound and the return raised
        # UnboundLocalError.
        local_entities_update = gr.Dropdown(visible=show_local_entities_init)
        comprehend_entities_update = gr.Dropdown(visible=show_comprehend_entities_init)
        llm_entities_update = gr.Dropdown(visible=is_llm_method_init)
        if is_extract_text_only:
            walkthrough_pii_identification_method_drop_update = gr.update(
                visible=show_pii_method, value=NO_REDACTION_PII_OPTION
            )
        else:
            walkthrough_pii_identification_method_drop_update = gr.update(
                visible=show_pii_method
            )

    return (
        walkthrough_pii_identification_method_drop_update,  # walkthrough_pii_identification_method_drop
        local_entities_update,  # walkthrough_in_redact_entities
        comprehend_entities_update,  # walkthrough_in_redact_comprehend_entities
        gr.update(visible=is_llm_method_init),  # walkthrough_llm_entities_accordion
        llm_entities_update,  # walkthrough_in_redact_llm_entities
        gr.update(
            visible=is_redact_all_pii_or_selected_terms
        ),  # walkthrough_list_accordion
        gr.update(
            visible=is_redact_all_pii_or_selected_terms
        ),  # walkthrough_max_fuzzy_spelling_mistakes_num
    )
| # Update visibility of PII-related components and accordions when general redaction method is selected | |
def handle_main_redaction_method_selection(redaction_method, pii_method):
    """Adapt handle_redaction_method_selection's output to the main app components.

    The inner handler returns seven updates: pii_drop, local_entities,
    comprehend_entities, llm_accordion, llm_entities, list_accordion and
    max_fuzzy_spelling_mistakes_num. The main app has no LLM accordion output
    here, so that item (index 3) is dropped, a no-op update is inserted for
    custom_llm_instructions_textbox (so the ``["CUSTOM"]`` dropdown value is
    never written into the textbox), and three extra updates are appended.

    Args:
        redaction_method: Selected redaction method. Normalised the same way
            as in the inner handler (stripped; None/empty means
            "Redact all PII") so both layers agree on what is visible.
        pii_method: Selected PII detection method, passed through unchanged.

    Returns:
        A list of 10 updates, in order, for: pii_identification_method_drop,
        in_redact_entities, in_redact_comprehend_entities,
        in_redact_llm_entities, custom_llm_instructions_textbox,
        list accordion, max_fuzzy_spelling_mistakes_num,
        entity_types_to_redact_accordion, terms_accordion,
        only_extract_text_radio.
    """
    raw = list(handle_redaction_method_selection(redaction_method, pii_method))

    # Mirror the inner handler's normalisation. Comparing the raw value here
    # made a None or whitespace-padded method hide the entity accordion while
    # the inner handler showed the PII components.
    if isinstance(redaction_method, str):
        redaction_method = redaction_method.strip()
    if redaction_method is None or redaction_method == "":
        redaction_method = "Redact all PII"

    is_redact_all_pii = redaction_method == "Redact all PII"
    is_redact_selected_terms = redaction_method == "Redact selected terms"
    is_extract_text_only = redaction_method == "Extract text only"

    show_pii_method = (
        is_redact_all_pii or is_redact_selected_terms
    ) and SHOW_PII_IDENTIFICATION_OPTIONS
    show_selected_terms_lists = is_redact_selected_terms

    # raw[3] is the walkthrough LLM accordion visibility and is unused here.
    results = [
        raw[0],  # pii_identification_method_drop
        raw[1],  # in_redact_entities
        raw[2],  # in_redact_comprehend_entities
        raw[4],  # in_redact_llm_entities
        gr.update(),  # custom_llm_instructions_textbox - leave value unchanged
        raw[5],  # list accordion
        raw[6],  # max_fuzzy_spelling_mistakes_num
        gr.update(visible=show_pii_method),  # entity_types_to_redact_accordion
        gr.update(visible=show_selected_terms_lists),  # terms_accordion
        # Selecting "Extract text only" forces the only-extract-text control on.
        gr.update(value=is_extract_text_only),  # only_extract_text_radio
    ]
    return results
def handle_pii_method_selection(pii_method: str):
    """Show the entity selector that matches the chosen PII detection method.

    Only visibility is updated, never values. Value resets can trigger change
    events on the target components and cause loading loops. Components
    nested inside the LLM accordion are shown or hidden with the accordion.

    Args:
        pii_method: Selected PII detection method. None or a blank string
            (e.g. the first ``.change()`` after loading an example) is treated
            as the local method so that the selectors are not all hidden.

    Returns:
        A 3-tuple of visibility updates for walkthrough_in_redact_entities,
        walkthrough_in_redact_comprehend_entities and
        walkthrough_llm_entities_accordion.
    """
    is_blank = pii_method is None or (
        isinstance(pii_method, str) and not pii_method.strip()
    )

    if is_blank:
        local_visible, comprehend_visible, llm_visible = True, False, False
    else:
        local_visible = pii_method == LOCAL_PII_OPTION
        comprehend_visible = pii_method == AWS_PII_OPTION
        llm_visible = (
            pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
            or pii_method == INFERENCE_SERVER_PII_OPTION
            or pii_method == AWS_LLM_PII_OPTION
        )

    local_update = gr.update(visible=local_visible)  # walkthrough_in_redact_entities
    comprehend_update = gr.update(
        visible=comprehend_visible
    )  # walkthrough_in_redact_comprehend_entities
    llm_update = gr.update(visible=llm_visible)  # walkthrough_llm_entities_accordion
    return local_update, comprehend_update, llm_update
def handle_pii_method_selection_tabular(pii_method: str):
    """Show the entity selector that matches the chosen tabular PII method.

    Only the LLM accordion's visibility is updated for the LLM block. The
    components nested in it are deliberately not targeted, to avoid loading
    spinners hanging on them when switching to an LLM method.

    Args:
        pii_method: Selected tabular PII detection method. None or a blank
            string is treated as the local method.

    Returns:
        A 3-tuple of visibility updates: local entities, AWS Comprehend
        entities, and the LLM accordion.
    """
    if pii_method is None or (isinstance(pii_method, str) and not pii_method.strip()):
        visibility = (True, False, False)
    else:
        uses_llm = (
            pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
            or pii_method == INFERENCE_SERVER_PII_OPTION
            or pii_method == AWS_LLM_PII_OPTION
        )
        visibility = (
            pii_method == LOCAL_PII_OPTION,
            pii_method == AWS_PII_OPTION,
            uses_llm,
        )

    local_visible, comprehend_visible, llm_visible = visibility
    return (
        gr.update(visible=local_visible),
        gr.update(visible=comprehend_visible),
        gr.update(visible=llm_visible),  # accordion controls visibility of LLM block
    )
def handle_step_3_next(
    text_extract_method_val,
    local_ocr_method_val,
    handwrite_signature_val,
    pii_method_val,
    redact_entities_val,
    redact_comprehend_entities_val,
    redact_llm_entities_val,
    custom_llm_instructions_val,
    deny_list_val,
    allow_list_val,
    fully_redacted_list_val,
    pii_method_tabular_val,
    anon_strategy_val,
    do_initial_clean_val,
    redact_duplicate_pages_val,
    max_fuzzy_spelling_mistakes_num_val,
):
    """Handle the step 3 "next" button: copy walkthrough values to the main components.

    Each walkthrough value becomes an update carrying that value, or an empty
    (no-change) update when the value is missing. Three rules are used:
      * extraction/OCR/PII method values are written only when truthy;
      * entity selections, instructions, tabular options, the duplicate-pages
        flag and the fuzzy-mistakes number are written whenever they are not
        None, so empty selections are propagated;
      * deny/allow/fully-redacted lists are converted to lists of strings
        first and written only when non-empty.

    Returns:
        A 17-tuple of updates in the order noted on the return statement; the
        walkthrough (advanced to step 4) is the 16th item and the fuzzy
        spelling mistakes number is last.
    """

    def _write_if_truthy(component, value):
        # Empty or None selections leave the target component unchanged.
        return component(value=value) if value else component()

    def _write_if_present(component, value):
        # Only None means "no change"; empty lists, False and 0 are written.
        return component(value=value) if value is not None else component()

    def _term_list_update(terms):
        # Accept a DataFrame (first column is used, for backward
        # compatibility) or a list; falsy items are dropped and the rest
        # stringified for the Dropdown.
        if terms is None:
            return gr.Dropdown()
        if isinstance(terms, pd.DataFrame):
            terms = [] if terms.empty else terms.iloc[:, 0].tolist()
        if isinstance(terms, list):
            terms = [str(entry) for entry in terms if entry]
        return gr.Dropdown(value=terms) if terms else gr.Dropdown()

    return (
        _write_if_truthy(gr.Radio, text_extract_method_val),  # text_extract_method_radio
        _write_if_truthy(gr.Radio, local_ocr_method_val),  # local_ocr_method_radio
        _write_if_truthy(
            gr.CheckboxGroup, handwrite_signature_val
        ),  # handwrite_signature_checkbox
        _write_if_truthy(gr.Radio, pii_method_val),  # pii_identification_method_drop
        _write_if_present(gr.Dropdown, redact_entities_val),  # in_redact_entities
        _write_if_present(
            gr.Dropdown, redact_comprehend_entities_val
        ),  # in_redact_comprehend_entities
        _write_if_present(gr.Dropdown, redact_llm_entities_val),  # in_redact_llm_entities
        _write_if_present(
            gr.Textbox, custom_llm_instructions_val
        ),  # custom_llm_instructions_textbox
        _term_list_update(deny_list_val),  # in_deny_list_state
        _term_list_update(allow_list_val),  # in_allow_list_state
        _term_list_update(fully_redacted_list_val),  # in_fully_redacted_list_state
        _write_if_present(
            gr.Radio, pii_method_tabular_val
        ),  # pii_identification_method_drop_tabular
        _write_if_present(gr.Radio, anon_strategy_val),  # anon_strategy
        _write_if_present(gr.Checkbox, do_initial_clean_val),  # do_initial_clean
        _write_if_present(
            gr.Checkbox, redact_duplicate_pages_val
        ),  # redact_duplicate_pages_checkbox
        gr.Walkthrough(selected=4),  # walkthrough
        _write_if_present(
            gr.Number, max_fuzzy_spelling_mistakes_num_val
        ),  # max_fuzzy_spelling_mistakes_num
    )
def handle_step_4_next(
    page_min_val,
    page_max_val,
    textract_output_found_val,
    relevant_ocr_output_with_words_found_val,
    total_pdf_page_count_val,
    estimated_aws_costs_val,
    estimated_time_taken_val,
    cost_code_dataframe_val,
    cost_code_choice_val,
):
    """Handle the step 4 "next" button: copy walkthrough values to the main components.

    Every value that is not None is written to its target component; a None
    value yields an empty (no-change) update.

    Returns:
        A 10-tuple of updates for, in order: page_min, page_max,
        textract_output_found_checkbox,
        relevant_ocr_output_with_words_found_checkbox, total_pdf_page_count,
        estimated_aws_costs_number, estimated_time_taken_number,
        cost_code_dataframe, cost_code_choice_drop, and the walkthrough
        (advanced to step 5).
    """

    def _write_if_present(component, value):
        # None means "nothing to sync"; any other value (including 0/False) is written.
        if value is None:
            return component()
        return component(value=value)

    # (component class, walkthrough value) pairs in output order.
    targets = [
        (gr.Number, page_min_val),  # page_min
        (gr.Number, page_max_val),  # page_max
        (gr.Checkbox, textract_output_found_val),  # textract_output_found_checkbox
        (
            gr.Checkbox,
            relevant_ocr_output_with_words_found_val,
        ),  # relevant_ocr_output_with_words_found_checkbox
        (gr.Number, total_pdf_page_count_val),  # total_pdf_page_count
        (gr.Number, estimated_aws_costs_val),  # estimated_aws_costs_number
        (gr.Number, estimated_time_taken_val),  # estimated_time_taken_number
        (gr.Dataframe, cost_code_dataframe_val),  # cost_code_dataframe
        (gr.Dropdown, cost_code_choice_val),  # cost_code_choice_drop
    ]
    updates = [_write_if_present(component, value) for component, value in targets]
    updates.append(gr.Walkthrough(selected=5))  # walkthrough
    return tuple(updates)
def sync_walkthrough_outputs_to_original(summary_text, output_file_value):
    """Duplicate the redaction outputs for the walkthrough and original components.

    Args:
        summary_text: The output summary text.
        output_file_value: The output file value.

    Returns:
        Tuple of (walkthrough_summary, walkthrough_file, original_summary,
        original_file); the second pair repeats the first.
    """
    # One (summary, file) pair for the walkthrough outputs, then the same
    # pair again for the original outputs.
    output_pair = (summary_text, output_file_value)
    return output_pair * 2
def sync_walkthrough_tabular_outputs_to_original(summary_text, output_file_value):
    """Duplicate the tabular redaction outputs for the walkthrough and original components.

    Args:
        summary_text: The output summary text.
        output_file_value: The output file value.

    Returns:
        Tuple of (walkthrough_summary, walkthrough_file, original_summary,
        original_file); the second pair repeats the first.
    """
    walkthrough_outputs = (summary_text, output_file_value)
    original_outputs = (summary_text, output_file_value)
    return walkthrough_outputs + original_outputs
def update_step_3_tabular_visibility(is_data_file):
    """Switch step 3 between document-only and tabular-only components.

    Args:
        is_data_file: Whether the uploaded file is a data file.

    Returns:
        A 7-tuple of visibility updates. The first four are visible for
        documents: walkthrough_local_ocr_method_radio,
        walkthrough_pii_identification_method_drop,
        walkthrough_fully_redacted_list_state and
        walkthrough_redact_duplicate_pages_checkbox. The last three are
        visible for data files:
        walkthrough_pii_identification_method_drop_tabular,
        walkthrough_anon_strategy and walkthrough_do_initial_clean.
    """
    document_visible = not is_data_file

    # Four document-only components followed by three tabular-only components.
    document_updates = tuple(gr.update(visible=document_visible) for _ in range(4))
    tabular_updates = tuple(gr.update(visible=is_data_file) for _ in range(3))
    return document_updates + tabular_updates
def update_step_4_visibility(is_data_file):
    """Choose which Step 4 redact button is visible for the uploaded file type.

    Args:
        is_data_file: Boolean indicating if the uploaded file is a data file.

    Returns:
        Tuple of two visibility updates: the document redact button (shown
        when the upload is not a data file) and the tabular redact button
        (shown when it is).
    """
    document_button_update = gr.update(
        visible=not is_data_file
    )  # step_4_next_document_redact_btn
    tabular_button_update = gr.update(
        visible=is_data_file
    )  # step_4_next_tabular_redact_btn
    return document_button_update, tabular_button_update
def handle_main_text_extract_method_selection(text_extract_method: str):
    """Set accordion visibility for the chosen text extraction method.

    The local OCR accordion is shown only for the local OCR option. The
    inference server VLM accordion additionally requires
    SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS, and the AWS Textract accordion
    requires the Textract option together with SHOW_AWS_TEXT_EXTRACTION_OPTIONS.

    Args:
        text_extract_method: Selected text extraction method. None or an
            empty/blank string is treated as the local OCR option.

    Returns:
        Tuple of visibility updates for the local OCR accordion, the inference
        server accordion, and the AWS Textract accordion.
    """
    selected_method = text_extract_method

    # Gradio can deliver None (a .change() firing before the component has
    # synced) or padded text; fall back to local OCR so something stays visible.
    if isinstance(selected_method, str):
        selected_method = selected_method.strip()
    if selected_method is None or selected_method == "":
        selected_method = TESSERACT_TEXT_EXTRACT_OPTION

    # When "AWS Bedrock VLM OCR" is selected, the local OCR method is set to
    # "bedrock-vlm" automatically, but the component itself stays hidden.
    is_local_ocr = selected_method == TESSERACT_TEXT_EXTRACT_OPTION
    is_textract = selected_method == TEXTRACT_TEXT_EXTRACT_OPTION

    # Each optional accordion needs both the matching method and its config flag.
    inference_server_visible = is_local_ocr and SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS
    aws_textract_visible = is_textract and SHOW_AWS_TEXT_EXTRACTION_OPTIONS

    local_ocr_update = gr.update(visible=is_local_ocr)  # local_ocr_method_accordion
    inference_server_update = gr.update(
        visible=inference_server_visible
    )  # inference_server_vlm_model_accordion
    aws_textract_update = gr.update(
        visible=aws_textract_visible
    )  # aws_textract_signature_accordion
    return local_ocr_update, inference_server_update, aws_textract_update
def handle_main_pii_method_selection(pii_method):
    """Set entity-accordion visibility for the chosen PII detection method.

    Args:
        pii_method: Selected PII detection method. None or an empty/blank
            string is treated as the Local method.

    Returns:
        Tuple of four visibility updates, in order: local entities accordion,
        comprehend entities accordion, LLM entities accordion, and LLM custom
        instructions accordion.
    """
    selected_method = pii_method

    # Gradio can send surrounding whitespace; compare on the trimmed value.
    if isinstance(selected_method, str):
        selected_method = selected_method.strip()

    # Flags are ordered as:
    #   (local_entities, comprehend_entities, llm_entities, llm_custom_instructions)
    if selected_method is None or selected_method == "":
        # Value not synced yet (e.g. .change() fired early): default to Local
        # so at least one section is visible.
        visibility_flags = (True, False, False, False)
    elif selected_method == NO_REDACTION_PII_OPTION:
        # "No PII redaction": hide every PII-related section.
        visibility_flags = (False, False, False, False)
    else:
        uses_llm = selected_method in (
            LOCAL_TRANSFORMERS_LLM_PII_OPTION,
            INFERENCE_SERVER_PII_OPTION,
            AWS_LLM_PII_OPTION,
        )
        visibility_flags = (
            selected_method == LOCAL_PII_OPTION,
            selected_method == AWS_PII_OPTION,
            uses_llm,
            uses_llm,
        )

    return tuple(gr.update(visible=flag) for flag in visibility_flags)