diff --git "a/cli_redact.py" "b/cli_redact.py" new file mode 100644--- /dev/null +++ "b/cli_redact.py" @@ -0,0 +1,2447 @@ +import argparse +import os +import re +import time +import uuid +from datetime import datetime + +import pandas as pd + +from tools.aws_functions import download_file_from_s3, export_outputs_to_s3 +from tools.config import ( + ACCESS_LOGS_FOLDER, + ALLOW_LIST_PATH, + AWS_ACCESS_KEY, + AWS_LLM_PII_OPTION, + AWS_PII_OPTION, + AWS_REGION, + AWS_SECRET_KEY, + AZURE_OPENAI_API_KEY, + AZURE_OPENAI_INFERENCE_ENDPOINT, + CHOSEN_COMPREHEND_ENTITIES, + CHOSEN_LLM_ENTITIES, + CHOSEN_LLM_PII_INFERENCE_METHOD, + CHOSEN_REDACT_ENTITIES, + CLOUD_LLM_PII_MODEL_CHOICE, + CLOUD_VLM_MODEL_CHOICE, + COMPRESS_REDACTED_PDF, + CUSTOM_ENTITIES, + DEFAULT_COMBINE_PAGES, + DEFAULT_COST_CODE, + DEFAULT_DUPLICATE_DETECTION_THRESHOLD, + DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, + DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, + DEFAULT_INFERENCE_SERVER_PII_MODEL, + DEFAULT_INFERENCE_SERVER_VLM_MODEL, + DEFAULT_LANGUAGE, + DEFAULT_LOCAL_OCR_MODEL, + DEFAULT_MIN_CONSECUTIVE_PAGES, + DEFAULT_MIN_WORD_COUNT, + DEFAULT_TABULAR_ANONYMISATION_STRATEGY, + DENY_LIST_PATH, + DIRECT_MODE_DEFAULT_USER, + DISPLAY_FILE_NAMES_IN_LOGS, + DO_INITIAL_TABULAR_DATA_CLEAN, + DOCUMENT_REDACTION_BUCKET, + EFFICIENT_OCR, + EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION, + EFFICIENT_OCR_MIN_WORDS, + FEEDBACK_LOGS_FOLDER, + FULL_COMPREHEND_ENTITY_LIST, + FULL_ENTITY_LIST, + FULL_LLM_ENTITY_LIST, + GEMINI_API_KEY, + GRADIO_TEMP_DIR, + HYBRID_TEXTRACT_BEDROCK_VLM, + IMAGES_DPI, + INFERENCE_SERVER_API_URL, + INFERENCE_SERVER_PII_OPTION, + INPUT_FOLDER, + LLM_MAX_NEW_TOKENS, + LLM_PII_INFERENCE_METHODS, + LLM_TEMPERATURE, + LOCAL_OCR_MODEL_OPTIONS, + LOCAL_PII_OPTION, + LOCAL_TRANSFORMERS_LLM_PII_OPTION, + OCR_FIRST_PASS_MAX_WORKERS, + OUTPUT_FOLDER, + OVERWRITE_EXISTING_OCR_RESULTS, + PADDLE_MODEL_PATH, + PREPROCESS_LOCAL_OCR_IMAGES, + REMOVE_DUPLICATE_ROWS, + RETURN_REDACTED_PDF, + RUN_AWS_FUNCTIONS, + S3_OUTPUTS_BUCKET, + S3_OUTPUTS_FOLDER, + S3_USAGE_LOGS_FOLDER, + SAVE_LOGS_TO_CSV, + SAVE_LOGS_TO_DYNAMODB, + SAVE_OUTPUTS_TO_S3, + SAVE_PAGE_OCR_VISUALISATIONS, + SESSION_OUTPUT_FOLDER, + SPACY_MODEL_PATH, + SUMMARY_PAGE_GROUP_MAX_WORKERS, + TEXTRACT_JOBS_LOCAL_LOC, + TEXTRACT_JOBS_S3_LOC, + TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, + TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, + TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, + USAGE_LOGS_FOLDER, + USE_GREEDY_DUPLICATE_DETECTION, + WHOLE_PAGE_REDACTION_LIST_PATH, + convert_string_to_boolean, +) + + +def _generate_session_hash() -> str: + """Generate a unique session hash for logging purposes.""" + return str(uuid.uuid4())[:8] + + +def _sanitize_folder_name(folder_name: str, max_length: int = 50) -> str: + """ + Sanitize folder name for S3 compatibility. + + Replaces 'strange' characters (anything that's not alphanumeric, dash, underscore, or full stop) + with underscores, and limits the length to max_length characters. + + Args: + folder_name: Original folder name to sanitize + max_length: Maximum length for the folder name (default: 50) + + Returns: + Sanitized folder name + """ + if not folder_name: + return folder_name + + # Replace any character that's not alphanumeric, dash, underscore, or full stop with underscore + # This handles @, commas, exclamation marks, spaces, etc. 
+ sanitized = re.sub(r"[^a-zA-Z0-9._-]", "_", folder_name) + + # Limit length to max_length + if len(sanitized) > max_length: + sanitized = sanitized[:max_length] + + return sanitized + + +def get_username_and_folders( + username: str = "", + output_folder_textbox: str = OUTPUT_FOLDER, + input_folder_textbox: str = INPUT_FOLDER, + session_output_folder: bool = SESSION_OUTPUT_FOLDER, + textract_document_upload_input_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, + textract_document_upload_output_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, + s3_textract_document_logs_subfolder: str = TEXTRACT_JOBS_S3_LOC, + local_textract_document_logs_subfolder: str = TEXTRACT_JOBS_LOCAL_LOC, +): + + # Generate session hash for logging. Either from input user name or generated + if username: + out_session_hash = username + else: + out_session_hash = _generate_session_hash() + + # Sanitize session hash for S3 compatibility (especially important for S3 folder paths) + sanitized_session_hash = _sanitize_folder_name(out_session_hash) + + if session_output_folder: + output_folder = output_folder_textbox + sanitized_session_hash + "/" + input_folder = input_folder_textbox + sanitized_session_hash + "/" + + textract_document_upload_input_folder = ( + textract_document_upload_input_folder + "/" + sanitized_session_hash + ) + textract_document_upload_output_folder = ( + textract_document_upload_output_folder + "/" + sanitized_session_hash + ) + + s3_textract_document_logs_subfolder = ( + s3_textract_document_logs_subfolder + "/" + sanitized_session_hash + ) + local_textract_document_logs_subfolder = ( + local_textract_document_logs_subfolder + "/" + sanitized_session_hash + "/" + ) + + else: + output_folder = output_folder_textbox + input_folder = input_folder_textbox + + if not os.path.exists(output_folder): + os.mkdir(output_folder) + if not os.path.exists(input_folder): + os.mkdir(input_folder) + + return ( + out_session_hash, + output_folder, + out_session_hash, + input_folder, + textract_document_upload_input_folder, + textract_document_upload_output_folder, + s3_textract_document_logs_subfolder, + local_textract_document_logs_subfolder, + ) + + +def _get_env_list(env_var_name: str) -> list[str]: + """Parses a comma-separated environment variable into a list of strings.""" + value = env_var_name[1:-1].strip().replace('"', "").replace("'", "") + if not value: + return [] + # Split by comma and filter out any empty strings that might result from extra commas + return [s.strip() for s in value.split(",") if s.strip()] + + +def _download_s3_file_if_needed( + file_path: str, default_filename: str = "downloaded_file" +) -> str: + """ + Download a file from S3 if the path starts with 's3://' or 'S3://', otherwise return the path as-is. 
+ + Args: + file_path: File path (either local or S3 URL) + default_filename: Default filename to use if S3 key doesn't have a filename + + Returns: + Local file path (downloaded from S3 or original path) + """ + if not file_path: + return file_path + + # Check for S3 URL (case-insensitive) + file_path_stripped = file_path.strip() + file_path_upper = file_path_stripped.upper() + if not file_path_upper.startswith("S3://"): + return file_path + + # Use GRADIO_TEMP_DIR if available, otherwise use INPUT_FOLDER as fallback + temp_dir = GRADIO_TEMP_DIR if GRADIO_TEMP_DIR else INPUT_FOLDER + os.makedirs(temp_dir, exist_ok=True) + + # Parse S3 URL: s3://bucket/key (preserve original case for bucket/key) + # Remove 's3://' prefix (case-insensitive) + s3_path = ( + file_path_stripped.split("://", 1)[1] + if "://" in file_path_stripped + else file_path_stripped + ) + # Split bucket and key (first '/' separates bucket from key) + if "/" in s3_path: + bucket_name_s3, s3_key = s3_path.split("/", 1) + else: + # If no key provided, use bucket name as key (unlikely but handle it) + bucket_name_s3 = s3_path + s3_key = "" + + # Get the filename from the S3 key + filename = os.path.basename(s3_key) if s3_key else bucket_name_s3 + if not filename: + filename = default_filename + + # Create local file path in temp directory + local_file_path = os.path.join(temp_dir, filename) + + # Download file from S3 + try: + download_file_from_s3( + bucket_name=bucket_name_s3, + key=s3_key, + local_file_path_and_name=local_file_path, + ) + print(f"S3 file downloaded successfully: {file_path} -> {local_file_path}") + return local_file_path + except Exception as e: + print(f"Error downloading file from S3 ({file_path}): {e}") + raise Exception(f"Failed to download file from S3: {e}") + + +def _build_s3_output_folder( + s3_outputs_folder: str, + session_hash: str, + save_to_user_folders: bool, +) -> str: + """ + Build the S3 output folder path with session hash and date suffix if needed. + + Args: + s3_outputs_folder: Base S3 folder path + session_hash: Session hash/username + save_to_user_folders: Whether to append session hash to folder path + + Returns: + Final S3 folder path with session hash and date suffix + """ + if not s3_outputs_folder: + return "" + + # Append session hash if save_to_user_folders is enabled + if save_to_user_folders and session_hash: + sanitized_session_hash = _sanitize_folder_name(session_hash) + s3_outputs_folder = ( + s3_outputs_folder.rstrip("/") + "/" + sanitized_session_hash + "/" + ) + else: + # Ensure trailing slash + if not s3_outputs_folder.endswith("/"): + s3_outputs_folder = s3_outputs_folder + "/" + + # Append today's date (YYYYMMDD/) + today_suffix = datetime.now().strftime("%Y%m%d") + "/" + s3_outputs_folder = s3_outputs_folder.rstrip("/") + "/" + today_suffix + + return s3_outputs_folder + + +# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. 
titles, streetnames, UK postcodes that are sometimes missed by comprehend +CHOSEN_COMPREHEND_ENTITIES.extend(CUSTOM_ENTITIES) +FULL_COMPREHEND_ENTITY_LIST.extend(CUSTOM_ENTITIES) + +chosen_redact_entities = CHOSEN_REDACT_ENTITIES +full_entity_list = FULL_ENTITY_LIST +chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES +full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST +chosen_llm_entities = CHOSEN_LLM_ENTITIES +full_llm_entity_list = FULL_LLM_ENTITY_LIST +default_handwrite_signature_checkbox = DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX + + +# --- Main CLI Function --- +def main(direct_mode_args={}): + """ + A unified command-line interface to prepare, redact, and anonymise various document types. + + Args: + direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution. + If provided, uses these instead of parsing command line arguments. + """ + parser = argparse.ArgumentParser( + description="A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.", + formatter_class=argparse.RawTextHelpFormatter, + epilog=""" +Examples: + +To run these, you need to do the following: + +- Open a terminal window + +- CD to the app folder that contains this file (cli_redact.py) + +- Load the virtual environment using either conda or venv depending on your setup + +- Run one of the example commands below + +- Look in the output/ folder to see output files: + +# Redaction + +## Redact a PDF with default settings (local OCR): +python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf + +## Extract text from a PDF only (i.e. no redaction), using local OCR: +python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None + +## Extract text from a PDF only (i.e. 
no redaction), using local OCR, with a whole page redaction list: +python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector Local --local_redact_entities CUSTOM + +## Redact a PDF with allow list (local OCR) and custom list of redaction entities: +python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME + +## Redact a PDF with limited pages and text extraction method (local text) with custom fuzzy matching: +python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --ocr_method "Local text" --fuzzy_mistakes 3 + +## Redaction with custom deny list, allow list, and whole page redaction list: +python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/partnership_toolkit_redact_custom_deny_list.csv --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --allow_list_file example_data/test_allow_list_partnership.csv + +## Redact an image: +python cli_redact.py --input_file example_data/example_complaint_letter.jpg + +## Anonymise csv file with specific columns: +python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted + +## Anonymise csv file with a different strategy (remove text completely): +python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy redact + +## Anonymise Excel file, remove text completely: +python cli_redact.py --input_file example_data/combined_case_notes.xlsx --text_columns "Case Note" "Client" --excel_sheets combined_case_notes --anon_strategy redact + +## Anonymise a word document: +python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted + +# Redaction with AWS services: + +## Use Textract and Comprehend: +python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend" + +# LLM PII identification (entity subset and custom instructions) + +## Redact with LLM PII entity subset (NAME, EMAIL_ADDRESS, etc.) and custom instructions: +python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --llm_redact_entities NAME EMAIL_ADDRESS PHONE_NUMBER ADDRESS CUSTOM --custom_llm_instructions "Do not redact the name of the university." + +## Redact with custom LLM instructions only (use default LLM entities from config): +python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --custom_llm_instructions "Redact all company names with the label COMPANY_NAME." 
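+
+## Redact using a subset of LLM entities with a lower LLM temperature (illustrative example only; the flags are defined in the LLM PII Detection Options group below, but this exact combination has not been verified against a real run):
+python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --llm_redact_entities NAME EMAIL_ADDRESS PHONE_NUMBER --llm_temperature 0.2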
+
+## Redact specific pages with AWS OCR and signature extraction:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
+
+## Redact with AWS OCR and additional layout extraction options:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_layout
+
+# Duplicate page detection
+
+## Find duplicate pages in OCR files:
+python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
+
+## Find duplicates in OCR files at the line level:
+python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3
+
+## Find duplicate rows in tabular data:
+python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95
+
+# AWS Textract whole document analysis
+
+## Submit document to Textract for basic text analysis:
+python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+
+## Submit document to Textract for analysis with signature extraction (the job ID will be printed to the console; you need this to retrieve the results):
+python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures
+
+## Retrieve Textract results by job ID (returns a .json file output):
+python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012
+
+## List recent Textract jobs:
+python cli_redact.py --task textract --textract_action list
+
+# Document summarisation
+
+## Summarise from a PDF with AWS Bedrock:
+python cli_redact.py --task summarise --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --summarisation_inference_method "LLM (AWS Bedrock)"
+
+## Summarise document(s) from OCR output CSV(s) using AWS Bedrock:
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "LLM (AWS Bedrock)"
+
+## Summarise with local LLM and detailed format:
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "Local transformers LLM" --summarisation_format detailed
+
+## Summarise with additional context and instructions (concise format):
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_context "This is a partnership agreement" --summarisation_additional_instructions "Focus on key obligations and termination clauses" --summarisation_format concise
+
+## Summarise multiple OCR CSV files:
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv --summarisation_inference_method "LLM (AWS Bedrock)"
+
+#
Combine review PDFs + +## Merge redaction comments from multiple '_redactions_for_review' PDFs into one file: +python cli_redact.py --task combine_review_pdfs --input_file path/to/review1.pdf path/to/review2.pdf --output_dir output/ + +""", + ) + + # --- Task Selection --- + task_group = parser.add_argument_group("Task Selection") + task_group.add_argument( + "--task", + choices=[ + "redact", + "deduplicate", + "textract", + "summarise", + "combine_review_pdfs", + ], + default="redact", + help="Task to perform: redact (PII redaction/anonymisation), deduplicate (find duplicate content), textract (AWS Textract batch operations), summarise (LLM-based document summarisation from OCR CSV files), or combine_review_pdfs (merge redaction comments from multiple '_redactions_for_review' PDFs into one file).", + ) + + # --- General Arguments (apply to all file types) --- + general_group = parser.add_argument_group("General Options") + general_group.add_argument( + "--input_file", + nargs="+", + help="Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.", + ) + general_group.add_argument( + "--output_dir", default=OUTPUT_FOLDER, help="Directory for all output files." + ) + general_group.add_argument( + "--input_dir", default=INPUT_FOLDER, help="Directory for all input files." + ) + general_group.add_argument( + "--language", default=DEFAULT_LANGUAGE, help="Language of the document content." + ) + general_group.add_argument( + "--allow_list", + default=ALLOW_LIST_PATH, + help="Path to a CSV file with words to exclude from redaction.", + ) + general_group.add_argument( + "--pii_detector", + choices=[LOCAL_PII_OPTION, AWS_PII_OPTION, "None"], + default=LOCAL_PII_OPTION, + help="Core PII detection method (Local or AWS Comprehend, or None).", + ) + general_group.add_argument( + "--username", default=DIRECT_MODE_DEFAULT_USER, help="Username for the session." + ) + general_group.add_argument( + "--save_to_user_folders", + default=SESSION_OUTPUT_FOLDER, + help="Whether to save to user folders or not.", + ) + + general_group.add_argument( + "--local_redact_entities", + nargs="+", + choices=full_entity_list, + default=chosen_redact_entities, + help=f"Local redaction entities to use. Default: {chosen_redact_entities}. Full list: {full_entity_list}.", + ) + + general_group.add_argument( + "--aws_redact_entities", + nargs="+", + choices=full_comprehend_entity_list, + default=chosen_comprehend_entities, + help=f"AWS redaction entities to use. Default: {chosen_comprehend_entities}. Full list: {full_comprehend_entity_list}.", + ) + + general_group.add_argument( + "--aws_access_key", default=AWS_ACCESS_KEY, help="Your AWS Access Key ID." + ) + general_group.add_argument( + "--aws_secret_key", default=AWS_SECRET_KEY, help="Your AWS Secret Access Key." + ) + general_group.add_argument( + "--cost_code", default=DEFAULT_COST_CODE, help="Cost code for tracking usage." + ) + general_group.add_argument( + "--aws_region", default=AWS_REGION, help="AWS region for cloud services." + ) + general_group.add_argument( + "--s3_bucket", + default=DOCUMENT_REDACTION_BUCKET, + help="S3 bucket name for cloud operations.", + ) + general_group.add_argument( + "--save_outputs_to_s3", + default=SAVE_OUTPUTS_TO_S3, + help="Upload output files (redacted PDFs, anonymized documents, etc.) to S3 after processing.", + ) + general_group.add_argument( + "--s3_outputs_folder", + default=S3_OUTPUTS_FOLDER, + help="S3 folder (key prefix) for saving output files. 
If left blank, outputs will not be uploaded even if --save_outputs_to_s3 is enabled.", + ) + general_group.add_argument( + "--s3_outputs_bucket", + default=S3_OUTPUTS_BUCKET, + help="S3 bucket name for output files (defaults to --s3_bucket if not specified).", + ) + general_group.add_argument( + "--do_initial_clean", + default=DO_INITIAL_TABULAR_DATA_CLEAN, + help="Perform initial text cleaning for tabular data.", + ) + general_group.add_argument( + "--save_logs_to_csv", + default=SAVE_LOGS_TO_CSV, + help="Save processing logs to CSV files.", + ) + general_group.add_argument( + "--save_logs_to_dynamodb", + default=SAVE_LOGS_TO_DYNAMODB, + help="Save processing logs to DynamoDB.", + ) + general_group.add_argument( + "--display_file_names_in_logs", + default=DISPLAY_FILE_NAMES_IN_LOGS, + help="Include file names in log outputs.", + ) + general_group.add_argument( + "--upload_logs_to_s3", + default=RUN_AWS_FUNCTIONS, + help="Upload log files to S3 after processing.", + ) + general_group.add_argument( + "--s3_logs_prefix", + default=S3_USAGE_LOGS_FOLDER, + help="S3 prefix for usage log files.", + ) + general_group.add_argument( + "--feedback_logs_folder", + default=FEEDBACK_LOGS_FOLDER, + help="Directory for feedback log files.", + ) + general_group.add_argument( + "--access_logs_folder", + default=ACCESS_LOGS_FOLDER, + help="Directory for access log files.", + ) + general_group.add_argument( + "--usage_logs_folder", + default=USAGE_LOGS_FOLDER, + help="Directory for usage log files.", + ) + general_group.add_argument( + "--paddle_model_path", + default=PADDLE_MODEL_PATH, + help="Directory for PaddleOCR model storage.", + ) + general_group.add_argument( + "--spacy_model_path", + default=SPACY_MODEL_PATH, + help="Directory for spaCy model storage.", + ) + + # --- PDF/Image Redaction Arguments --- + pdf_group = parser.add_argument_group( + "PDF/Image Redaction Options (.pdf, .png, .jpg)" + ) + pdf_group.add_argument( + "--ocr_method", + choices=["AWS Textract", "Local OCR", "Local text"], + default="Local OCR", + help="OCR method for text extraction from images.", + ) + pdf_group.add_argument( + "--page_min", type=int, default=0, help="First page to redact." + ) + pdf_group.add_argument( + "--page_max", type=int, default=0, help="Last page to redact." 
+    )
+    pdf_group.add_argument(
+        "--images_dpi",
+        type=float,
+        default=float(IMAGES_DPI),
+        help="DPI for image processing.",
+    )
+    pdf_group.add_argument(
+        "--chosen_local_ocr_model",
+        choices=LOCAL_OCR_MODEL_OPTIONS,
+        default=DEFAULT_LOCAL_OCR_MODEL,
+        help="Local OCR model to use.",
+    )
+    pdf_group.add_argument(
+        "--preprocess_local_ocr_images",
+        default=PREPROCESS_LOCAL_OCR_IMAGES,
+        help="Preprocess images before OCR.",
+    )
+    pdf_group.add_argument(
+        "--compress_redacted_pdf",
+        default=COMPRESS_REDACTED_PDF,
+        help="Compress the final redacted PDF.",
+    )
+    pdf_group.add_argument(
+        "--return_pdf_end_of_redaction",
+        default=RETURN_REDACTED_PDF,
+        help="Return PDF at end of redaction process.",
+    )
+    pdf_group.add_argument(
+        "--deny_list_file",
+        default=DENY_LIST_PATH,
+        help="CSV file of custom words/phrases to always redact (deny list).",
+    )
+    pdf_group.add_argument(
+        "--allow_list_file",
+        default=ALLOW_LIST_PATH,
+        help="CSV file of words/phrases to exclude from redaction (allow list).",
+    )
+    pdf_group.add_argument(
+        "--redact_whole_page_file",
+        default=WHOLE_PAGE_REDACTION_LIST_PATH,
+        help="File for pages to redact completely.",
+    )
+    pdf_group.add_argument(
+        "--handwrite_signature_extraction",
+        nargs="+",
+        default=default_handwrite_signature_checkbox,
+        help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".',
+    )
+    pdf_group.add_argument(
+        "--extract_forms",
+        action="store_true",
+        help="Extract forms during Textract analysis.",
+    )
+    pdf_group.add_argument(
+        "--extract_tables",
+        action="store_true",
+        help="Extract tables during Textract analysis.",
+    )
+    pdf_group.add_argument(
+        "--extract_layout",
+        action="store_true",
+        help="Extract layout during Textract analysis.",
+    )
+    pdf_group.add_argument(
+        "--vlm_model_choice",
+        default=CLOUD_VLM_MODEL_CHOICE,
+        help="VLM model choice for OCR (e.g., 'qwen.qwen3-vl-235b-a22b' for Bedrock, or model name for other providers).",
+    )
+    pdf_group.add_argument(
+        "--inference_server_vlm_model",
+        default=DEFAULT_INFERENCE_SERVER_VLM_MODEL,
+        help="Inference server VLM model name for OCR.",
+    )
+    pdf_group.add_argument(
+        "--inference_server_api_url",
+        default=INFERENCE_SERVER_API_URL,
+        help="Inference server API URL.",
+    )
+    pdf_group.add_argument(
+        "--gemini_api_key",
+        default=GEMINI_API_KEY,
+        help="Google Gemini API key for VLM OCR.",
+    )
+    pdf_group.add_argument(
+        "--azure_openai_api_key",
+        default=AZURE_OPENAI_API_KEY,
+        help="Azure OpenAI API key for VLM OCR.",
+    )
+    pdf_group.add_argument(
+        "--azure_openai_endpoint",
+        default=AZURE_OPENAI_INFERENCE_ENDPOINT,
+        help="Azure OpenAI endpoint URL for VLM OCR.",
+    )
+    pdf_group.add_argument(
+        "--efficient_ocr",
+        action="store_true",
+        default=None,
+        help="Use efficient OCR: try selectable text first per page, run OCR only when needed (saves time/cost). Defaults to EFFICIENT_OCR config.",
+    )
+    pdf_group.add_argument(
+        "--no_efficient_ocr",
+        action="store_false",
+        dest="efficient_ocr",
+        help="Disable efficient OCR (use selected OCR method for all pages).",
+    )
+    pdf_group.add_argument(
+        "--efficient_ocr_min_words",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Minimum words on a page to use text-only route; below this use OCR. Defaults to EFFICIENT_OCR_MIN_WORDS config (e.g. 20).",
+    )
+    pdf_group.add_argument(
+        "--efficient_ocr_min_image_coverage_fraction",
+        type=float,
+        default=None,
+        metavar="F",
+        help="Efficient OCR: min fraction of page area (0-1) for an embedded image to force OCR; 0 disables.
Defaults to EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION config (e.g. 0.03).", + ) + pdf_group.add_argument( + "--ocr_first_pass_max_workers", + type=int, + default=None, + metavar="N", + help="Max threads for OCR first pass (1 = sequential). Defaults to OCR_FIRST_PASS_MAX_WORKERS config (e.g. 3).", + ) + pdf_group.add_argument( + "--hybrid_textract_bedrock_vlm", + action="store_true", + default=None, + help="When using AWS Textract, re-run low-confidence lines with Bedrock VLM for higher quality. Defaults to HYBRID_TEXTRACT_BEDROCK_VLM config.", + ) + pdf_group.add_argument( + "--no_hybrid_textract_bedrock_vlm", + action="store_false", + dest="hybrid_textract_bedrock_vlm", + help="Disable hybrid Textract + Bedrock VLM (use Textract only).", + ) + pdf_group.add_argument( + "--overwrite_existing_ocr_results", + action="store_true", + default=None, + help="Ignore cached OCR JSON files and re-run OCR. Defaults to OVERWRITE_EXISTING_OCR_RESULTS config (e.g. False).", + ) + pdf_group.add_argument( + "--no_overwrite_existing_ocr_results", + action="store_false", + dest="overwrite_existing_ocr_results", + help="Use existing OCR results when available (do not overwrite cached JSON).", + ) + pdf_group.add_argument( + "--save_page_ocr_visualisations", + action="store_true", + default=None, + help="Save page OCR visualisations (debug bounding boxes). Defaults to SAVE_PAGE_OCR_VISUALISATIONS config.", + ) + pdf_group.add_argument( + "--no_save_page_ocr_visualisations", + action="store_false", + dest="save_page_ocr_visualisations", + help="Do not save page OCR visualisations (debug bounding boxes).", + ) + + # --- LLM PII Detection Arguments --- + llm_group = parser.add_argument_group("LLM PII Detection Options") + llm_group.add_argument( + "--llm_model_choice", + default=CLOUD_LLM_PII_MODEL_CHOICE, + help="LLM model choice for PII detection. Defaults to CLOUD_LLM_PII_MODEL_CHOICE for Bedrock. " + "Note: The actual model used is determined by pii_identification_method - " + "CLOUD_LLM_PII_MODEL_CHOICE for Bedrock, INFERENCE_SERVER_LLM_PII_MODEL_CHOICE for inference server, " + "LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE for local transformers.", + ) + llm_group.add_argument( + "--llm_inference_method", + choices=LLM_PII_INFERENCE_METHODS, + default=CHOSEN_LLM_PII_INFERENCE_METHOD, + help="LLM inference method for PII detection: aws-bedrock, local, inference-server, azure-openai, or gemini.", + ) + llm_group.add_argument( + "--inference_server_pii_model", + default=DEFAULT_INFERENCE_SERVER_PII_MODEL, + help="Inference server PII detection model name.", + ) + llm_group.add_argument( + "--llm_temperature", + type=float, + default=LLM_TEMPERATURE, + help="Temperature for LLM PII detection (lower = more deterministic).", + ) + llm_group.add_argument( + "--llm_max_tokens", + type=int, + default=LLM_MAX_NEW_TOKENS, + help="Maximum tokens in LLM response for PII detection.", + ) + llm_group.add_argument( + "--llm_redact_entities", + nargs="+", + choices=full_llm_entity_list, + default=chosen_llm_entities, + help=f"Subset of entities for LLM PII detection (when pii_detector uses an LLM). Default: {chosen_llm_entities}. Full list: {full_llm_entity_list}.", + ) + llm_group.add_argument( + "--custom_llm_instructions", + default="", + help="Custom instructions for LLM-based entity detection (e.g. 
'don't redact anything related to Mark Wilson' or 'redact all company names with the label COMPANY_NAME').", + ) + + # --- Word/Tabular Anonymisation Arguments --- + tabular_group = parser.add_argument_group( + "Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)" + ) + tabular_group.add_argument( + "--anon_strategy", + choices=[ + "redact", + "redact completely", + "replace_redacted", + "entity_type", + "encrypt", + "hash", + "replace with 'REDACTED'", + "replace with ", + "mask", + "fake_first_name", + ], + default=DEFAULT_TABULAR_ANONYMISATION_STRATEGY, + help="The anonymisation strategy to apply.", + ) + tabular_group.add_argument( + "--text_columns", + nargs="+", + default=list(), + help="A list of column names to anonymise or deduplicate in tabular data.", + ) + tabular_group.add_argument( + "--excel_sheets", + nargs="+", + default=list(), + help="Specific Excel sheet names to process.", + ) + tabular_group.add_argument( + "--fuzzy_mistakes", + type=int, + default=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, + help="Number of allowed spelling mistakes for fuzzy matching.", + ) + tabular_group.add_argument( + "--match_fuzzy_whole_phrase_bool", + default=True, + help="Match fuzzy whole phrase boolean.", + ) + # --- Duplicate Detection Arguments --- + duplicate_group = parser.add_argument_group("Duplicate Detection Options") + duplicate_group.add_argument( + "--duplicate_type", + choices=["pages", "tabular"], + default="pages", + help="Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).", + ) + duplicate_group.add_argument( + "--similarity_threshold", + type=float, + default=DEFAULT_DUPLICATE_DETECTION_THRESHOLD, + help="Similarity threshold (0-1) to consider content as duplicates.", + ) + duplicate_group.add_argument( + "--min_word_count", + type=int, + default=DEFAULT_MIN_WORD_COUNT, + help="Minimum word count for text to be considered in duplicate analysis.", + ) + duplicate_group.add_argument( + "--min_consecutive_pages", + type=int, + default=DEFAULT_MIN_CONSECUTIVE_PAGES, + help="Minimum number of consecutive pages to consider as a match.", + ) + duplicate_group.add_argument( + "--greedy_match", + default=USE_GREEDY_DUPLICATE_DETECTION, + help="Use greedy matching strategy for consecutive pages.", + ) + duplicate_group.add_argument( + "--combine_pages", + default=DEFAULT_COMBINE_PAGES, + help="Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.", + ) + duplicate_group.add_argument( + "--remove_duplicate_rows", + default=REMOVE_DUPLICATE_ROWS, + help="Remove duplicate rows from the output.", + ) + + # --- Document Summarisation Arguments --- + summarisation_group = parser.add_argument_group("Document Summarisation Options") + summarisation_group.add_argument( + "--summarisation_inference_method", + choices=[ + AWS_LLM_PII_OPTION, + LOCAL_TRANSFORMERS_LLM_PII_OPTION, + INFERENCE_SERVER_PII_OPTION, + ], + default=AWS_LLM_PII_OPTION, + help="LLM inference method for summarisation (same options as GUI).", + ) + summarisation_group.add_argument( + "--summarisation_temperature", + type=float, + default=0.6, + help="Temperature for summarisation (0.0-2.0). 
Lower is more deterministic.", + ) + summarisation_group.add_argument( + "--summarisation_max_pages_per_group", + type=int, + default=30, + help="Maximum pages per page-group summary (in addition to context-length limits).", + ) + summarisation_group.add_argument( + "--summary_page_group_max_workers", + type=int, + default=SUMMARY_PAGE_GROUP_MAX_WORKERS, + metavar="N", + help="Max threads for page-group summarisation (1 = sequential). Defaults to SUMMARY_PAGE_GROUP_MAX_WORKERS config (e.g. 1).", + ) + summarisation_group.add_argument( + "--summarisation_api_key", + default="", + help="API key for summarisation (if required by the chosen LLM).", + ) + summarisation_group.add_argument( + "--summarisation_context", + default="", + help="Additional context for summarisation (e.g. 'This is a consultation response document').", + ) + summarisation_group.add_argument( + "--summarisation_format", + choices=["concise", "detailed"], + default="detailed", + help="Summary format: concise (key themes only) or detailed (as much detail as possible).", + ) + summarisation_group.add_argument( + "--summarisation_additional_instructions", + default="", + help="Additional summary instructions (e.g. 'Focus on key decisions and recommendations').", + ) + + # --- Textract Batch Operations Arguments --- + textract_group = parser.add_argument_group("Textract Batch Operations Options") + textract_group.add_argument( + "--textract_action", + choices=["submit", "retrieve", "list"], + help="Textract action to perform: submit (submit document for analysis), retrieve (get results by job ID), or list (show recent jobs).", + ) + textract_group.add_argument("--job_id", help="Textract job ID for retrieve action.") + textract_group.add_argument( + "--extract_signatures", + action="store_true", + help="Extract signatures during Textract analysis (for submit action).", + ) + textract_group.add_argument( + "--textract_bucket", + default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, + help="S3 bucket name for Textract operations (overrides default).", + ) + textract_group.add_argument( + "--textract_input_prefix", + default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, + help="S3 prefix for input files in Textract operations.", + ) + textract_group.add_argument( + "--textract_output_prefix", + default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, + help="S3 prefix for output files in Textract operations.", + ) + textract_group.add_argument( + "--s3_textract_document_logs_subfolder", + default=TEXTRACT_JOBS_S3_LOC, + help="S3 prefix for logs in Textract operations.", + ) + textract_group.add_argument( + "--local_textract_document_logs_subfolder", + default=TEXTRACT_JOBS_LOCAL_LOC, + help="Local prefix for logs in Textract operations.", + ) + textract_group.add_argument( + "--poll_interval", + type=int, + default=30, + help="Polling interval in seconds for Textract job status.", + ) + textract_group.add_argument( + "--max_poll_attempts", + type=int, + default=120, + help="Maximum number of polling attempts for Textract job completion.", + ) + # Parse arguments - either from command line or direct mode + if direct_mode_args: + # Use direct mode arguments + args = argparse.Namespace(**direct_mode_args) + else: + # Parse command line arguments + args = parser.parse_args() + + # --- Handle S3 file downloads --- + # Download input files from S3 if needed + # Note: args.input_file is typically a list (from CLI nargs="+" or from direct mode) + # but we also handle pipe-separated strings for compatibility + if args.input_file: + if 
isinstance(args.input_file, list): + # Handle list of files (may include S3 paths) + downloaded_files = [] + for file_path in args.input_file: + downloaded_path = _download_s3_file_if_needed(file_path) + downloaded_files.append(downloaded_path) + args.input_file = downloaded_files + elif isinstance(args.input_file, str): + # Handle pipe-separated string (for direct mode compatibility) + if "|" in args.input_file: + file_list = [f.strip() for f in args.input_file.split("|") if f.strip()] + downloaded_files = [] + for file_path in file_list: + downloaded_path = _download_s3_file_if_needed(file_path) + downloaded_files.append(downloaded_path) + args.input_file = downloaded_files + else: + # Single file path + args.input_file = [_download_s3_file_if_needed(args.input_file)] + + # Download other file arguments from S3 if needed + if args.deny_list_file: + args.deny_list_file = _download_s3_file_if_needed( + args.deny_list_file, default_filename="downloaded_deny_list.csv" + ) + if args.allow_list_file: + args.allow_list_file = _download_s3_file_if_needed( + args.allow_list_file, default_filename="downloaded_allow_list.csv" + ) + if args.redact_whole_page_file: + args.redact_whole_page_file = _download_s3_file_if_needed( + args.redact_whole_page_file, + default_filename="downloaded_redact_whole_page.csv", + ) + + # --- Initial Setup --- + # Convert string boolean variables to boolean + if args.preprocess_local_ocr_images == "True": + args.preprocess_local_ocr_images = True + else: + args.preprocess_local_ocr_images = False + if args.greedy_match == "True": + args.greedy_match = True + else: + args.greedy_match = False + if args.combine_pages == "True": + args.combine_pages = True + else: + args.combine_pages = False + if args.remove_duplicate_rows == "True": + args.remove_duplicate_rows = True + else: + args.remove_duplicate_rows = False + if args.return_pdf_end_of_redaction == "True": + args.return_pdf_end_of_redaction = True + else: + args.return_pdf_end_of_redaction = False + if args.compress_redacted_pdf == "True": + args.compress_redacted_pdf = True + else: + args.compress_redacted_pdf = False + if args.do_initial_clean == "True": + args.do_initial_clean = True + else: + args.do_initial_clean = False + if args.save_logs_to_csv == "True": + args.save_logs_to_csv = True + else: + args.save_logs_to_csv = False + if args.save_logs_to_dynamodb == "True": + args.save_logs_to_dynamodb = True + else: + args.save_logs_to_dynamodb = False + if args.display_file_names_in_logs == "True": + args.display_file_names_in_logs = True + else: + args.display_file_names_in_logs = False + if args.match_fuzzy_whole_phrase_bool == "True": + args.match_fuzzy_whole_phrase_bool = True + else: + args.match_fuzzy_whole_phrase_bool = False + # Convert save_to_user_folders to boolean (handles both string and boolean values) + args.save_to_user_folders = convert_string_to_boolean(args.save_to_user_folders) + # Convert save_outputs_to_s3 to boolean (handles both string and boolean values) + args.save_outputs_to_s3 = convert_string_to_boolean(args.save_outputs_to_s3) + + # Combine extraction options + extraction_options = ( + list(args.handwrite_signature_extraction) + if args.handwrite_signature_extraction + else [] + ) + if args.extract_forms: + extraction_options.append("Extract forms") + if args.extract_tables: + extraction_options.append("Extract tables") + if args.extract_layout: + extraction_options.append("Extract layout") + args.handwrite_signature_extraction = extraction_options + + if args.task in ["redact", 
"deduplicate", "summarise", "combine_review_pdfs"]: + if args.input_file: + if isinstance(args.input_file, str): + args.input_file = [args.input_file] + + _, file_extension = os.path.splitext(args.input_file[0]) + file_extension = file_extension.lower() + else: + raise ValueError(f"Error: --input_file is required for '{args.task}' task.") + + # Initialise usage logger if logging is enabled + usage_logger = None + if args.save_logs_to_csv or args.save_logs_to_dynamodb: + from tools.cli_usage_logger import create_cli_usage_logger + + try: + usage_logger = create_cli_usage_logger(logs_folder=args.usage_logs_folder) + except Exception as e: + print(f"Warning: Could not initialise usage logger: {e}") + + # Get username and folders + ( + session_hash, + args.output_dir, + _, + args.input_dir, + args.textract_input_prefix, + args.textract_output_prefix, + args.s3_textract_document_logs_subfolder, + args.local_textract_document_logs_subfolder, + ) = get_username_and_folders( + username=args.username, + output_folder_textbox=args.output_dir, + input_folder_textbox=args.input_dir, + session_output_folder=args.save_to_user_folders, + textract_document_upload_input_folder=args.textract_input_prefix, + textract_document_upload_output_folder=args.textract_output_prefix, + s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, + local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder, + ) + + print( + f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}." + ) + + # Build S3 output folder path if S3 uploads are enabled + s3_output_folder = "" + if args.save_outputs_to_s3 and args.s3_outputs_folder: + s3_output_folder = _build_s3_output_folder( + s3_outputs_folder=args.s3_outputs_folder, + session_hash=session_hash, + save_to_user_folders=args.save_to_user_folders, + ) + if s3_output_folder: + print(f"S3 output folder: s3://{args.s3_outputs_bucket}/{s3_output_folder}") + elif args.save_outputs_to_s3 and not args.s3_outputs_folder: + print( + "Warning: --save_outputs_to_s3 is enabled but --s3_outputs_folder is not set. Outputs will not be uploaded to S3." + ) + + # --- Route to the Correct Workflow Based on Task and File Type --- + + # Validate input_file requirement for tasks that need it + if ( + args.task in ["redact", "deduplicate", "summarise", "combine_review_pdfs"] + and not args.input_file + ): + print(f"Error: --input_file is required for '{args.task}' task.") + return + + if args.ocr_method in ["Local OCR", "AWS Textract"]: + args.prepare_images = True + else: + args.prepare_images = False + + from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage + + # Task 1: Redaction/Anonymisation + if args.task == "redact": + + # Workflow 1: PDF/Image Redaction + if file_extension in [".pdf", ".png", ".jpg", ".jpeg"]: + print("--- Detected PDF/Image file. Starting Redaction Workflow... 
---") + start_time = time.time() + try: + from tools.file_conversion import prepare_image_or_pdf + from tools.file_redaction import choose_and_run_redactor + + # Step 1: Prepare the document + print("\nStep 1: Preparing document...") + ( + prep_summary, + prepared_pdf_paths, + image_file_paths, + _, + _, + pdf_doc, + image_annotations, + _, + original_cropboxes, + page_sizes, + _, + _, + _, + _, + _, + ) = prepare_image_or_pdf( + file_paths=args.input_file, + text_extract_method=args.ocr_method, + all_line_level_ocr_results_df=pd.DataFrame(), + all_page_line_level_ocr_results_with_words_df=pd.DataFrame(), + first_loop_state=True, + prepare_for_review=False, + output_folder=args.output_dir, + input_folder=args.input_dir, + prepare_images=args.prepare_images, + page_min=args.page_min, + page_max=args.page_max, + ) + print(f"Preparation complete. {prep_summary}") + + # Note: VLM and LLM clients are initialized inside choose_and_run_redactor + # based on text_extraction_method and pii_identification_method. + # Model choices (vlm_model_choice, llm_model_choice) can be overridden via + # environment variables (CLOUD_VLM_MODEL_CHOICE, CLOUD_LLM_PII_MODEL_CHOICE) before running the CLI. + # For CLI, we pass inference_server_vlm_model and custom_llm_instructions. + # Other LLM parameters (temperature, max_tokens, inference_method) are set via + # environment variables or config defaults. + + # Step 2: Redact the prepared document + print("\nStep 2: Running redaction...") + ( + output_summary, + output_files, + _, + _, + log_files, + _, + _, + _, + _, + _, + _, + _, + _, + _, + comprehend_query_number, + _, + _, + _, + _, + _, + _, + page_sizes, + _, + _, + _, + _, + total_textract_query_number, + _, + _, + _, + _, + _, + _, + _, + vlm_model_name, + vlm_total_input_tokens, + vlm_total_output_tokens, + llm_model_name, + llm_total_input_tokens, + llm_total_output_tokens, + _, + ) = choose_and_run_redactor( + file_paths=args.input_file, + prepared_pdf_file_paths=prepared_pdf_paths, + pdf_image_file_paths=image_file_paths, + chosen_redact_entities=args.local_redact_entities, + chosen_redact_comprehend_entities=args.aws_redact_entities, + chosen_llm_entities=args.llm_redact_entities, + text_extraction_method=args.ocr_method, + in_allow_list=args.allow_list_file, + in_deny_list=args.deny_list_file, + redact_whole_page_list=args.redact_whole_page_file, + first_loop_state=True, + page_min=args.page_min, + page_max=args.page_max, + handwrite_signature_checkbox=args.handwrite_signature_extraction, + max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes, + match_fuzzy_whole_phrase_bool=args.match_fuzzy_whole_phrase_bool, + pymupdf_doc=pdf_doc, + annotations_all_pages=image_annotations, + page_sizes=page_sizes, + document_cropboxes=original_cropboxes, + pii_identification_method=args.pii_detector, + aws_access_key_textbox=args.aws_access_key, + aws_secret_key_textbox=args.aws_secret_key, + language=args.language, + output_folder=args.output_dir, + input_folder=args.input_dir, + custom_llm_instructions=args.custom_llm_instructions, + inference_server_vlm_model=( + args.inference_server_vlm_model + if args.inference_server_vlm_model + else DEFAULT_INFERENCE_SERVER_VLM_MODEL + ), + efficient_ocr=getattr(args, "efficient_ocr", EFFICIENT_OCR), + efficient_ocr_min_words=( + args.efficient_ocr_min_words + if getattr(args, "efficient_ocr_min_words", None) is not None + else EFFICIENT_OCR_MIN_WORDS + ), + efficient_ocr_min_image_coverage_fraction=( + args.efficient_ocr_min_image_coverage_fraction + if getattr( + args, 
"efficient_ocr_min_image_coverage_fraction", None + ) + is not None + else EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION + ), + ocr_first_pass_max_workers=( + args.ocr_first_pass_max_workers + if getattr(args, "ocr_first_pass_max_workers", None) is not None + else OCR_FIRST_PASS_MAX_WORKERS + ), + hybrid_textract_bedrock_vlm=getattr( + args, "hybrid_textract_bedrock_vlm", HYBRID_TEXTRACT_BEDROCK_VLM + ), + overwrite_existing_ocr_results=getattr( + args, + "overwrite_existing_ocr_results", + OVERWRITE_EXISTING_OCR_RESULTS, + ), + save_page_ocr_visualisations=( + getattr(args, "save_page_ocr_visualisations", None) + if getattr(args, "save_page_ocr_visualisations", None) + is not None + else SAVE_PAGE_OCR_VISUALISATIONS + ), + # Note: bedrock_runtime, gemini_client, gemini_config, azure_openai_client + # are initialized inside choose_and_run_redactor based on text_extraction_method + # but we can pass vlm_model_choice through custom_llm_instructions or other means + # The clients will be initialized in choose_and_run_redactor based on the method + ) + + # Calculate processing time + end_time = time.time() + processing_time = end_time - start_time + + # Log usage data if logger is available + if usage_logger: + try: + # Extract file name for logging + print("Saving logs to CSV") + doc_file_name = ( + os.path.basename(args.input_file[0]) + if args.display_file_names_in_logs + else "document" + ) + data_file_name = "" # Not applicable for PDF/image redaction + + # Determine if this was a Textract API call + is_textract_call = args.ocr_method == "AWS Textract" + + # Count pages (approximate from page_sizes if available) + total_pages = len(page_sizes) if page_sizes else 1 + + # Count API calls (approximate - would need to be tracked in the redaction function) + textract_queries = ( + int(total_textract_query_number) if is_textract_call else 0 + ) + comprehend_queries = ( + int(comprehend_query_number) + if args.pii_detector == "AWS Comprehend" + else 0 + ) + + # Format handwriting/signature options + handwriting_signature = ( + ", ".join(args.handwrite_signature_extraction) + if args.handwrite_signature_extraction + else "" + ) + + log_redaction_usage( + logger=usage_logger, + session_hash=session_hash, + doc_file_name=doc_file_name, + data_file_name=data_file_name, + time_taken=processing_time, + total_pages=total_pages, + textract_queries=textract_queries, + pii_method=args.pii_detector, + comprehend_queries=comprehend_queries, + cost_code=args.cost_code, + handwriting_signature=handwriting_signature, + text_extraction_method=args.ocr_method, + is_textract_call=is_textract_call, + task=args.task, + save_to_dynamodb=args.save_logs_to_dynamodb, + save_to_s3=args.upload_logs_to_s3, + s3_bucket=args.s3_bucket, + s3_key_prefix=args.s3_logs_prefix, + vlm_model_name=vlm_model_name, + vlm_total_input_tokens=vlm_total_input_tokens, + vlm_total_output_tokens=vlm_total_output_tokens, + llm_model_name=llm_model_name, + llm_total_input_tokens=llm_total_input_tokens, + llm_total_output_tokens=llm_total_output_tokens, + ) + except Exception as e: + print(f"Warning: Could not log usage data: {e}") + + print("\n--- Redaction Process Complete ---") + print(f"Summary: {output_summary}") + print(f"Processing time: {processing_time:.2f} seconds") + print(f"\nOutput files saved to: {args.output_dir}") + print("Generated Files:", sorted(output_files)) + if log_files: + print("Log Files:", sorted(log_files)) + + # Upload output files to S3 if enabled + if args.save_outputs_to_s3 and s3_output_folder and output_files: + 
print("\n--- Uploading output files to S3 ---") + try: + # Get base file name for organizing outputs + ( + os.path.splitext(os.path.basename(args.input_file[0]))[0] + if args.input_file + else None + ) + export_outputs_to_s3( + file_list_state=output_files, + s3_output_folder_state_value=s3_output_folder, + save_outputs_to_s3_flag=args.save_outputs_to_s3, + base_file_state=( + args.input_file[0] if args.input_file else None + ), + s3_bucket=args.s3_outputs_bucket, + ) + except Exception as e: + print(f"Warning: Could not upload output files to S3: {e}") + + except Exception as e: + print( + f"\nAn error occurred during the PDF/Image redaction workflow: {e}" + ) + + # Workflow 2: Word/Tabular Data Anonymisation + elif file_extension in [".docx", ".xlsx", ".xls", ".csv", ".parquet"]: + print( + "--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---" + ) + start_time = time.time() + try: + from tools.data_anonymise import anonymise_files_with_open_text + + # Note: anonymise_files_with_open_text initializes LLM clients internally + # based on pii_identification_method. LLM model choices and parameters + # can be set via environment variables (CLOUD_LLM_PII_MODEL_CHOICE, LLM_TEMPERATURE, etc.) + # before running the CLI. + + # Run the anonymisation function directly + ( + output_summary, + output_files, + _, + _, + log_files, + _, + processing_time, + comprehend_query_number, + _, + _, + _, + ) = anonymise_files_with_open_text( + file_paths=args.input_file, + in_text="", # Not used for file-based operations + anon_strategy=args.anon_strategy, + chosen_cols=args.text_columns, + chosen_redact_entities=args.local_redact_entities, + in_allow_list=args.allow_list_file, + in_excel_sheets=args.excel_sheets, + first_loop_state=True, + output_folder=args.output_dir, + in_deny_list=args.deny_list_file, + max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes, + pii_identification_method=args.pii_detector, + chosen_redact_comprehend_entities=args.aws_redact_entities, + aws_access_key_textbox=args.aws_access_key, + aws_secret_key_textbox=args.aws_secret_key, + language=args.language, + do_initial_clean=args.do_initial_clean, + ) + + # Calculate processing time + end_time = time.time() + processing_time = end_time - start_time + + # Log usage data if logger is available + if usage_logger: + try: + print("Saving logs to CSV") + # Extract file name for logging + doc_file_name = "" # Not applicable for tabular data + data_file_name = ( + os.path.basename(args.input_file[0]) + if args.display_file_names_in_logs + else "data_file" + ) + + # Determine if this was a Textract API call (not applicable for tabular) + is_textract_call = False + + # Count pages (not applicable for tabular data) + total_pages = 0 + + # Count API calls (approximate - would need to be tracked in the anonymisation function) + textract_queries = 0 # Not applicable for tabular data + comprehend_queries = ( + comprehend_query_number + if args.pii_detector == "AWS Comprehend" + else 0 + ) + + # Format handwriting/signature options (not applicable for tabular) + handwriting_signature = "" + + log_redaction_usage( + logger=usage_logger, + session_hash=session_hash, + doc_file_name=doc_file_name, + data_file_name=data_file_name, + time_taken=processing_time, + total_pages=total_pages, + textract_queries=textract_queries, + pii_method=args.pii_detector, + comprehend_queries=comprehend_queries, + cost_code=args.cost_code, + handwriting_signature=handwriting_signature, + text_extraction_method="tabular", # Indicate this is tabular 
processing + is_textract_call=is_textract_call, + task=args.task, + save_to_dynamodb=args.save_logs_to_dynamodb, + save_to_s3=args.upload_logs_to_s3, + s3_bucket=args.s3_bucket, + s3_key_prefix=args.s3_logs_prefix, + vlm_model_name="", # TODO: Track from perform_ocr + vlm_total_input_tokens=0, # TODO: Track from perform_ocr + vlm_total_output_tokens=0, # TODO: Track from perform_ocr + llm_model_name="", # TODO: Track from anonymise_script + llm_total_input_tokens=0, # TODO: Track from anonymise_script + llm_total_output_tokens=0, # TODO: Track from anonymise_script + ) + except Exception as e: + print(f"Warning: Could not log usage data: {e}") + + print("\n--- Anonymisation Process Complete ---") + print(f"Summary: {output_summary}") + print(f"Processing time: {processing_time:.2f} seconds") + print(f"\nOutput files saved to: {args.output_dir}") + print("Generated Files:", sorted(output_files)) + if log_files: + print("Log Files:", sorted(log_files)) + + # Upload output files to S3 if enabled + if args.save_outputs_to_s3 and s3_output_folder and output_files: + print("\n--- Uploading output files to S3 ---") + try: + export_outputs_to_s3( + file_list_state=output_files, + s3_output_folder_state_value=s3_output_folder, + save_outputs_to_s3_flag=args.save_outputs_to_s3, + base_file_state=( + args.input_file[0] if args.input_file else None + ), + s3_bucket=args.s3_outputs_bucket, + ) + except Exception as e: + print(f"Warning: Could not upload output files to S3: {e}") + + except Exception as e: + print( + f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}" + ) + + else: + print(f"Error: Unsupported file type '{file_extension}' for redaction.") + print("Supported types for redaction: .pdf, .png, .jpg, .jpeg") + print( + "Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet" + ) + + # Task 2: Duplicate Detection + elif args.task == "deduplicate": + print("--- Starting Duplicate Detection Workflow... ---") + try: + from tools.find_duplicate_pages import run_duplicate_analysis + + if args.duplicate_type == "pages": + # Page duplicate detection + if file_extension == ".csv": + print( + "--- Detected OCR CSV file. Starting Page Duplicate Detection... 
---" + ) + + start_time = time.time() + + if args.combine_pages is True: + print("Combining pages...") + else: + print("Using line-level duplicate detection...") + + # Load the CSV file as a list for the duplicate analysis function + ( + results_df, + output_paths, + full_data_by_file, + processing_time, + task_textbox, + _, + _, + _, + ) = run_duplicate_analysis( + files=args.input_file, + threshold=args.similarity_threshold, + min_words=args.min_word_count, + min_consecutive=args.min_consecutive_pages, + greedy_match=args.greedy_match, + combine_pages=args.combine_pages, + output_folder=args.output_dir, + all_page_line_level_ocr_results_df_base=pd.DataFrame(), + ocr_df_paths_list=[], + ) + + end_time = time.time() + processing_time = end_time - start_time + + print("\n--- Page Duplicate Detection Complete ---") + print(f"Found {len(results_df)} duplicate matches") + print(f"\nOutput files saved to: {args.output_dir}") + if output_paths: + print("Generated Files:", sorted(output_paths)) + + # Upload output files to S3 if enabled + if args.save_outputs_to_s3 and s3_output_folder and output_paths: + print("\n--- Uploading output files to S3 ---") + try: + export_outputs_to_s3( + file_list_state=output_paths, + s3_output_folder_state_value=s3_output_folder, + save_outputs_to_s3_flag=args.save_outputs_to_s3, + base_file_state=( + args.input_file[0] if args.input_file else None + ), + s3_bucket=args.s3_outputs_bucket, + ) + except Exception as e: + print(f"Warning: Could not upload output files to S3: {e}") + + # Log usage for page deduplication (match app: doc name or "document", data blank) + if usage_logger: + try: + print("Saving logs to CSV") + doc_file_name = ( + os.path.basename(args.input_file[0]) + if args.display_file_names_in_logs and args.input_file + else "document" + ) + data_file_name = "" # Not applicable for page dedup + log_redaction_usage( + logger=usage_logger, + session_hash=session_hash, + doc_file_name=doc_file_name, + data_file_name=data_file_name, + time_taken=processing_time, + total_pages=0, + textract_queries=0, + comprehend_queries=0, + pii_method=args.pii_detector, + cost_code=args.cost_code, + handwriting_signature="", + text_extraction_method=args.ocr_method, + is_textract_call=False, + task=args.task, + save_to_dynamodb=args.save_logs_to_dynamodb, + save_to_s3=args.upload_logs_to_s3, + s3_bucket=args.s3_bucket, + s3_key_prefix=args.s3_logs_prefix, + vlm_model_name="", + vlm_total_input_tokens=0, + vlm_total_output_tokens=0, + llm_model_name="", + llm_total_input_tokens=0, + llm_total_output_tokens=0, + ) + except Exception as e: + print(f"Warning: Could not log usage data: {e}") + + else: + print( + "Error: Page duplicate detection requires CSV files with OCR data." 
+ ) + print("Please provide a CSV file containing OCR output data.") + + # Log usage data if logger is available + if usage_logger: + try: + # Extract file name for logging + print("Saving logs to CSV") + doc_file_name = ( + os.path.basename(args.input_file[0]) + if args.display_file_names_in_logs + else "document" + ) + data_file_name = ( + "" # Not applicable for page duplicate detection + ) + + # Determine if this was a Textract API call + is_textract_call = False + + # Page count is not tracked for page deduplication (match app) + total_pages = 0 + + # No Textract or Comprehend API calls are made during duplicate detection + textract_queries = 0 + comprehend_queries = 0 + + # Format handwriting/signature options + handwriting_signature = "" + + log_redaction_usage( + logger=usage_logger, + session_hash=session_hash, + doc_file_name=doc_file_name, + data_file_name=data_file_name, + time_taken=processing_time, + total_pages=total_pages, + textract_queries=textract_queries, + pii_method=args.pii_detector, + comprehend_queries=comprehend_queries, + cost_code=args.cost_code, + handwriting_signature=handwriting_signature, + text_extraction_method=args.ocr_method, + is_textract_call=is_textract_call, + task=args.task, + save_to_dynamodb=args.save_logs_to_dynamodb, + save_to_s3=args.upload_logs_to_s3, + s3_bucket=args.s3_bucket, + s3_key_prefix=args.s3_logs_prefix, + vlm_model_name="", # Not applicable for duplicate detection + vlm_total_input_tokens=0, + vlm_total_output_tokens=0, + llm_model_name="", # Not applicable for duplicate detection + llm_total_input_tokens=0, + llm_total_output_tokens=0, + ) + except Exception as e: + print(f"Warning: Could not log usage data: {e}") + + elif args.duplicate_type == "tabular": + # Tabular duplicate detection + from tools.find_duplicate_tabular import run_tabular_duplicate_detection + + if file_extension in [".csv", ".xlsx", ".xls", ".parquet"]: + print( + "--- Detected tabular file. Starting Tabular Duplicate Detection... 
---" + ) + + start_time = time.time() + + ( + results_df, + output_paths, + full_data_by_file, + processing_time, + task_textbox, + ) = run_tabular_duplicate_detection( + files=args.input_file, + threshold=args.similarity_threshold, + min_words=args.min_word_count, + text_columns=args.text_columns, + output_folder=args.output_dir, + do_initial_clean_dup=args.do_initial_clean, + in_excel_tabular_sheets=args.excel_sheets, + remove_duplicate_rows=args.remove_duplicate_rows, + ) + + end_time = time.time() + processing_time = end_time - start_time + + # Log usage data if logger is available + if usage_logger: + try: + # Extract file name for logging + print("Saving logs to CSV") + doc_file_name = "" # Tabular dedup: no doc (match app) + data_file_name = ( + os.path.basename(args.input_file[0]) + if args.display_file_names_in_logs and args.input_file + else "data_file" + ) + + is_textract_call = False + total_pages = 0 # Tabular dedup: no page count (match app) + textract_queries = 0 + comprehend_queries = 0 + handwriting_signature = "" + + log_redaction_usage( + logger=usage_logger, + session_hash=session_hash, + doc_file_name=doc_file_name, + data_file_name=data_file_name, + time_taken=processing_time, + total_pages=total_pages, + textract_queries=textract_queries, + pii_method=args.pii_detector, + comprehend_queries=comprehend_queries, + cost_code=args.cost_code, + handwriting_signature=handwriting_signature, + text_extraction_method=args.ocr_method, + is_textract_call=is_textract_call, + task=args.task, + save_to_dynamodb=args.save_logs_to_dynamodb, + save_to_s3=args.upload_logs_to_s3, + s3_bucket=args.s3_bucket, + s3_key_prefix=args.s3_logs_prefix, + vlm_model_name="", # Not applicable for duplicate detection + vlm_total_input_tokens=0, + vlm_total_output_tokens=0, + llm_model_name="", # Not applicable for duplicate detection + llm_total_input_tokens=0, + llm_total_output_tokens=0, + ) + except Exception as e: + print(f"Warning: Could not log usage data: {e}") + + print("\n--- Tabular Duplicate Detection Complete ---") + print(f"Found {len(results_df)} duplicate matches") + print(f"\nOutput files saved to: {args.output_dir}") + if output_paths: + print("Generated Files:", sorted(output_paths)) + + # Upload output files to S3 if enabled + if args.save_outputs_to_s3 and s3_output_folder and output_paths: + print("\n--- Uploading output files to S3 ---") + try: + export_outputs_to_s3( + file_list_state=output_paths, + s3_output_folder_state_value=s3_output_folder, + save_outputs_to_s3_flag=args.save_outputs_to_s3, + base_file_state=( + args.input_file[0] if args.input_file else None + ), + s3_bucket=args.s3_outputs_bucket, + ) + except Exception as e: + print(f"Warning: Could not upload output files to S3: {e}") + + else: + print( + "Error: Tabular duplicate detection requires CSV, Excel, or Parquet files." + ) + print("Supported types: .csv, .xlsx, .xls, .parquet") + else: + print(f"Error: Invalid duplicate type '{args.duplicate_type}'.") + print("Valid options: 'pages' or 'tabular'") + + except Exception as e: + print(f"\nAn error occurred during the duplicate detection workflow: {e}") + + # Task 3: Textract Batch Operations + elif args.task == "textract": + print("--- Starting Textract Batch Operations Workflow... 
---") + + if not args.textract_action: + print("Error: --textract_action is required for textract task.") + print("Valid options: 'submit', 'retrieve', or 'list'") + return + + try: + if args.textract_action == "submit": + from tools.textract_batch_call import ( + analyse_document_with_textract_api, + load_in_textract_job_details, + ) + + # Submit document to Textract for analysis + if not args.input_file: + print("Error: --input_file is required for submit action.") + return + + print(f"--- Submitting document to Textract: {args.input_file} ---") + + start_time = time.time() + + # Load existing job details + job_df = load_in_textract_job_details( + load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, + load_local_jobs_loc=args.local_textract_document_logs_subfolder, + ) + + # Determine signature extraction options + signature_options = ( + ["Extract handwriting", "Extract signatures"] + if args.extract_signatures + else ["Extract handwriting"] + ) + + # Use configured bucket or override + textract_bucket = args.textract_bucket if args.textract_bucket else "" + + # Submit the job + ( + result_message, + job_id, + job_type, + successful_job_number, + is_textract_call, + total_pages, + task_textbox, + ) = analyse_document_with_textract_api( + local_pdf_path=args.input_file, + s3_input_prefix=args.textract_input_prefix, + s3_output_prefix=args.textract_output_prefix, + job_df=job_df, + s3_bucket_name=textract_bucket, + general_s3_bucket_name=args.s3_bucket, + local_output_dir=args.output_dir, + handwrite_signature_checkbox=signature_options, + aws_region=args.aws_region, + ) + + end_time = time.time() + processing_time = end_time - start_time + + print("\n--- Textract Job Submitted Successfully ---") + print(f"Job ID: {job_id}") + print(f"Job Type: {job_type}") + print(f"Message: {result_message}") + print(f"Results will be available in: {args.output_dir}") + + # Log usage data if logger is available + if usage_logger: + try: + # Extract file name for logging + print("Saving logs to CSV") + doc_file_name = ( + os.path.basename(args.input_file[0]) + if args.display_file_names_in_logs + else "document" + ) + data_file_name = "" + + # Determine if this was a Textract API call + is_textract_call = True + args.ocr_method == "AWS Textract" + + # Count API calls (approximate - would need to be tracked in the redaction function) + textract_queries = total_pages + comprehend_queries = 0 + + # Format handwriting/signature options + handwriting_signature = "" + + log_redaction_usage( + logger=usage_logger, + session_hash=session_hash, + doc_file_name=doc_file_name, + data_file_name=data_file_name, + time_taken=processing_time, + total_pages=total_pages, + textract_queries=textract_queries, + pii_method=args.pii_detector, + comprehend_queries=comprehend_queries, + cost_code=args.cost_code, + handwriting_signature=handwriting_signature, + text_extraction_method=args.ocr_method, + is_textract_call=is_textract_call, + task=args.task, + save_to_dynamodb=args.save_logs_to_dynamodb, + save_to_s3=args.upload_logs_to_s3, + s3_bucket=args.s3_bucket, + s3_key_prefix=args.s3_logs_prefix, + vlm_model_name="", # Not applicable for Textract submit + vlm_total_input_tokens=0, + vlm_total_output_tokens=0, + llm_model_name="", # Not applicable for Textract submit + llm_total_input_tokens=0, + llm_total_output_tokens=0, + ) + except Exception as e: + print(f"Warning: Could not log usage data: {e}") + + elif args.textract_action == "retrieve": + print(f"--- Retrieving Textract results for Job ID: {args.job_id} ---") 
+ + from tools.textract_batch_call import ( + load_in_textract_job_details, + poll_whole_document_textract_analysis_progress_and_download, + ) + + # Retrieve results by job ID + if not args.job_id: + print("Error: --job_id is required for retrieve action.") + return + + # Load existing job details to get job type + print("Loading existing job details...") + job_df = load_in_textract_job_details( + load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, + load_local_jobs_loc=args.local_textract_document_logs_subfolder, + ) + + # Find job type from the dataframe + job_type = "document_text_detection" # default + if not job_df.empty and "job_id" in job_df.columns: + matching_jobs = job_df.loc[job_df["job_id"] == args.job_id] + if not matching_jobs.empty and "job_type" in matching_jobs.columns: + job_type = matching_jobs.iloc[0]["job_type"] + + # Use configured bucket or override + textract_bucket = args.textract_bucket if args.textract_bucket else "" + + # Poll for completion and download results + print("Polling for completion and downloading results...") + downloaded_file_path, job_status, updated_job_df, output_filename = ( + poll_whole_document_textract_analysis_progress_and_download( + job_id=args.job_id, + job_type_dropdown=job_type, + s3_output_prefix=args.textract_output_prefix, + pdf_filename="", # Will be determined from job details + job_df=job_df, + s3_bucket_name=textract_bucket, + load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, + load_local_jobs_loc=args.local_textract_document_logs_subfolder, + local_output_dir=args.output_dir, + poll_interval_seconds=args.poll_interval, + max_polling_attempts=args.max_poll_attempts, + ) + ) + + print("\n--- Textract Results Retrieved Successfully ---") + print(f"Job Status: {job_status}") + print(f"Downloaded File: {downloaded_file_path}") + # print(f"Output Filename: {output_filename}") + + elif args.textract_action == "list": + from tools.textract_batch_call import load_in_textract_job_details + + # List recent Textract jobs + print("--- Listing Recent Textract Jobs ---") + + job_df = load_in_textract_job_details( + load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, + load_local_jobs_loc=args.local_textract_document_logs_subfolder, + ) + + if job_df.empty: + print("No recent Textract jobs found.") + else: + print(f"\nFound {len(job_df)} recent Textract jobs:") + print("-" * 80) + for _, job in job_df.iterrows(): + print(f"Job ID: {job.get('job_id', 'N/A')}") + print(f"File: {job.get('file_name', 'N/A')}") + print(f"Type: {job.get('job_type', 'N/A')}") + print(f"Signatures: {job.get('signature_extraction', 'N/A')}") + print(f"Date: {job.get('job_date_time', 'N/A')}") + print("-" * 80) + + else: + print(f"Error: Invalid textract_action '{args.textract_action}'.") + print("Valid options: 'submit', 'retrieve', or 'list'") + + except Exception as e: + print(f"\nAn error occurred during the Textract workflow: {e}") + + elif args.task == "summarise": + print("--- Document Summarisation ---") + try: + from tools.cli_usage_logger import log_redaction_usage + from tools.file_conversion import is_pdf + from tools.summaries import ( + concise_summary_format_prompt, + detailed_summary_format_prompt, + load_csv_files_to_dataframe, + summarise_document_wrapper, + ) + + # Map format choice to prompt string (same as GUI) + format_map = { + "concise": concise_summary_format_prompt, + "detailed": detailed_summary_format_prompt, + } + summarise_format_radio = format_map.get( + args.summarisation_format, detailed_summary_format_prompt + ) + 
+ # Normalise input to list of paths + input_paths = ( + [args.input_file] + if isinstance(args.input_file, str) + else list(args.input_file or []) + ) + input_paths = [p for p in input_paths if p and str(p).strip()] + + # If any input is a PDF, extract text first then summarise (same as app.py) + summarise_from_pdf = any(is_pdf(p) for p in input_paths) + if summarise_from_pdf: + pdf_path = next((p for p in input_paths if is_pdf(p)), None) + if not pdf_path: + print("Error: No PDF path found in input files.") + return + print( + f"Detected PDF input. Extracting text with '{args.ocr_method}' then summarising..." + ) + from tools.file_conversion import prepare_image_or_pdf + from tools.file_redaction import choose_and_run_redactor + + prepare_images = args.ocr_method in ["Local OCR", "AWS Textract"] + ( + _prep_summary, + prepared_pdf_paths, + image_file_paths, + _, + _, + pdf_doc, + image_annotations, + _, + original_cropboxes, + page_sizes, + _, + _, + _, + _, + _, + _, + ) = prepare_image_or_pdf( + file_paths=[pdf_path], + text_extract_method=args.ocr_method, + all_line_level_ocr_results_df=pd.DataFrame(), + all_page_line_level_ocr_results_with_words_df=pd.DataFrame(), + first_loop_state=True, + prepare_for_review=False, + output_folder=args.output_dir, + input_folder=args.input_dir, + prepare_images=prepare_images, + page_min=args.page_min, + page_max=args.page_max, + ) + print(f" {_prep_summary}") + + ( + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + ocr_df, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + ) = choose_and_run_redactor( + file_paths=[pdf_path], + prepared_pdf_file_paths=prepared_pdf_paths, + pdf_image_file_paths=image_file_paths, + chosen_redact_entities=args.local_redact_entities or [], + chosen_redact_comprehend_entities=args.aws_redact_entities or [], + chosen_llm_entities=args.llm_redact_entities or [], + text_extraction_method=args.ocr_method, + in_allow_list=args.allow_list_file, + in_deny_list=args.deny_list_file, + redact_whole_page_list=args.redact_whole_page_file, + first_loop_state=True, + page_min=args.page_min, + page_max=args.page_max, + handwrite_signature_checkbox=args.handwrite_signature_extraction + or [], + max_fuzzy_spelling_mistakes_num=getattr( + args, "fuzzy_mistakes", DEFAULT_FUZZY_SPELLING_MISTAKES_NUM + ), + match_fuzzy_whole_phrase_bool=getattr( + args, "match_fuzzy_whole_phrase_bool", True + ), + pymupdf_doc=pdf_doc, + annotations_all_pages=image_annotations, + page_sizes=page_sizes, + document_cropboxes=original_cropboxes, + pii_identification_method=args.pii_detector or "Local", + aws_access_key_textbox=args.aws_access_key or "", + aws_secret_key_textbox=args.aws_secret_key or "", + language=args.language, + output_folder=args.output_dir, + input_folder=args.input_dir, + custom_llm_instructions=args.custom_llm_instructions or "", + inference_server_vlm_model=( + getattr(args, "inference_server_vlm_model", None) + or DEFAULT_INFERENCE_SERVER_VLM_MODEL + ), + efficient_ocr=getattr(args, "efficient_ocr", EFFICIENT_OCR), + efficient_ocr_min_words=( + getattr(args, "efficient_ocr_min_words", None) + or EFFICIENT_OCR_MIN_WORDS + ), + efficient_ocr_min_image_coverage_fraction=( + getattr(args, "efficient_ocr_min_image_coverage_fraction", None) + if getattr( + args, "efficient_ocr_min_image_coverage_fraction", None + ) + is not None + else EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION + ), + ocr_first_pass_max_workers=( + getattr(args, "ocr_first_pass_max_workers", 
None) + or OCR_FIRST_PASS_MAX_WORKERS + ), + hybrid_textract_bedrock_vlm=getattr( + args, "hybrid_textract_bedrock_vlm", HYBRID_TEXTRACT_BEDROCK_VLM + ), + overwrite_existing_ocr_results=getattr( + args, + "overwrite_existing_ocr_results", + OVERWRITE_EXISTING_OCR_RESULTS, + ), + save_page_ocr_visualisations=( + getattr(args, "save_page_ocr_visualisations", None) + if getattr(args, "save_page_ocr_visualisations", None) + is not None + else SAVE_PAGE_OCR_VISUALISATIONS + ), + text_extraction_only=True, + ) + + if ocr_df is None or ( + isinstance(ocr_df, pd.DataFrame) and ocr_df.empty + ): + print("Error: No OCR text extracted from PDF. Cannot summarise.") + return + + # Derive file_name from PDF path (same as app.py _file_name_from_pdf_path) + basename = os.path.basename(pdf_path) + file_name = os.path.splitext(basename)[0][:20] + invalid_chars = '<>:"/\\|?*' + for char in invalid_chars: + file_name = file_name.replace(char, "_") + file_name = file_name if file_name else "document" + else: + # CSV path: load OCR CSV file(s) + ocr_df = load_csv_files_to_dataframe(input_paths) + if ocr_df is None or ocr_df.empty: + print( + "Error: No valid OCR data (page, line, text columns) in input file(s)." + ) + return + + first_path = input_paths[0] if input_paths else "" + if first_path: + basename = os.path.basename(first_path) + file_name = os.path.splitext(basename)[0][:20] + invalid_chars = '<>:"/\\|?*' + for char in invalid_chars: + file_name = file_name.replace(char, "_") + file_name = file_name if file_name else "document" + else: + file_name = "document" + + ( + output_files, + status_message, + llm_model_name, + llm_total_input_tokens, + llm_total_output_tokens, + summary_display_text, + elapsed_seconds, + ) = summarise_document_wrapper( + ocr_df, + args.output_dir, + args.summarisation_inference_method, + args.summarisation_api_key or "", + args.summarisation_temperature, + file_name, + args.summarisation_context or "", + args.aws_access_key or "", + args.aws_secret_key or "", + "", + AZURE_OPENAI_INFERENCE_ENDPOINT or "", + summarise_format_radio, + args.summarisation_additional_instructions or "", + args.summarisation_max_pages_per_group, + None, + ) + + processing_time = elapsed_seconds + + print(f"\n{status_message}") + if output_files: + print("Output files:") + for p in output_files: + print(f" {p}") + if summary_display_text: + print("\n--- Summary ---") + print( + summary_display_text[:2000] + + ("..." 
if len(summary_display_text) > 2000 else "") + ) + + # Usage logging (same fields as GUI summarisation success callback) + if usage_logger: + try: + first_input = input_paths[0] if input_paths else "" + doc_file_name = ( + os.path.basename(first_input) + if args.display_file_names_in_logs and first_input + else "document" + ) + data_file_name = "" + total_pages = ( + int(ocr_df["page"].max()) + if "page" in ocr_df.columns and not ocr_df.empty + else 0 + ) + + log_redaction_usage( + logger=usage_logger, + session_hash=session_hash, + doc_file_name=doc_file_name, + data_file_name=data_file_name, + time_taken=processing_time, + total_pages=total_pages, + textract_queries=0, + pii_method=args.summarisation_inference_method, + comprehend_queries=0, + cost_code=args.cost_code, + handwriting_signature="", + text_extraction_method="", + is_textract_call=False, + task="summarisation", + save_to_dynamodb=args.save_logs_to_dynamodb, + save_to_s3=args.upload_logs_to_s3, + s3_bucket=args.s3_bucket, + s3_key_prefix=args.s3_logs_prefix, + vlm_model_name="", + vlm_total_input_tokens=0, + vlm_total_output_tokens=0, + llm_model_name=llm_model_name or "", + llm_total_input_tokens=llm_total_input_tokens or 0, + llm_total_output_tokens=llm_total_output_tokens or 0, + ) + except Exception as e: + print(f"Warning: Could not log usage data: {e}") + + except Exception as e: + print(f"\nAn error occurred during summarisation: {e}") + import traceback + + traceback.print_exc() + + elif args.task == "combine_review_pdfs": + print("--- Combine review PDFs ---") + try: + from tools.file_conversion import combine_review_pdf_files + + paths = ( + [args.input_file] + if isinstance(args.input_file, str) + else list(args.input_file) + ) + if len(paths) < 2: + print("Error: combine_review_pdfs requires at least 2 input PDF files.") + return + out_dir = args.output_dir + os.makedirs(out_dir, exist_ok=True) + result = combine_review_pdf_files(paths, output_folder=out_dir) + if result: + print(f"Combined PDF saved to: {result[0]}") + else: + print("No output produced (empty file list or no valid paths).") + except ValueError as e: + print(f"Error: {e}") + except Exception as e: + print(f"\nAn error occurred while combining review PDFs: {e}") + import traceback + + traceback.print_exc() + + else: + print(f"Error: Invalid task '{args.task}'.") + print( + "Valid options: 'redact', 'deduplicate', 'textract', 'summarise', or 'combine_review_pdfs'" + ) + + +if __name__ == "__main__": + main()