diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000000000000000000000000000000000000..4fc07d951c8e95c3af76fe21a692ccc908fa4cfe
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,56 @@
+[run]
+source = .
+omit =
+ */tests/*
+ */test/*
+ */__pycache__/*
+ */venv/*
+ */env/*
+ */build/*
+ */dist/*
+ */cdk/*
+ */docs/*
+ */example_data/*
+ */examples/*
+ */feedback/*
+ */logs/*
+ */old_code/*
+ */output/*
+ */tmp/*
+ */usage/*
+ */tld/*
+ */tesseract/*
+ */poppler/*
+ config*.py
+ setup.py
+ lambda_entrypoint.py
+ entrypoint.sh
+ cli_redact.py
+ load_dynamo_logs.py
+ load_s3_logs.py
+ *.spec
+ Dockerfile
+ *.qmd
+ *.md
+ *.txt
+ *.yml
+ *.yaml
+ *.json
+ *.csv
+ *.env
+ *.bat
+ *.ps1
+ *.sh
+
+[report]
+exclude_lines =
+ pragma: no cover
+ def __repr__
+ if self.debug:
+ if settings.DEBUG
+ raise AssertionError
+ raise NotImplementedError
+ if 0:
+ if __name__ == .__main__.:
+ class .*\bProtocol\):
+ @(abc\.)?abstractmethod
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..16691dfeaaa00c0eb8451f7c0ab2a164b20fb961
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,40 @@
+*.url
+*.ipynb
+*.pyc
+.venv/*
+examples/*
+processing/*
+tools/__pycache__/*
+old_code/*
+tesseract/*
+poppler/*
+build/*
+dist/*
+docs/*
+build_deps/*
+user_guide/*
+cdk/config/*
+tld/*
+cdk/config/*
+cdk/cdk.out/*
+cdk/archive/*
+cdk.json
+cdk.context.json
+.quarto/*
+logs/
+output/
+input/
+feedback/
+config/
+usage/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*
+model_cache/*
+sanitized_file/*
+src/doc_redaction.egg-info/*
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..674c5a2ce45c516d0d6787bccfdc540cdd2d5791
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,8 @@
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.xls filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
+*.docx filter=lfs diff=lfs merge=lfs -text
+*.doc filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.ico filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/scripts/setup_test_data.py b/.github/scripts/setup_test_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..615d2269ad0075266f470d90cf8da7e4d1aab98e
--- /dev/null
+++ b/.github/scripts/setup_test_data.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python3
+"""
+Setup script for GitHub Actions test data.
+Creates dummy test files when example data is not available.
+"""
+
+import os
+import sys
+
+import pandas as pd
+
+
+def create_directories():
+ """Create necessary directories."""
+ dirs = ["example_data", "example_data/example_outputs"]
+
+ for dir_path in dirs:
+ os.makedirs(dir_path, exist_ok=True)
+ print(f"Created directory: {dir_path}")
+
+
+def create_dummy_pdf():
+ """Create dummy PDFs for testing."""
+
+ # Install reportlab if not available
+ try:
+ from reportlab.lib.pagesizes import letter
+ from reportlab.pdfgen import canvas
+ except ImportError:
+ import subprocess
+
+ subprocess.check_call(["pip", "install", "reportlab"])
+ from reportlab.lib.pagesizes import letter
+ from reportlab.pdfgen import canvas
+
+ try:
+ # Create the main test PDF
+ pdf_path = (
+ "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
+ )
+ print(f"Creating PDF: {pdf_path}")
+ print(f"Directory exists: {os.path.exists('example_data')}")
+
+ c = canvas.Canvas(pdf_path, pagesize=letter)
+ c.drawString(100, 750, "This is a test document for redaction testing.")
+ c.drawString(100, 700, "Email: test@example.com")
+ c.drawString(100, 650, "Phone: 123-456-7890")
+ c.drawString(100, 600, "Name: John Doe")
+ c.drawString(100, 550, "Address: 123 Test Street, Test City, TC 12345")
+ c.showPage()
+
+ # Add second page
+ c.drawString(100, 750, "Second page content")
+ c.drawString(100, 700, "More test data: jane.doe@example.com")
+ c.drawString(100, 650, "Another phone: 987-654-3210")
+ c.save()
+
+ print(f"Created dummy PDF: {pdf_path}")
+
+ # Create Partnership Agreement Toolkit PDF
+ partnership_pdf_path = "example_data/Partnership-Agreement-Toolkit_0_0.pdf"
+ print(f"Creating PDF: {partnership_pdf_path}")
+ c = canvas.Canvas(partnership_pdf_path, pagesize=letter)
+ c.drawString(100, 750, "Partnership Agreement Toolkit")
+ c.drawString(100, 700, "This is a test partnership agreement document.")
+ c.drawString(100, 650, "Contact: partnership@example.com")
+ c.drawString(100, 600, "Phone: (555) 123-4567")
+ c.drawString(100, 550, "Address: 123 Partnership Street, City, State 12345")
+ c.showPage()
+
+ # Add second page
+ c.drawString(100, 750, "Page 2 - Partnership Details")
+ c.drawString(100, 700, "More partnership information here.")
+ c.drawString(100, 650, "Contact: info@partnership.org")
+ c.showPage()
+
+ # Add third page
+ c.drawString(100, 750, "Page 3 - Terms and Conditions")
+ c.drawString(100, 700, "Terms and conditions content.")
+ c.drawString(100, 650, "Legal contact: legal@partnership.org")
+ c.save()
+
+ print(f"Created dummy PDF: {partnership_pdf_path}")
+
+ # Create Graduate Job Cover Letter PDF
+ cover_letter_pdf_path = "example_data/graduate-job-example-cover-letter.pdf"
+ print(f"Creating PDF: {cover_letter_pdf_path}")
+ c = canvas.Canvas(cover_letter_pdf_path, pagesize=letter)
+ c.drawString(100, 750, "Cover Letter Example")
+ c.drawString(100, 700, "Dear Hiring Manager,")
+ c.drawString(100, 650, "I am writing to apply for the position.")
+ c.drawString(100, 600, "Contact: applicant@example.com")
+ c.drawString(100, 550, "Phone: (555) 987-6543")
+ c.drawString(100, 500, "Address: 456 Job Street, Employment City, EC 54321")
+ c.drawString(100, 450, "Sincerely,")
+ c.drawString(100, 400, "John Applicant")
+ c.save()
+
+ print(f"Created dummy PDF: {cover_letter_pdf_path}")
+
+ except Exception as e:
+ print(f"Could not create PDFs with ReportLab ({e}); creating placeholder text files instead")
+ # Create simple text files instead
+ with open(
+ "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
+ "w",
+ ) as f:
+ f.write("This is a dummy PDF file for testing")
+
+ with open(
+ "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
+ "w",
+ ) as f:
+ f.write("This is a dummy Partnership Agreement PDF file for testing")
+
+ with open(
+ "example_data/graduate-job-example-cover-letter.pdf",
+ "w",
+ ) as f:
+ f.write("This is a dummy cover letter PDF file for testing")
+
+ print("Created dummy text files instead of PDFs")
+
+
+def create_dummy_csv():
+ """Create dummy CSV files for testing."""
+ # Main CSV
+ csv_data = {
+ "Case Note": [
+ "Client visited for consultation regarding housing issues",
+ "Follow-up appointment scheduled for next week",
+ "Documentation submitted for review",
+ ],
+ "Client": ["John Smith", "Jane Doe", "Bob Johnson"],
+ "Date": ["2024-01-15", "2024-01-16", "2024-01-17"],
+ }
+ df = pd.DataFrame(csv_data)
+ df.to_csv("example_data/combined_case_notes.csv", index=False)
+ print("Created dummy CSV: example_data/combined_case_notes.csv")
+
+ # Lambeth CSV
+ lambeth_data = {
+ "text": [
+ "Lambeth 2030 vision document content",
+ "Our Future Our Lambeth strategic plan",
+ "Community engagement and development",
+ ],
+ "page": [1, 2, 3],
+ }
+ df_lambeth = pd.DataFrame(lambeth_data)
+ df_lambeth.to_csv(
+ "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv", index=False
+ )
+ print("Created dummy CSV: example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv")
+
+
+def create_dummy_word_doc():
+ """Create dummy Word document."""
+ try:
+ from docx import Document
+
+ doc = Document()
+ doc.add_heading("Test Document for Redaction", 0)
+ doc.add_paragraph("This is a test document for redaction testing.")
+ doc.add_paragraph("Contact Information:")
+ doc.add_paragraph("Email: test@example.com")
+ doc.add_paragraph("Phone: 123-456-7890")
+ doc.add_paragraph("Name: John Doe")
+ doc.add_paragraph("Address: 123 Test Street, Test City, TC 12345")
+
+ doc.save("example_data/Bold minimalist professional cover letter.docx")
+ print("Created dummy Word document")
+
+ except ImportError:
+ print("python-docx not available, skipping Word document creation")
+
+
+def create_allow_deny_lists():
+ """Create dummy allow/deny lists."""
+ # Allow lists
+ allow_data = {"word": ["test", "example", "document"]}
+ pd.DataFrame(allow_data).to_csv(
+ "example_data/test_allow_list_graduate.csv", index=False
+ )
+ pd.DataFrame(allow_data).to_csv(
+ "example_data/test_allow_list_partnership.csv", index=False
+ )
+ print("Created allow lists")
+
+ # Deny lists
+ deny_data = {"word": ["sensitive", "confidential", "private"]}
+ pd.DataFrame(deny_data).to_csv(
+ "example_data/partnership_toolkit_redact_custom_deny_list.csv", index=False
+ )
+ pd.DataFrame(deny_data).to_csv(
+ "example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
+ index=False,
+ )
+ print("Created deny lists")
+
+ # Whole page redaction list
+ page_data = {"page": [1, 2]}
+ pd.DataFrame(page_data).to_csv(
+ "example_data/partnership_toolkit_redact_some_pages.csv", index=False
+ )
+ print("Created whole page redaction list")
+
+
+def create_ocr_output():
+ """Create dummy OCR output CSV."""
+ ocr_data = {
+ "page": [1, 2, 3],
+ "text": [
+ "This is page 1 content with some text",
+ "This is page 2 content with different text",
+ "This is page 3 content with more text",
+ ],
+ "left": [0.1, 0.3, 0.5],
+ "top": [0.95, 0.92, 0.88],
+ "width": [0.05, 0.02, 0.02],
+ "height": [0.01, 0.02, 0.02],
+ "line": [1, 2, 3],
+ }
+ df = pd.DataFrame(ocr_data)
+ df.to_csv(
+ "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
+ index=False,
+ )
+ print("Created dummy OCR output CSV")
+
+
+def create_dummy_image():
+ """Create dummy image for testing."""
+ try:
+ from PIL import Image, ImageDraw, ImageFont
+
+ img = Image.new("RGB", (800, 600), color="white")
+ draw = ImageDraw.Draw(img)
+
+ # Try to use a system font
+ try:
+ font = ImageFont.truetype(
+ "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20
+ )
+ except Exception as e:
+ print(f"Error loading DejaVuSans font: {e}")
+ try:
+ font = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 20)
+ except Exception as e:
+ print(f"Error loading Arial font: {e}")
+ font = ImageFont.load_default()
+
+ # Add text to image
+ draw.text((50, 50), "Test Document for Redaction", fill="black", font=font)
+ draw.text((50, 100), "Email: test@example.com", fill="black", font=font)
+ draw.text((50, 150), "Phone: 123-456-7890", fill="black", font=font)
+ draw.text((50, 200), "Name: John Doe", fill="black", font=font)
+ draw.text((50, 250), "Address: 123 Test Street", fill="black", font=font)
+
+ img.save("example_data/example_complaint_letter.jpg")
+ print("Created dummy image")
+
+ except ImportError:
+ print("PIL not available, skipping image creation")
+
+
+def main():
+ """Main setup function."""
+ print("Setting up test data for GitHub Actions...")
+ print(f"Current working directory: {os.getcwd()}")
+ print(f"Python version: {sys.version}")
+
+ create_directories()
+ create_dummy_pdf()
+ create_dummy_csv()
+ create_dummy_word_doc()
+ create_allow_deny_lists()
+ create_ocr_output()
+ create_dummy_image()
+
+ print("\nTest data setup complete!")
+ print("Created files:")
+ for root, dirs, files in os.walk("example_data"):
+ for file in files:
+ file_path = os.path.join(root, file)
+ print(f" {file_path}")
+ # Verify the file exists and has content
+ if os.path.exists(file_path):
+ file_size = os.path.getsize(file_path)
+ print(f" Size: {file_size} bytes")
+ else:
+ print(" WARNING: File does not exist!")
+
+ # Verify critical files exist
+ critical_files = [
+ "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
+ "example_data/graduate-job-example-cover-letter.pdf",
+ "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
+ ]
+
+ print("\nVerifying critical test files:")
+ for file_path in critical_files:
+ if os.path.exists(file_path):
+ file_size = os.path.getsize(file_path)
+ print(f"✅ {file_path} exists ({file_size} bytes)")
+ else:
+ print(f"❌ {file_path} MISSING!")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/.github/workflow_README.md b/.github/workflow_README.md
new file mode 100644
index 0000000000000000000000000000000000000000..19582f83810ccae7513bd8ef9a5d2b517b5c56ee
--- /dev/null
+++ b/.github/workflow_README.md
@@ -0,0 +1,183 @@
+# GitHub Actions CI/CD Setup
+
+This directory contains GitHub Actions workflows for automated testing of the CLI redaction application.
+
+## Workflows Overview
+
+### 1. **Simple Test Run** (`.github/workflows/simple-test.yml`)
+- **Purpose**: Basic test execution
+- **Triggers**: Pushes and pull requests to `dev`
+- **OS**: Ubuntu Latest
+- **Python**: 3.12
+- **Features**:
+ - Installs system dependencies
+ - Sets up test data
+ - Runs CLI tests
+ - Runs pytest
+
+### 2. **Comprehensive CI/CD** (`.github/workflows/ci.yml`)
+- **Purpose**: Full CI/CD pipeline
+- **Features**:
+ - Linting (Ruff, Black)
+ - Unit tests (Python 3.11, 3.12, 3.13)
+ - Integration tests
+ - Security scanning (Safety, Bandit)
+ - Coverage reporting
+ - Package building (on main branch)
+
+### 3. **Multi-OS Testing** (`.github/workflows/archive_workflows/multi-os-test.yml`)
+- **Purpose**: Cross-platform testing
+- **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
+- **Python**: 3.11, 3.12, 3.13
+- **Features**: Tests compatibility across different operating systems
+
+### 4. **Basic Test Suite** (`.github/workflows/test.yml`)
+- **Purpose**: Original test workflow
+- **Features**:
+ - Multiple Python versions
+ - System dependency installation
+ - Test data creation
+ - Coverage reporting
+
+## Setup Scripts
+
+### Test Data Setup (`.github/scripts/setup_test_data.py`)
+Creates dummy test files when example data is not available:
+- PDF documents
+- CSV files
+- Word documents
+- Images
+- Allow/deny lists
+- OCR output files
+
+## Usage
+
+### Running Tests Locally
+
+```bash
+# Install dependencies
+pip install -r requirements.txt
+pip install pytest pytest-cov
+
+# Setup test data
+python .github/scripts/setup_test_data.py
+
+# Run tests
+cd test
+python test.py
+```
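+
+To generate a coverage report locally using the repository's `.coveragerc` (mirroring the CI invocation), run:
+
+```bash
+pip install pytest-cov
+pytest test/test.py --cov=. --cov-config=.coveragerc --cov-report=term --cov-report=html
+```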
+
+### GitHub Actions Triggers
+
+1. **Push to main/dev**: Runs all tests
+2. **Pull Request**: Runs tests and linting
+3. **Daily Schedule**: Can run tests daily at 2 AM UTC (the cron trigger in `ci.yml` is currently commented out)
+4. **Manual Trigger**: Can be triggered manually from GitHub
+
+## Configuration
+
+### Environment Variables
+- `PYTHON_VERSION`: Default Python version (3.11)
+- `PYTHONPATH`: Set automatically for test discovery
+
+### Caching
+- Pip dependencies are cached for faster builds
+- Cache key based on requirements.txt hash
+
+### Artifacts
+- Test results (JUnit XML)
+- Coverage reports (HTML, XML)
+- Security reports
+- Build artifacts (on main branch)
+
+## Test Data
+
+The workflows automatically create test data when example files are missing:
+
+### Required Files Created:
+- `example_data/example_of_emails_sent_to_a_professor_before_applying.pdf`
+- `example_data/combined_case_notes.csv`
+- `example_data/Bold minimalist professional cover letter.docx`
+- `example_data/example_complaint_letter.jpg`
+- `example_data/test_allow_list_*.csv`
+- `example_data/partnership_toolkit_redact_*.csv`
+- `example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv`
+
+### Dependencies Installed:
+- **System**: tesseract-ocr, poppler-utils, OpenGL libraries
+- **Python**: All requirements.txt packages + pytest, reportlab, pillow
+
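+To mirror the Ubuntu runners locally, the same system packages can be installed with apt (package names taken from the workflow files):
+
+```bash
+sudo apt-get update
+sudo apt-get install -y tesseract-ocr tesseract-ocr-eng poppler-utils libgl1-mesa-dri libglib2.0-0
+```
+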
+## Workflow Status
+
+### Success Criteria:
+- ✅ All tests pass
+- ✅ No linting errors
+- ✅ Security checks pass
+- ✅ Coverage meets threshold (if configured)
+
+### Failure Handling:
+- Tests are designed to skip gracefully if files are missing
+- AWS tests are expected to fail without credentials
+- System dependency failures are handled with fallbacks
+
+## Customization
+
+### Adding New Tests:
+1. Add test methods to `test/test.py`
+2. Update test data in `setup_test_data.py` if needed
+3. Tests will automatically run in all workflows
+
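+While writing a new test it can be quicker to run it on its own; a sketch of that, with the `-k` expression standing in for your test's name, is:
+
+```bash
+# "my_new_test_name" is a placeholder - replace it with your test method's name
+pytest test/test.py -k "my_new_test_name" -v
+```
+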
+### Modifying Workflows:
+1. Edit the appropriate `.yml` file
+2. Test locally first
+3. Push to trigger the workflow
+
+### Environment-Specific Settings:
+- **Ubuntu**: Full system dependencies
+- **Windows**: Python packages only
+- **macOS**: Homebrew dependencies
+
+## Troubleshooting
+
+### Common Issues:
+
+1. **Missing Dependencies**:
+ - Check system dependency installation
+ - Verify Python package versions
+
+2. **Test Failures**:
+ - Check test data creation
+ - Verify file paths
+ - Review test output logs
+
+3. **AWS Test Failures**:
+ - Expected without credentials
+ - Tests are designed to handle this gracefully
+
+4. **System Dependency Issues**:
+ - Different operating systems have different requirements
+ - Check the specific OS section in workflows
+
+### Debug Mode:
+Add `--verbose` or `-v` flags to pytest commands for more detailed output.
+
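+For example, a more verbose run with full tracebacks might look like:
+
+```bash
+pytest test/test.py -vv --tb=long
+```
+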
+## Security
+
+- Dependencies are scanned with Safety
+- Code is scanned with Bandit
+- No secrets are exposed in logs
+- Test data is temporary and cleaned up
+
+## Performance
+
+- Tests run in parallel where possible
+- Dependencies are cached
+- Only necessary system packages are installed
+- Test data is created efficiently
+
+## Monitoring
+
+- Workflow status is visible in GitHub Actions tab
+- Coverage reports are uploaded to Codecov
+- Test results are available as artifacts
+- Security reports are generated and stored
diff --git a/.github/workflows/archive_workflows/multi-os-test.yml b/.github/workflows/archive_workflows/multi-os-test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..de332621c498db6694aebbbe915f2cb96b38e136
--- /dev/null
+++ b/.github/workflows/archive_workflows/multi-os-test.yml
@@ -0,0 +1,109 @@
+name: Multi-OS Test
+
+on:
+ push:
+ branches: [ main ]
+ pull_request:
+ branches: [ main ]
+
+permissions:
+ contents: read
+ actions: read
+
+jobs:
+ test:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest, macos-latest] # windows-latest, not included as tesseract cannot be installed silently
+ python-version: ["3.11", "3.12", "3.13"]
+ exclude:
+ # Exclude some combinations to reduce CI time
+ - os: macos-latest
+ python-version: "3.12"
+ - os: macos-latest
+ python-version: "3.13"
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install system dependencies (Ubuntu)
+ if: matrix.os == 'ubuntu-latest'
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y \
+ tesseract-ocr \
+ tesseract-ocr-eng \
+ poppler-utils \
+ libgl1-mesa-dri \
+ libglib2.0-0
+
+ - name: Install system dependencies (macOS)
+ if: matrix.os == 'macos-latest'
+ run: |
+ brew install tesseract poppler
+
+ - name: Install system dependencies (Windows)
+ if: matrix.os == 'windows-latest'
+ run: |
+ # Create tools directory
+ if (!(Test-Path "C:\tools")) {
+ mkdir C:\tools
+ }
+
+ # Download and install Tesseract
+ $tesseractUrl = "https://github.com/tesseract-ocr/tesseract/releases/download/5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe"
+ $tesseractInstaller = "C:\tools\tesseract-installer.exe"
+ Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
+
+ # Install Tesseract silently
+ Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
+
+ # Download and extract Poppler
+ $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.07.0-0/Release-25.07.0-0.zip"
+ $popplerZip = "C:\tools\poppler.zip"
+ Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
+
+ # Extract Poppler
+ Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
+
+ # Add to PATH
+ echo "C:\tools\tesseract" >> $env:GITHUB_PATH
+ echo "C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_PATH
+
+ # Set environment variables for your application
+ echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
+ echo "POPPLER_FOLDER=C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_ENV
+ echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
+
+ # Verify installation using full paths (since PATH won't be updated in current session)
+ & "C:\tools\tesseract\tesseract.exe" --version
+ & "C:\tools\poppler\poppler-25.07.0\Library\bin\pdftoppm.exe" -v
+
+ - name: Install Python dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+ pip install pytest pytest-cov reportlab pillow
+
+ - name: Download spaCy model
+ run: |
+ python -m spacy download en_core_web_lg
+
+ - name: Setup test data
+ run: |
+ python .github/scripts/setup_test_data.py
+
+ - name: Run CLI tests
+ run: |
+ cd test
+ python test.py
+
+ - name: Run tests with pytest
+ run: |
+ pytest test/test.py -v --tb=short
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..dc4743c0118c4651dd836a65bb45c52cef9d8881
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,260 @@
+name: CI/CD Pipeline
+
+on:
+ push:
+ branches: [ main ]
+ pull_request:
+ branches: [ main ]
+ #schedule:
+ # Run tests daily at 2 AM UTC
+ # - cron: '0 2 * * *'
+
+permissions:
+ contents: read
+ actions: read
+ pull-requests: write
+ issues: write
+
+env:
+ PYTHON_VERSION: "3.11"
+
+jobs:
+ lint:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ env.PYTHON_VERSION }}
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install ruff black
+
+ - name: Run Ruff linter
+ run: ruff check .
+
+ - name: Run Black formatter check
+ run: black --check .
+
+ test-unit:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: [3.11, 3.12, 3.13]
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Cache pip dependencies
+ uses: actions/cache@v5
+ with:
+ path: ~/.cache/pip
+ key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
+ restore-keys: |
+ ${{ runner.os }}-pip-
+
+ - name: Install system dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y \
+ tesseract-ocr \
+ tesseract-ocr-eng \
+ poppler-utils \
+ libgl1-mesa-dri \
+ libglib2.0-0 \
+ libsm6 \
+ libxext6 \
+ libxrender-dev \
+ libgomp1
+
+ - name: Install Python dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements_lightweight.txt
+ pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
+
+ - name: Download spaCy model
+ run: |
+ python -m spacy download en_core_web_lg
+
+ - name: Setup test data
+ run: |
+ python .github/scripts/setup_test_data.py
+ echo "Setup script completed. Checking results:"
+ ls -la example_data/ || echo "example_data directory not found"
+
+ - name: Verify test data files
+ run: |
+ echo "Checking if critical test files exist:"
+ ls -la example_data/
+ echo "Checking for specific PDF files:"
+ ls -la example_data/*.pdf || echo "No PDF files found"
+ echo "Checking file sizes:"
+ find example_data -name "*.pdf" -exec ls -lh {} \;
+
+ - name: Clean up problematic config files
+ run: |
+ rm -f config*.py || true
+
+ - name: Run CLI tests
+ run: |
+ cd test
+ python test.py
+
+ - name: Run tests with pytest
+ run: |
+ pytest test/test.py -v --tb=short --junitxml=test-results.xml
+
+ - name: Run tests with coverage
+ run: |
+ pytest test/test.py --cov=. --cov-config=.coveragerc --cov-report=xml --cov-report=html --cov-report=term
+
+ #- name: Upload coverage to Codecov - not necessary
+ # uses: codecov/codecov-action@v3
+ # if: matrix.python-version == '3.11'
+ # with:
+ # file: ./coverage.xml
+ # flags: unittests
+ # name: codecov-umbrella
+ # fail_ci_if_error: false
+
+ - name: Upload test results
+ uses: actions/upload-artifact@v6
+ if: always()
+ with:
+ name: test-results-python-${{ matrix.python-version }}
+ path: |
+ test-results.xml
+ htmlcov/
+ coverage.xml
+
+ test-integration:
+ runs-on: ubuntu-latest
+ needs: [lint, test-unit]
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ env.PYTHON_VERSION }}
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements_lightweight.txt
+ pip install pytest pytest-cov reportlab pillow
+
+ - name: Install system dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y \
+ tesseract-ocr \
+ tesseract-ocr-eng \
+ poppler-utils \
+ libgl1-mesa-dri \
+ libglib2.0-0
+
+ - name: Download spaCy model
+ run: |
+ python -m spacy download en_core_web_lg
+
+ - name: Setup test data
+ run: |
+ python .github/scripts/setup_test_data.py
+ echo "Setup script completed. Checking results:"
+ ls -la example_data/ || echo "example_data directory not found"
+
+ - name: Verify test data files
+ run: |
+ echo "Checking if critical test files exist:"
+ ls -la example_data/
+ echo "Checking for specific PDF files:"
+ ls -la example_data/*.pdf || echo "No PDF files found"
+ echo "Checking file sizes:"
+ find example_data -name "*.pdf" -exec ls -lh {} \;
+
+ - name: Run integration tests
+ run: |
+ cd test
+ python demo_single_test.py
+
+ - name: Test CLI help
+ run: |
+ python cli_redact.py --help
+
+ - name: Test CLI version
+ run: |
+ python -c "import sys; print(f'Python {sys.version}')"
+
+ security:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ env.PYTHON_VERSION }}
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install safety bandit
+
+ #- name: Run safety scan - removed as now requires login
+ # run: |
+ # safety scan -r requirements.txt
+
+ - name: Run bandit security check
+ run: |
+ bandit -r . -f json -o bandit-report.json || true
+
+ - name: Upload security report
+ uses: actions/upload-artifact@v6
+ if: always()
+ with:
+ name: security-report
+ path: bandit-report.json
+
+ build:
+ runs-on: ubuntu-latest
+ needs: [lint, test-unit]
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ env.PYTHON_VERSION }}
+
+ - name: Install build dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install build twine
+
+ - name: Build package
+ run: |
+ python -m build
+
+ - name: Check package
+ run: |
+ twine check dist/*
+
+ - name: Upload build artifacts
+ uses: actions/upload-artifact@v6
+ with:
+ name: dist
+ path: dist/
diff --git a/.github/workflows/simple-test.yml b/.github/workflows/simple-test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6a4d430a18d030fb8a7f5491963536253422143c
--- /dev/null
+++ b/.github/workflows/simple-test.yml
@@ -0,0 +1,67 @@
+name: Simple Test Run
+
+on:
+ push:
+ branches: [ dev ]
+ pull_request:
+ branches: [ dev ]
+
+permissions:
+ contents: read
+ actions: read
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python 3.12
+ uses: actions/setup-python@v6
+ with:
+ python-version: "3.12"
+
+ - name: Install system dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y \
+ tesseract-ocr \
+ tesseract-ocr-eng \
+ poppler-utils \
+ libgl1-mesa-dri \
+ libglib2.0-0
+
+ - name: Install Python dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements_lightweight.txt
+ pip install pytest pytest-cov reportlab pillow
+
+ - name: Download spaCy model
+ run: |
+ python -m spacy download en_core_web_lg
+
+ - name: Setup test data
+ run: |
+ python .github/scripts/setup_test_data.py
+ echo "Setup script completed. Checking results:"
+ ls -la example_data/ || echo "example_data directory not found"
+
+ - name: Verify test data files
+ run: |
+ echo "Checking if critical test files exist:"
+ ls -la example_data/
+ echo "Checking for specific PDF files:"
+ ls -la example_data/*.pdf || echo "No PDF files found"
+ echo "Checking file sizes:"
+ find example_data -name "*.pdf" -exec ls -lh {} \;
+
+ - name: Run CLI tests
+ run: |
+ cd test
+ python test.py
+
+ - name: Run tests with pytest
+ run: |
+ pytest test/test.py -v --tb=short
diff --git a/.github/workflows/sync_to_hf.yml b/.github/workflows/sync_to_hf.yml
new file mode 100644
index 0000000000000000000000000000000000000000..12624f979f7fa9734c9c5824a7cd535b143e0ea8
--- /dev/null
+++ b/.github/workflows/sync_to_hf.yml
@@ -0,0 +1,53 @@
+name: Sync to Hugging Face hub
+on:
+ push:
+ branches: [main]
+
+permissions:
+ contents: read
+
+jobs:
+ sync-to-hub:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+ with:
+ fetch-depth: 1 # Only get the latest state
+ lfs: true # Download actual LFS files so they can be pushed
+
+ - name: Install Git LFS
+ run: git lfs install
+
+ - name: Recreate repo history (single-commit force push)
+ run: |
+ # 1. Capture the message BEFORE we delete the .git folder
+ COMMIT_MSG=$(git log -1 --pretty=%B)
+ echo "Syncing commit message: $COMMIT_MSG"
+
+ # 2. DELETE the .git folder.
+ # This turns the repo into a standard folder of files.
+ rm -rf .git
+
+ # 3. Re-initialize a brand new git repo
+ git init -b main
+ git config --global user.name "$HF_USERNAME"
+ git config --global user.email "$HF_EMAIL"
+
+ # 4. Re-install LFS (needs to be done after git init)
+ git lfs install
+
+ # 5. Add the remote
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID
+
+ # 6. Add all files
+ # Since this is a fresh init, Git sees EVERY file as "New"
+ git add .
+
+ # 7. Commit and Force Push
+ git commit -m "Sync: $COMMIT_MSG"
+ git push --force hf main
+ env:
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
+ HF_REPO_ID: ${{ secrets.HF_REPO_ID }}
\ No newline at end of file
diff --git a/.github/workflows/sync_to_hf_zero_gpu.yml b/.github/workflows/sync_to_hf_zero_gpu.yml
new file mode 100644
index 0000000000000000000000000000000000000000..662b836210a1295c5a8dda0be4d998e08eda6454
--- /dev/null
+++ b/.github/workflows/sync_to_hf_zero_gpu.yml
@@ -0,0 +1,53 @@
+name: Sync to Hugging Face hub Zero GPU
+on:
+ push:
+ branches: [dev]
+
+permissions:
+ contents: read
+
+jobs:
+ sync-to-hub-zero-gpu:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+ with:
+ fetch-depth: 1 # Only get the latest state
+ lfs: true # Download actual LFS files so they can be pushed
+
+ - name: Install Git LFS
+ run: git lfs install
+
+ - name: Recreate repo history (single-commit force push)
+ run: |
+ # 1. Capture the message BEFORE we delete the .git folder
+ COMMIT_MSG=$(git log -1 --pretty=%B)
+ echo "Syncing commit message: $COMMIT_MSG"
+
+ # 2. DELETE the .git folder.
+ # This turns the repo into a standard folder of files.
+ rm -rf .git
+
+ # 3. Re-initialize a brand new git repo
+ git init -b main
+ git config --global user.name "$HF_USERNAME"
+ git config --global user.email "$HF_EMAIL"
+
+ # 4. Re-install LFS (needs to be done after git init)
+ git lfs install
+
+ # 5. Add the remote
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID_ZERO_GPU
+
+ # 6. Add all files
+ # Since this is a fresh init, Git sees EVERY file as "New"
+ git add .
+
+ # 7. Commit and Force Push
+ git commit -m "Sync: $COMMIT_MSG"
+ git push --force hf main
+ env:
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
+ HF_REPO_ID_ZERO_GPU: ${{ secrets.HF_REPO_ID_ZERO_GPU }}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..496a491daefe5f3d15658cd2b0a38f78d57d096a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,45 @@
+*.url
+*.ipynb
+*.pyc
+.venv/*
+examples/*
+processing/*
+input/*
+output/*
+tools/__pycache__/*
+old_code/*
+tesseract/*
+poppler/*
+build/*
+dist/*
+build_deps/*
+logs/*
+usage/*
+feedback/*
+config/*
+user_guide/*
+cdk/config/*
+cdk/cdk.out/*
+cdk/archive/*
+tld/*
+tmp/*
+docs/*
+cdk.out/*
+cdk.json
+cdk.context.json
+.quarto/*
+/.quarto/
+/_site/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*
+model_cache/*
+sanitized_file/*
+src/doc_redaction.egg-info/*
+
+**/*.quarto_ipynb
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..a4992aeb35eea541eaaf01ad1f05d67483f8d418
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,222 @@
+# Stage 1: Build dependencies and download models
+FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS builder
+
+# Install system dependencies
+RUN apt-get update \
+ && apt-get upgrade -y \
+ && apt-get install -y --no-install-recommends \
+ g++ \
+ make \
+ cmake \
+ unzip \
+ libcurl4-openssl-dev \
+ git \
+ && pip install --upgrade pip \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+
+COPY requirements_lightweight.txt .
+
+RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
+
+# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. Note that GPU-enabled PaddleOCR is unlikely to work in the same environment as a GPU-enabled version of PyTorch, so it is recommended to install PaddleOCR as a CPU-only version if you want to use GPU-enabled PyTorch.
+
+ARG INSTALL_PADDLEOCR=False
+ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
+
+ARG PADDLE_GPU_ENABLED=False
+ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
+
+RUN if [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "False" ]; then \
+ pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
+ pip install --verbose --no-cache-dir --target=/install "paddlepaddle<=3.2.1" && \
+ pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
+elif [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
+ pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
+ pip install --verbose --no-cache-dir --target=/install "paddlepaddle-gpu<=3.2.1" --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ && \
+ pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
+fi
+
+ARG INSTALL_VLM=False
+ENV INSTALL_VLM=${INSTALL_VLM}
+
+ARG TORCH_GPU_ENABLED=False
+ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
+
+# Optionally install VLM/LLM packages if the INSTALL_VLM environment variable is set to True.
+RUN if [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "False" ]; then \
+ pip install --verbose --no-cache-dir --target=/install \
+ "torch==2.9.1+cpu" \
+ "torchvision==0.24.1+cpu" \
+ "transformers<=5.30.0" \
+ "accelerate<=1.13.0" \
+ "bitsandbytes<=0.49.2" \
+ "sentencepiece<=0.2.1" \
+ --extra-index-url https://download.pytorch.org/whl/cpu; \
+elif [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "True" ]; then \
+ pip install --verbose --no-cache-dir --target=/install "torch<=2.8.0" --index-url https://download.pytorch.org/whl/cu129 && \
+ pip install --verbose --no-cache-dir --target=/install "torchvision<=0.23.0" --index-url https://download.pytorch.org/whl/cu129 && \
+ pip install --verbose --no-cache-dir --target=/install \
+ "transformers<=5.30.0" \
+ "accelerate<=1.13.0" \
+ "bitsandbytes<=0.49.2" \
+ "sentencepiece<=0.2.1" && \
+ pip install --verbose --no-cache-dir --target=/install "optimum<=2.1.0" && \
+ pip install --verbose --no-cache-dir --target=/install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl && \
+ pip install --verbose --no-cache-dir --target=/install https://github.com/ModelCloud/GPTQModel/releases/download/v5.8.0/gptqmodel-5.8.0+cu128torch2.8-cp312-cp312-linux_x86_64.whl; \
+fi
+
+# ===================================================================
+# Stage 2: A common base for both Lambda and Gradio
+# ===================================================================
+FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS base
+
+# MUST re-declare ARGs in every stage where they are used in RUN commands
+ARG TORCH_GPU_ENABLED=False
+ARG PADDLE_GPU_ENABLED=False
+
+ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
+ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ tesseract-ocr \
+ poppler-utils \
+ libgl1 \
+ libglib2.0-0 && \
+ if [ "$TORCH_GPU_ENABLED" = "True" ] || [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
+ apt-get install -y --no-install-recommends libgomp1; \
+ fi && \
+ apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ENV APP_HOME=/home/user
+
+# Set env variables for Gradio & other apps
+ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
+ TLDEXTRACT_CACHE=/tmp/tld/ \
+ MPLCONFIGDIR=/tmp/matplotlib_cache/ \
+ GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
+ GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
+ FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
+ ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
+ USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
+ CONFIG_FOLDER=$APP_HOME/app/config/ \
+ XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
+ TESSERACT_DATA_FOLDER=/usr/share/tessdata \
+ GRADIO_SERVER_NAME=0.0.0.0 \
+ GRADIO_SERVER_PORT=7860 \
+ PATH=$APP_HOME/.local/bin:$PATH \
+ PYTHONPATH=$APP_HOME/app \
+ PYTHONUNBUFFERED=1 \
+ PYTHONDONTWRITEBYTECODE=1 \
+ GRADIO_ALLOW_FLAGGING=never \
+ GRADIO_NUM_PORTS=1 \
+ GRADIO_ANALYTICS_ENABLED=False
+
+# Copy Python packages from the builder stage
+COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
+COPY --from=builder /install/bin /usr/local/bin/
+
+# Reinstall protobuf into the final site-packages. Builder uses multiple `pip install --target=/install`
+# passes; that can break the `google` namespace so `google.protobuf` is missing and Paddle fails at import.
+RUN pip install --no-cache-dir "protobuf<=7.34.0"
+
+# Copy your application code and entrypoint
+COPY . ${APP_HOME}/app
+COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
+# Fix line endings and set execute permissions
+RUN sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
+ && chmod +x ${APP_HOME}/app/entrypoint.sh
+
+WORKDIR ${APP_HOME}/app
+
+# ===================================================================
+# FINAL Stage 3: The Lambda Image (runs as root for simplicity)
+# ===================================================================
+FROM base AS lambda
+# Set runtime ENV for Lambda mode
+ENV APP_MODE=lambda
+ENTRYPOINT ["/home/user/app/entrypoint.sh"]
+CMD ["lambda_entrypoint.lambda_handler"]
+
+# ===================================================================
+# FINAL Stage 4: The Gradio Image (runs as a secure, non-root user)
+# ===================================================================
+FROM base AS gradio
+# Set runtime ENV for Gradio mode
+ENV APP_MODE=gradio
+
+# Create non-root user
+RUN useradd -m -u 1000 user
+
+# Create the base application directory and set its ownership
+RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
+
+# Create required sub-folders within the app directory and set their permissions
+# This ensures these specific directories are owned by 'user'
+RUN mkdir -p \
+ ${APP_HOME}/app/output \
+ ${APP_HOME}/app/input \
+ ${APP_HOME}/app/logs \
+ ${APP_HOME}/app/usage \
+ ${APP_HOME}/app/feedback \
+ ${APP_HOME}/app/config \
+ && chown user:user \
+ ${APP_HOME}/app/output \
+ ${APP_HOME}/app/input \
+ ${APP_HOME}/app/logs \
+ ${APP_HOME}/app/usage \
+ ${APP_HOME}/app/feedback \
+ ${APP_HOME}/app/config \
+ && chmod 755 \
+ ${APP_HOME}/app/output \
+ ${APP_HOME}/app/input \
+ ${APP_HOME}/app/logs \
+ ${APP_HOME}/app/usage \
+ ${APP_HOME}/app/feedback \
+ ${APP_HOME}/app/config
+
+# Now handle the /tmp and /var/tmp directories and their subdirectories, plus the Paddle, spaCy and tessdata cache folders
+RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
+ && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
+ && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
+ && chmod 700 ${XDG_CACHE_HOME} \
+ && mkdir -p ${APP_HOME}/.paddlex \
+ && chown user:user ${APP_HOME}/.paddlex \
+ && chmod 755 ${APP_HOME}/.paddlex \
+ && mkdir -p ${APP_HOME}/.local/share/spacy/data \
+ && chown user:user ${APP_HOME}/.local/share/spacy/data \
+ && chmod 755 ${APP_HOME}/.local/share/spacy/data \
+ && mkdir -p /usr/share/tessdata \
+ && chown user:user /usr/share/tessdata \
+ && chmod 755 /usr/share/tessdata
+
+# Apply user ownership to all files in the home directory
+RUN chown -R user:user /home/user
+
+# Set permissions for Python executable
+RUN chmod 755 /usr/local/bin/python
+
+# Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
+VOLUME ["/tmp/matplotlib_cache"]
+VOLUME ["/tmp/gradio_tmp"]
+VOLUME ["/tmp/tld"]
+VOLUME ["/home/user/app/output"]
+VOLUME ["/home/user/app/input"]
+VOLUME ["/home/user/app/logs"]
+VOLUME ["/home/user/app/usage"]
+VOLUME ["/home/user/app/feedback"]
+VOLUME ["/home/user/app/config"]
+VOLUME ["/home/user/.paddlex"]
+VOLUME ["/home/user/.local/share/spacy/data"]
+VOLUME ["/usr/share/tessdata"]
+VOLUME ["/tmp"]
+VOLUME ["/var/tmp"]
+
+USER user
+
+EXPOSE $GRADIO_SERVER_PORT
+
+ENTRYPOINT ["/home/user/app/entrypoint.sh"]
+CMD ["python", "app.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..92103d802e441da25cbc509be6015e053b3f5300
--- /dev/null
+++ b/README.md
@@ -0,0 +1,1471 @@
+---
+title: Document redaction
+emoji: 📝
+colorFrom: blue
+colorTo: yellow
+sdk: docker
+app_file: app.py
+pinned: true
+license: agpl-3.0
+short_description: OCR / redact PDF documents and tabular data
+---
+# Document redaction
+
+version: 2.0.1
+
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](#user-guide) for a full walkthrough of all the features in the app.
+
+To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions below).
+
+For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
+
+Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include in or exclude from redaction, fuzzy matching, language settings, and whole-page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
+
+NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
+
+---
+
+## 🚀 Quick Start - Installation and first run
+
+Follow these instructions to get the document redaction application running on your local machine.
+
+### 1. Prerequisites: System Dependencies
+
+This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding.
+
+---
+
+
+#### **On Windows**
+
+Installation on Windows requires downloading installers and adding the programs to your system's PATH.
+
+1. **Install Tesseract OCR:**
+ * Download the installer from the official [Tesseract at UB Mannheim](https://github.com/UB-Mannheim/tesseract/wiki) page (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
+ * Run the installer.
+ * **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
+
+
+2. **Install Poppler:**
+ * Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
+ * Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
+ * You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
+ * Search for "Edit the system environment variables" in the Windows Start Menu and open it.
+ * Click the "Environment Variables..." button.
+ * In the "System variables" section, find and select the `Path` variable, then click "Edit...".
+ * Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
+ * Click OK on all windows to save the changes.
+
+ To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
+
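+For convenience, the same verification as a copy-pasteable block:
+
+```
+tesseract --version
+pdftoppm -v
+```
+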
+---
+
+#### **On Linux (Debian/Ubuntu)**
+
+Open your terminal and run the following command to install Tesseract and Poppler:
+
+```bash
+sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
+```
+
+#### **On Linux (Fedora/CentOS/RHEL)**
+
+Open your terminal and use the `dnf` or `yum` package manager:
+
+```bash
+sudo dnf install -y tesseract poppler-utils
+```
+---
+
+
+### 2. Installation: Code and Python Packages
+
+Once the system prerequisites are installed, you can set up the Python environment.
+
+#### Step 1: Clone the Repository
+
+Open your terminal or Git Bash and clone this repository:
+```bash
+git clone https://github.com/seanpedrick-case/doc_redaction.git
+cd doc_redaction
+```
+
+#### Step 2: Create and Activate a Virtual Environment (Recommended)
+
+It is highly recommended to use a virtual environment to isolate project dependencies and avoid conflicts with other Python projects.
+
+```bash
+# Create the virtual environment
+python -m venv venv
+
+# Activate it
+# On Windows:
+.\venv\Scripts\activate
+
+# On macOS/Linux:
+source venv/bin/activate
+```
+
+#### Step 3: Install Python Dependencies
+
+##### Lightweight version (without PaddleOCR and VLM support)
+
+This project uses `pyproject.toml` to manage dependencies. You can install everything with a single pip command. This process will also download the required spaCy models and other packages directly from their URLs.
+
+```bash
+pip install .
+```
+
+Alternatively, you can install from the `requirements_lightweight.txt` file:
+```bash
+pip install -r requirements_lightweight.txt
+```
+
+##### Full version (with Paddle and VLM support)
+
+Run the following command to install the additional dependencies:
+
+```bash
+pip install .[paddle,vlm]
+```
+
+Alternatively, you can use the full `requirements.txt` file, which contains references to the PaddleOCR and related Torch/Transformers dependencies (for CUDA 12.9):
+```bash
+pip install -r requirements.txt
+```
+
+Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
+```bash
+pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
+```
+
+**Note:** It is difficult to get paddlepaddle-gpu working in the same environment as Torch. You may well need to reinstall the CPU version to ensure compatibility, and run paddlepaddle-gpu in a separate environment without Torch installed. If you get errors related to .dll files following the Paddle GPU install, you may need to install the latest C++ redistributables. For Windows, you can find them [here](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170).
+
+```bash
+pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129
+pip install torchvision --index-url https://download.pytorch.org/whl/cu129
+```
+
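+After installing the GPU builds, a quick way to check that Torch can actually see your GPU is:
+
+```bash
+python -c "import torch; print(torch.cuda.is_available())"
+```
+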
+#### Docker installation
+
+The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
+
+##### Without Llama.cpp / vLLM inference server
+
+If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can set the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED build arguments to include PaddleOCR and the Transformers packages needed for local VLM support. Note that GPU-enabled PaddleOCR and GPU-enabled Transformers/Torch often don't work well together, which is one reason why the Llama.cpp/vLLM inference server Docker installation option is provided below.
+
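+As a sketch, a CPU-only image with PaddleOCR enabled could be built and run along the following lines (the image tag and port mapping are arbitrary choices; the build arguments and the `gradio` target come from the Dockerfile):
+
+```bash
+docker build --build-arg INSTALL_PADDLEOCR=True --build-arg PADDLE_GPU_ENABLED=False --target gradio -t doc_redaction .
+docker run -p 7860:7860 doc_redaction
+```
+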
+##### With Llama.cpp / vLLM inference server
+
+The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
+
+For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
+
+You will need ~40-50GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
+
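+For example, assuming Docker Compose is installed, the Llama.cpp stack can typically be brought up with the command below, although you should check the comments at the top of the compose file for the exact commands suggested there:
+
+```bash
+docker compose -f docker-compose_llama.yml up --build
+```
+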
+### 3. Run the Application
+
+With all dependencies installed, you can now start the Gradio application.
+
+```bash
+python app.py
+```
+
+After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
+
+Open this URL in your web browser to use the document redaction tool.
+
+#### Command line interface
+
+If instead you want to run redactions or other app functions in CLI mode, run the following for instructions:
+
+```bash
+python cli_redact.py --help
+```
+
+---
+
+
+### 4. ⚙️ Configuration (Optional)
+
+You can customise the application's behaviour by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behaviour, or pointing to local Tesseract/Poppler installations. A full overview of all the settings you can modify in the app_config.env file can be seen in tools/config.py, with explanations on the documentation website for [the GitHub repo](https://seanpedrick-case.github.io/doc_redaction/).
+
+To get started:
+1. Locate the `example_config.env` file in the root of the project.
+2. Create a new file named `app_config.env` inside the `config/` directory (i.e., `config/app_config.env`).
+3. Copy the contents from `example_config.env` into your new `config/app_config.env` file.
+4. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
+
+If you do not create this file, the application will run with default settings.
+
+#### Configuration Breakdown
+
+Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
+
+---
+
+#### **Local & General Settings (No AWS Required)**
+
+These settings are useful for all users, regardless of whether you are using AWS.
+
+* `TESSERACT_FOLDER` / `POPPLER_FOLDER`
+ * Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
+ * Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
+ * **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
+
+* `SHOW_LANGUAGE_SELECTION=True`
+ * Set to `True` to display a language selection dropdown in the UI for OCR processing.
+
+* `DEFAULT_LOCAL_OCR_MODEL=tesseract`
+ * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid`. `tesseract` is the default and is recommended. The hybrid option is a combination of the two: a first pass is done with Tesseract, and then a second pass is done with PaddleOCR on words with low confidence. `paddle` will only return whole-line text extraction, and so will only work for OCR, not redaction.
+
+* `SESSION_OUTPUT_FOLDER=False`
+ * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
+
+* `DISPLAY_FILE_NAMES_IN_LOGS=False`
+ * For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
+
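+Putting the local settings together, a minimal `config/app_config.env` for a Windows machine with custom Tesseract/Poppler locations might look like the sketch below (the paths are illustrative only):
+
+```bash
+# config/app_config.env - illustrative local settings only
+TESSERACT_FOLDER=tesseract/
+POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/
+SHOW_LANGUAGE_SELECTION=True
+DEFAULT_LOCAL_OCR_MODEL=tesseract
+SESSION_OUTPUT_FOLDER=False
+DISPLAY_FILE_NAMES_IN_LOGS=False
+```
+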
+---
+
+#### **AWS-Specific Settings**
+
+These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
+
+* `RUN_AWS_FUNCTIONS=True`
+ * **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
+
+* **UI Options:**
+ * `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
+ * `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
+
+* **Core AWS Configuration:**
+ * `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
+ * `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
+
+* **AWS Logging:**
+ * `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
+ * `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
+
+* **Advanced AWS Textract Features:**
+ * `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
+ * `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
+ * `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
+
+* **Cost Tracking (for internal accounting):**
+ * `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
+ * `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
+ * `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
+ * `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
+
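+As a rough sketch, an AWS-enabled `config/app_config.env` combining the settings above might contain the following (the region, bucket, and cost code path are placeholders):
+
+```bash
+# config/app_config.env - illustrative AWS settings only
+RUN_AWS_FUNCTIONS=True
+AWS_REGION=us-east-1
+DOCUMENT_REDACTION_BUCKET=example-bucket
+SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
+SHOW_AWS_PII_DETECTION_OPTIONS=True
+SHOW_COSTS=True
+GET_COST_CODES=True
+COST_CODES_PATH=config/cost_codes.csv
+```
+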
+Now that you have the app installed, what follows is a guide to using it for basic and advanced redaction.
+
+# User guide
+
+## Table of contents
+
+### Getting Started
+- [Quickstart - Test the app with built-in examples](#quickstart---test-the-app-with-built-in-examples)
+ - [PDF document examples](#pdf-document-examples)
+ - [CSV/Excel file examples](#csvexcel-file-examples)
+- [Basic redaction](#basic-redaction)
+ - [Upload files to the app](#upload-files-to-the-app)
+ - [Text extraction](#text-extraction)
+ - [AWS Textract signature extraction](#aws-textract-signature-extraction)
+ - [PII redaction method](#pii-redaction-method)
+ - [Duplicate page redaction](#duplicate-page-redaction)
+ - [Allow list, deny list, and whole-page redaction](#allow-list-deny-list-and-whole-page-redaction)
+ - [Cost and time estimation](#cost-and-time-estimation)
+ - [Cost code selection](#cost-code-selection)
+ - [Redact only specific pages](#redact-only-specific-pages)
+ - [Run redaction](#run-redaction)
+ - [Redaction outputs](#redaction-outputs)
+- [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
+ - [Uploading documents for review](#uploading-documents-for-review)
+ - [Page navigation](#page-navigation)
+ - [Document viewer](#document-viewer)
+ - [Modify existing redactions](#modify-existing-redactions)
+ - [Search text and redact](#search-text-and-redact)
+ - [Navigating through the document using 'View text'](#navigating-through-the-document-using-view-text)
+ - [Apply revised redactions to PDF](#apply-revised-redactions-to-pdf)
+- [Loading in previous results to continue redaction](#loading-in-previous-results-to-continue-redaction)
+ - [Loading in previous results from the redactions_for_review.pdf file](#loading-in-previous-results-from-the-redactions_for_reviewpdf-file)
+ - [Loading in OCR results to search for new redactions](#loading-in-ocr-results-to-search-for-new-redactions)
+ - [Using a previous OCR results file to skip redoing OCR for future redaction tasks](#using-a-previous-ocr-results-file-to-skip-redoing-ocr-for-future-redaction-tasks)
+- [Document summarisation](#document-summarisation)
+- [Redacting Word, tabular data files (XLSX/CSV) or open text](#redacting-word-tabular-data-files-xlsxcsv-or-open-text)
+ - [Word or tabular data files (XLSX/CSV)](#word-or-tabular-data-files-xlsxcsv)
+ - [Choosing output anonymisation format](#choosing-output-anonymisation-format)
+ - [Redacting open text](#redacting-open-text)
+
+### Advanced user guide
+- [Identifying and redacting duplicate pages with custom settings](#identifying-and-redacting-duplicate-pages-with-custom-settings)
+ - [Duplicate page detection in documents](#duplicate-page-detection-in-documents)
+ - [Duplicate detection in tabular data](#duplicate-detection-in-tabular-data)
+- [Export redacted document files to Adobe Acrobat](#export-redacted-document-files-to-adobe-acrobat)
+ - [Using redactions_for_review.pdf files with Adobe Acrobat](#using-redactions_for_reviewpdf-files-with-adobe-acrobat)
+ - [Exporting comment files to Adobe Acrobat](#exporting-comment-files-to-adobe-acrobat)
+ - [Importing comment files from Adobe Acrobat](#importing-comment-files-from-adobe-acrobat)
+- [Submit documents to the AWS Textract API service for faster OCR](#submit-documents-to-the-aws-textract-api-service-for-faster-ocr)
+- [Advanced OCR settings - Efficient OCR, overwrite existing OCR](#advanced-ocr-settings---efficient-ocr-overwrite-existing-ocr)
+- [Modifying existing redaction review files](#modifying-existing-redaction-review-files)
+
+### Features for expert users/system administrators
+- [Using AWS Textract and Comprehend when not running in an AWS environment](#using-aws-textract-and-comprehend-when-not-running-in-an-aws-environment)
+- [Advanced OCR model options (including hybrid OCR)](#advanced-ocr-model-options)
+- [PII identification with LLMs](#pii-identification-with-llms)
+- [Command Line Interface (CLI)](#command-line-interface-cli)
+
+## Quickstart - Test the app with built-in examples
+
+### PDF document examples
+
+The app provides some built-in examples so you can see how it works before trying one of your own files.
+
+**For PDF/image redaction:** On the 'Redact PDFs/images' tab, you'll see a section titled "Try an example - Click on an example below and then the 'Extract text and redact document' button". Simply click on any of the available examples to load them with pre-configured settings:
+
+- **PDF with selectable text redaction** - Uses local text extraction with standard PII detection
+- **Image redaction with local OCR** - Processes an image file using OCR
+- **PDF redaction with custom entities** - Demonstrates custom entity selection (Titles, Person, Dates)
+- **PDF redaction with AWS services and signature detection** - Shows AWS Textract with signature extraction (if AWS is enabled)
+- **PDF redaction with custom deny list and whole page redaction** - Demonstrates the use of redacting specific named terms and whole pages
+
+Once you have clicked on an example, you can click the 'Extract text and redact document' button to redact the document. You can then click the 'Review and modify redactions' button below this to review and modify suggested redactions. See the 'Basic redaction' section below for more details on redacting your own documents.
+
+### CSV/Excel file examples
+
+**For tabular data:** On the 'Word or Excel/CSV files' tab, you'll find examples for both redaction and duplicate detection:
+
+- **CSV file redaction** - Shows how to redact specific columns in tabular data
+- **Word document redaction** - Demonstrates Word document processing
+- **Excel file duplicate detection** - Shows how to find duplicate rows in spreadsheet data
+
+Once you have clicked on an example, you can click the 'Redact text/data files' button directly to redact the example file. Once done, you can click the 'Review redactions' button to review and modify suggested redaction boxes.
+
+## Basic redaction
+
+The document redaction app can detect personally-identifiable information (PII) in documents. Documents can be redacted directly, or suggested redactions can be reviewed and modified using a graphical user interface. Basic document redaction can be performed quickly using the default options.
+
+**Where to work:** All of the main redaction options and the redact button are on the **'Redact PDFs/images'** tab.
+
+
+
+### Upload files to the app
+
+On the **'Redact PDFs/images'** tab, the **'Redaction settings'** accordion at the top accepts PDFs and image files (JPG, PNG) for redaction. Click on the **'Drop files here or Click to Upload'** area, or select from one of the [examples provided](#pdf-document-examples).
+
+### Text extraction
+
+Under the **'Redaction settings'** accordion, you can see **'Change default text extraction settings'**. You may have the following options available depending on your configuration - if not, AWS Textract will likely be the default option:
+
+- **'Local model - selectable text'** - (optional) Reads text directly from PDFs that have selectable text.
+- **'Local OCR model - PDFs without selectable text'** - (optional) Uses a local OCR model to extract text from PDFs/images. Handles most typed text without selectable text but is less accurate for handwriting and signatures; use the AWS Textract option in this case.
+- **'AWS Textract service - all PDF types'** - Available when the app is configured for AWS. Textract runs in the cloud and is more capable for complex layouts, handwriting, and signatures. It incurs a (relatively small) cost per page.
+
+### AWS Textract signature extraction
+
+
+
+If you select **'AWS Textract service - all PDF types'** as the text extraction method, an accordion **'Enable AWS Textract signature detection (default is off)'** appears. Open it to turn on handwriting and/or signature detection. Enabling signatures has a cost impact (~£2.66 ($3.50) per 1,000 pages vs ~£1.14 ($1.50) per 1,000 pages without signature detection).
+
+
+
+
+
+**NOTE:** Form, layout, table extraction, or face detection can be enabled if required for specific use cases; they are off by default - please contact your system administrator if you need these features.
+
+### PII redaction method
+
+Next we need to choose how the app will identify and redact personally-identifiable information (PII) in the document. Under the **'Change PII identification method'** accordion (under **'Change default redaction settings'**) you will see **'Choose redaction method'**, a radio with three options:
+- **'Extract text only'** runs text extraction without redaction — useful when you only need OCR output or want to review text before redacting.
+- **'Redact all PII'** (the default) uses the chosen PII detection method to find and redact personal information across a range of standard entity types, e.g. addresses, names, dates, etc.
+- **'Redact selected terms'** will focus redaction only on the specific terms in the [custom allow/deny lists](#allow-list-deny-list-and-whole-page-redaction) below.
+
+Still under **'Change default redaction settings'**, you may see the **'Change PII identification model'** section, if enabled, which lets you choose how PII is detected. You may have the choice of the following options. If not, AWS Comprehend will likely be the default option:
+
+- **'Local'** - (optional) Uses a local model (e.g. spaCy) to detect PII at no extra cost, but with less accuracy than the alternative options.
+- **'AWS Comprehend'** - Uses AWS Comprehend for PII detection when the app is configured for AWS; typically more accurate but incurs a cost (around £0.0075 ($0.01) per 10,000 characters).
+- Other options may be available depending on the app settings (e.g. AWS Bedrock, local LLM models).
+
+Under **'Select entity types to redact'** you can choose which types of PII to redact (e.g. names, emails, dates). Click in the box or near the dropdown arrow to see the full list. Any entity type that remains in the box will be searched for during the redaction process.
+
+
+
+### Duplicate page redaction
+
+Alongside the 'Change PII identification method' section, you will see 'Redact duplicate pages'. If this is enabled, following the main redaction process, the app will identify pages with duplicate text in the document and redact them in the same run. If you want to modify the duplicate page detection settings, you can do so on the **Identify duplicate pages** tab - please refer to the [Identifying and redacting duplicate pages with custom settings](#identifying-and-redacting-duplicate-pages-with-custom-settings) section for more details.
+
+### Allow list, deny list, and whole-page redaction
+
+Underneath you will see **'Terms to always include or exclude in redactions, and whole page redaction'**. Here you can:
+
+- **Allow list** – Terms that if found, will never be redacted. To use, ensure that CUSTOM is selected in the **'Select entity types to redact'** dropdown.
+- **Deny list** – Terms that if found, will always be redacted. To use, ensure that CUSTOM is selected in the **'Select entity types to redact'** dropdown.
+- **Fully redact these pages** – Page numbers that will be fully redacted with a box that covers the entire page.
+- **Maximum spelling mistakes for matching deny list terms** – Number (0–9) used for [fuzzy matching](#fuzzy-search-and-redaction) for terms in the Deny list when **CUSTOM_FUZZY** is selected in the entity list. For example, if this is set to 1, then all terms set in the Deny list will be matched with up to 1 spelling mistake. Note that setting this value greater than 0 will increase the time taken to redact the document.
+
+To add an item to the allow list, deny list, or fully redacted pages list, type it into the relevant box and press Enter. You can remove items by clicking the **'x'** next to an item, or by pressing the backspace key when the box is selected.
+
+
+
+You can add or remove terms directly in these controls. To load many terms from a file (e.g. a CSV), use the file upload areas at the top of the **Settings** tab. If you upload a CSV containing a single column of terms into the allow list or deny list box, those terms will be loaded into the corresponding list on the 'Redact PDFs/images' tab. Similarly, if you upload a CSV containing a single column of page numbers into the 'Fully redact these pages' box, those pages will be fully redacted on the next redaction run, and the relevant box on the 'Redact PDFs/images' tab will be filled with the page numbers.
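+
+For illustration, a single-column terms CSV for the allow or deny list might look like the following (made-up terms; a page-number CSV for 'Fully redact these pages' follows the same one-column pattern):
+
+```
+John Smith
+Oak Lane
+ACME Ltd
+```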
+
+
+
+### Cost and time estimation
+
+If enabled, an **'Estimated costs and time taken'** section appears on the **'Redact PDFs/images'** tab. **'Existing Textract output file found'** (or **'Existing local OCR output file found'**) is ticked automatically when previous extraction output for the same document already exists in the output folder or was [uploaded by you](#additional-aws-textract--local-ocr-outputs), which indicates that the text extraction process will not be repeated for future redaction tasks.
+
+
+
+### Cost code selection
+
+If cost codes are enabled, an **'Assign task to cost code'** section appears on the same tab. Choose a cost code before running redaction. You can search the table or type in the **'Choose cost code for analysis'** dropdown to filter. You can then set a default cost code for future redaction tasks with your login by clicking on the **'Set default cost code'** button.
+
+
+
+### Redact only specific pages
+
+To redact only a subset of pages (e.g. only page 1), go to the **Settings** tab and open the **'Redact only selected pages'** accordion. Set **'Lowest page to redact (set to 0 to redact from the first page)'** and **'Highest page to redact (set to 0 to redact to the last page)'** (e.g. both 1 for only page 1). The next redaction run will only process that range; output filenames will include a suffix like **'..._1_1.pdf'**.
+
+
+
+### Run redaction
+
+Once all the above settings have been prepared, at the bottom of the **'Redact PDFs/images'** tab, open the **'Extract text and redact document'** accordion and click **'Extract text and redact document'**. The app will process the document (typically around 30 pages per minute, depending on options). When finished, a message will indicate completion and output files will appear in the **'Output files'** area. Use **'Review and modify redactions'** to open the review tab.
+
+### Redaction outputs
+
+After you click **'Extract text and redact document'**, the **'Output files'** area on the **'Redact PDFs/images'** tab shows:
+
+
+
+- **'...redacted.pdf'** – The original PDF with suggested redactions applied (text removed and replaced by a black box).
+- **'...redactions_for_review.pdf'** – The original PDF with redaction boxes overlaid but text still visible. Use this in Adobe Acrobat or other PDF viewers to review suggested redactions before finalising.
+- **'...ocr_outputs.csv'** – Line-by-line extracted text from the document (useful for searching text in Excel or similar).
+- **'...ocr_outputs_with_words.csv'** – Word-level extracted text from the document with bounding boxes.
+- **'...review_file.csv'** – Details and locations of all suggested redactions; required for the [review process](#reviewing-and-modifying-suggested-redactions).
+
+#### Additional AWS Textract / local OCR outputs
+
+You may also see a **'..._textract.json'** file, and/or a **'...ocr_outputs_with_words.json'** file. You can save these to your computer and upload them later alongside your input document to skip repeating the text extraction step for the same document:
+
+
+
+
+#### Log file outputs and other optional outputs
+
+On the **Settings** tab, open the **'Log file outputs'** accordion to access log and optional output files. You may see a **'decision_process_table.csv'** (decisions made per page) and, if enabled by your administrator, **'..._visualisations.jpg'** images showing OCR bounding boxes per page:
+
+
+
+#### Downloading output files from previous redaction tasks
+
+If you are logged in via AWS Cognito and lose the app page (e.g. after a crash or reload), you may still be able to recover output files if the server has not been restarted. When enabled, open the **Settings** tab and use **'View and download all output files from this session'** at the bottom. Click **'Refresh files in output folder'**, then tick the box next to a file to display and download it.
+
+
+
+## Reviewing and modifying suggested redactions
+
+Sometimes the app will suggest redactions that are incorrect, or will miss phrases with personal information. The app allows you to review and modify suggested redactions to compensate for this. You can do this on the 'Review redactions' tab.
+
+We will go through ways to review suggested redactions with an example. On the **'Redact PDFs/images'** tab, upload the ['Example of emails sent to a professor before applying.pdf'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_of_emails_sent_to_a_professor_before_applying.pdf) file. Keep the default 'Local model - selectable text' option and click **'Extract text and redact document'**. Once the outputs are created, go to the **'Review redactions'** tab.
+
+On the 'Review redactions' tab you have a visual interface that allows you to inspect and modify redactions suggested by the app. There are quite a few options to look at, so we'll go from top to bottom.
+
+
+
+### Uploading documents for review
+
+The top area has a file upload area where you can upload documents to review redactions. If you have just done a redaction task, these boxes should already be filled with the relevant files. In the left box (1.), upload the original PDF file. If you have a document that you have previously redacted, you can also upload the '...redactions_for_review.pdf' file that is produced by the redaction process, which will load in the previous redactions.
+
+In the second input file box to the right (2.), you can upload a '..._ocr_result_with_words' file, which will allow you to search through the text and easily [add new redactions based on word search](#search-text-and-redact). You can also upload an '..._ocr_output.csv' file here that comes out of a redaction task, so that you can navigate the extracted text from the document. Click the button '2. Upload Review or OCR csv files' to load in these files.
+
+Now you can review and modify the suggested redactions using the interface described below.
+
+
+
+### Page navigation
+
+You can change the page viewed either by clicking 'Previous page' or 'Next page', or by typing a specific page number in the 'Current page' box and pressing Enter on your keyboard. Each time you switch page, it will save redactions you have made on the page you are moving from, so you will not lose changes you have made.
+
+You can also navigate to different pages by clicking on rows in the tables under 'Search suggested redactions' to the right, or 'search all extracted text' (if enabled) beneath that.
+
+### Document viewer
+
+In the centre left of the tab, you will see the first page of the document. On the selected page, each redaction is highlighted with a box next to its suggested redaction label (e.g. person, email). To zoom in and out of the page, use your mouse wheel. When selected, press spacebar to go back to the default zoom.
+
+
+
+There are a number of options for adding and modifying redaction boxes in the document viewer pane. At the top and bottom of the document viewer you will see the relevant controls: a box icon, a hand icon, and two arrows pointing counter-clockwise and clockwise (details below).
+
+
+
+
+#### Modify existing redactions (hand icon)
+
+After clicking on the hand icon (or pressing 'd' on your keyboard while the document viewer is selected), you can modify existing redaction boxes. In this mode, click and hold an existing box to move it. Click on one of the small handles at the edges of a box to change its size. Double click on a box to open a menu where you can change its label or colour, or remove it.
+
+
+
+To delete a box, click on it to highlight it, then press delete on your keyboard. Alternatively, double click on a box and click 'Remove' on the box that appears.
+
+
+
+#### Add new redaction boxes (box icon)
+
+To change to 'add redaction boxes' mode, scroll to the bottom of the page. Click on the box icon, and your cursor will change into a crosshair. Now you can add new redaction boxes where you wish. A popup will appear when you create a new box so you can select a label and colour for the new box.
+
+#### 'Locking in' new redaction box format
+
+It is possible to lock in a chosen format for new redaction boxes so that you don't have the popup appearing each time. When you make a new box, select the options for your 'locked' format, and then click on the lock icon on the left side of the popup, which should turn blue.
+
+
+
+You can now add new redaction boxes without a popup appearing. If you want to change or 'unlock' your chosen box format, you can click on the new icon that has appeared at the bottom of the document viewer pane, which looks a little like a gift tag. You can then change the defaults, or click on the lock icon again to 'unlock' the new box format - popups will then appear again each time you create a new box.
+
+
+
+#### Saving new page redactions on the Document viewer pane
+
+If you are working on a page and you have created boxes manually or modified existing boxes, you are advised to click on the 'Save changes on current page to file' button to the right to ensure that they are saved to your output files.
+
+### Modify existing redactions
+
+To the right of the Document viewer pane you should see a heading above a table called 'Modify redactions' (see below). The table shows a list of all the suggested redactions in the document alongside the page, label, and text (if available).
+
+
+
+If you click on one of the rows in this table, you will be taken to the page of the redaction. Clicking on a redaction row for the currently displayed page will change the colour of that redaction box to blue to help you locate it in the document viewer (this only applies within the app, not in redacted output PDFs).
+
+
+
+You can choose a specific entity type to see which pages the entity is present on. If you want to go to the page specified in the table, you can click on a cell in the table and the review page will be changed to that page.
+
+To filter the 'Search suggested redactions' table you can:
+1. Click on one of the dropdowns (Redaction category, Page, Text), and select an option, or
+2. Write text in the 'Filter' box just above the table, then click the blue tick icon to apply the filter to the table.
+
+Once you have filtered the table, or selected a row from the table, you have a few options underneath on what you can do with the filtered rows:
+
+- Click the **Exclude all redactions in table** button to remove all redactions visible in the table from the document. **Important:** ensure that you have clicked the blue tick icon next to the search box before doing this to first filter the table, or you will remove all redactions from the document. If you do end up doing this, click the 'Undo last element removal' button below to restore the redactions.
+- Click the **Exclude specific redaction row** button to remove only the redaction from the last row you clicked on from the document. The currently selected row is visible below.
+- Click the **Exclude all redactions with the same text as selected row** button to remove all redactions from the document that are exactly the same as the selected row text.
+
+**NOTE**: After excluding redactions using any of the above options, click the 'Reset filters' button below to ensure that the dropdowns and table return to seeing all remaining redactions in the document.
+
+If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
+
+### Search text and redact
+
+After a document has been processed, you may need to redact specific terms, names, or phrases that the automatic PII detection might have missed. The **"Search text and redact"** tab gives you the power to find and redact any text within your document manually.
+
+#### **Step 1: Search for Text**
+
+
+
+
+
+1. Navigate to the **"Search text and redact"** tab to the right of the **'Review redactions'** tab, under the 'Apply redactions to PDF' and 'Save changes on current page to file' buttons.
+2. The main table will initially be populated with all the text extracted from the document for a page, broken down by word. Use the page selector to view a specific page, or 'ALL' to see all words.
+3. To find specific words or phrases to redact, use the **"Multi-word text search"** box to type the word or phrase you want to find (this will search the whole document).
+4. If you want to do a regex-based search, tick the 'Enable regex pattern matching' box under 'Search options' below.
+5. Click the **"Search"** button or press Enter.
+6. The table below will update to show only the rows containing text that matches your search query.
+
+Below the search button you can customise the appearance and label of the new redactions under the **"Search options"** accordion:
+
+* **Label for new redactions:** Change the text that appears on the redaction box (default is "Redaction"). You could change this to "CONFIDENTIAL" or "CUSTOM".
+* **Colour for labels:** Set a custom colour for the redaction boxes. You can use the colour picker interface that pops up to select a colour.
+
+
+
+
+
+> **Tip:** If you make a mistake during the search, to clear all filters and see the full text again, click the **"Reset table to original state"** button. You can also click the 'Undo latest redaction' button to remove the latest redaction item.
+
+#### **Step 2: Select and Review a Match**
+
+Your redaction search results will appear in the table underneath the search options, with each found word on a different row.
+
+
+
+
+
+When you click on any row in the search results table:
+
+* The document preview on the left will automatically jump to that page, allowing you to see the word in its original context.
+* The details of your selection will appear in the smaller **"Selected row"** table for confirmation.
+
+#### **Step 3: Choose Your Redaction Method**
+
+You have several options for redacting the text you've found:
+
+* **Redact a single, specific instance of the phrase:**
+ * Click on the exact row in the table you want to redact.
+ * Click the **`Redact specific text row`** button.
+ * Only that single instance will be redacted.
+
+* **Redact all instances of a word/phrase:**
+ * Let's say you want to redact the name "John Smith" everywhere it appears.
+ * Find and select one instance of "John Smith" in the table.
+ * Click the **`Redact all words with same text as selected row`** button.
+ * The application will find and redact every single occurrence of "John Smith" throughout the entire document.
+
+
+
+
+
+* **Redact all current search results:**
+ * Perform a search (e.g., for a specific person's name).
+ * If you are confident that every result shown in the filtered table should be redacted, click the **`Redact all text in table`** button.
+ * This will apply a redaction to all currently visible items in the table in one go, across all relevant pages in the document.
+
+An example of the outputs you can see in the document view pane is shown below.
+
+
+
+
+#### **Undoing a Mistake**
+
+If you make a mistake, you can reverse the last redaction action you performed on this tab.
+
+* Click the **`Undo latest redaction`** button. This will revert the last set of redactions you added (whether it was a single row, all of a certain text, or all search results).
+
+> **Important:** This undo button only works for the *most recent* action. It maintains a single backup state, so it cannot undo actions that are two or more steps in the past.
+
+### Navigating through the document using 'View text'
+
+The 'View text' table shows the text extracted from the document so you can check the correctness of the OCR process. If you have just completed a redaction task, you should see this table already filled in. Otherwise, it will be populated when you upload a '..._ocr_output.csv' file alongside a document file on the Review redactions tab, as [described above](#uploading-documents-for-review).
+
+
+
+You can search through the extracted text by using the search bar just above the table. You can also filter specific columns by clicking on the three dots next to the column header and clicking 'Filter'. When you click on a row, the Document viewer pane to the left will change to the selected page. To return the table to its original content, click the button below the table 'Reset OCR output table filter'.
+
+
+
+
+
+### Apply revised redactions to PDF
+
+Once you have reviewed all the redactions in your document and you are happy with the outputs, you can click 'Apply revised redactions to PDF' to create new '_redacted.pdf' and '_redactions_for_review.pdf' outputs.
+
+
+
+## Loading in previous results to continue redaction
+
+You may want to return to the same document later to modify existing redactions and add new ones. In this case, especially for large documents, you do not want to waste the time and expense to redo the redaction task from scratch. The Redaction app allows you to load in previous redaction results so you can continue from where you left off.
+
+### Loading in previous results from the redactions_for_review.pdf file
+
+The redaction process produces a file with the suffix '_redactions_for_review.pdf', which contains all the redaction information needed to reuse in the app, and is also suitable for continuing with redactions in Adobe Acrobat.
+
+
+
+To continue with a file previously redacted, import your 'redactions_for_review.pdf' file on the 'Review redactions' tab in the [first file input box](#uploading-documents-for-review). Once your file is uploaded, you should see the first page of the document appear in the Document view pane below, and under the 'Modify redactions' tab to the right, you should see the table filled in with details of the previous redactions.
+
+### Merging previous redaction review files
+
+Say you have done multiple redaction tasks with the same file, and you want to merge all the redaction boxes together into one combined review document. You can merge multiple output 'redactions_for_review.pdf' files using the interface on the Settings page under 'Combine multiple review PDFs or CSV files'.
+
+
+
+In the file input box, upload your multiple 'redactions_for_review.pdf' files, then click 'Combine multiple review PDFs into one'. You should then get a combined file that contains all the redaction boxes from the input files. You can then upload this file into the file input on the 'Review redactions' tab to review and modify redactions [as detailed above](#loading-in-previous-results-from-the-redactions_for_reviewpdf-file).
+
+
+
+### Loading in OCR results to search for new redactions
+
+When loading in results following the above method, you will notice that the table under 'Search text and redact' is still empty. To use this feature we need to load in the OCR results from your previous redaction task.
+
+When you redact a document, one of the outputs has the suffix 'ocr_results_with_words.csv'. It's this file that we need to upload to be able to search for new redactions. To do this, find the relevant ocr_results_with_words.csv file, and upload it into the second upload box on the 'Review redactions' tab in the top right (labelled 2.). When you do this, you should see the 'Search text and redact' tab table filled in with data. The table under the 'View text' tab should also be filled in.
+
+
+
+### Using a previous OCR results file to skip redoing OCR for future redaction tasks
+
+If you have a large document that you want to redact a second time in future, you can save time and money by retaining output files from the first run, and uploading them alongside the document next time you use the app for redaction.
+
+Every time you redact, a file with the suffix '.json' is produced. This file contains all the OCR results from your analysis. To use this file next time and skip waiting and paying for the OCR process, you can upload it alongside your document. On the 'Redact PDFs/images' tab, the file input area allows you to upload multiple files at the same time. With your document file and the .json file in the same folder, click on the 'Drop files here or Click to Upload' area and select both the PDF file and the .json file from the previous analysis output. The app should load both in.
+
+To check the previous OCR result upload has been successful, lower down the page you should see two checkboxes, called 'Existing Textract output file found' and 'Existing local OCR output found'. If one of these has become checked, then the OCR output has been successfully uploaded. When you next redact the document with the given method, it should load in the existing OCR results to skip re-analysis.
+
+## Document summarisation
+
+When summarisation is enabled, a **Document summarisation** tab is shown in the app. It lets you generate LLM-based summaries from PDFs, or from OCR output CSVs (e.g. from a previous redaction run).
+
+
+
+Here is how you can summarise a document:
+
+1. **Upload Files:**
+ * **1.a Upload PDF files:** In the summarisation tab, use "Upload one or multiple PDF files to summarise" to attach one or more PDF files, or
+ * **1.b Upload OCR output files:** In the summarisation tab, use "Upload one or multiple 'ocr_output.csv' files to summarise" to attach one or more `*_ocr_output.csv` files.
+2. **Summarisation settings (accordion):**
+ * **Choose LLM inference method for summarisation**: Choose from the LLM options available in the app settings.
+ * **Max pages per page-group summary**: Limits how many pages are summarised together.
+ * **Summary format**: **Concise** or **Detailed**.
+ * **Additional summary instructions (optional)**: e.g. "Focus on key obligations."
+3. **Generate summary:** Click **"Generate summary"** to run the summarisation.
+4. **Outputs:** When finished, you can download summary files and view the summary that appears below.
+
+
+
+## Redacting Word, tabular data files (XLSX/CSV) or open text
+
+### Word or tabular data files (XLSX/CSV)
+
+The app can be used to redact Word (.docx), or tabular data files such as XLSX or CSV files. For redaction with tabular files to work properly, your data file (CSV, Excel) needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for XLSX files, each sheet in the file that you want to redact should be in this simple format.
+
+To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Word or Excel/CSV files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
+
+
+
+If you were instead to upload an XLSX file, you would also see a list of all the sheets in the XLSX file that can be redacted. The 'Select columns' area underneath will suggest a list of all columns in the file across all sheets.
+
+
+
+Once you have chosen your input file and data, you can choose the redaction method. If visible, 'Local' will use a small local model, which is the same as that used for documents on the **'Redact PDFs/images'** tab. 'AWS Comprehend' will give better results, at a slight cost.
+
+When you click Redact text/data files, you will see the progress of the redaction task by file and sheet, and you will receive a CSV output with the redacted data.
+
+### Choosing output anonymisation format
+
+You can also choose the anonymisation format of your output results. Open the tab 'Anonymisation output format' to see the options. By default, any detected PII will be replaced with the word 'REDACTED' in the cell. You can choose one of the following options as the form of replacement for the redacted text:
+
+- **replace with 'REDACTED'**: Replaced by the word 'REDACTED' (default)
+- **replace with entity type**: Replaced by the entity type, e.g. 'PERSON' for people, 'EMAIL_ADDRESS' for emails, etc.
+- **redact completely**: Text is removed completely and replaced by nothing.
+- **hash**: Replaced by a unique long ID code that is consistent for a given piece of entity text, i.e. a particular name will always have the same ID code.
+- **mask**: Replaced with stars '*'.
+
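+As a purely hypothetical illustration (not output copied from the app, and the exact hash and star formats will differ), here is roughly how the cell text 'John Smith is a person' might come out under each format:
+
+| Output format | Example result |
+|---|---|
+| replace with 'REDACTED' | REDACTED is a person |
+| replace with entity type | PERSON is a person |
+| redact completely | is a person |
+| hash | a1b2c3d4... is a person (same code wherever 'John Smith' appears) |
+| mask | ********** is a person |
+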
+
+
+### Redacting open text
+You can also write open text into an input box and redact that using the same methods as described above. To do this, write or paste text into the 'Enter open text' box that appears when you open the 'Redact open text' tab. Then select a redaction method, and an anonymisation output format as described above. The redacted text will be printed in the output textbox, and will also be saved to a simple CSV file in the output file box.
+
+You can test this by copying and pasting the text "John Smith is a person" into the input box and selecting the **replace with 'REDACTED'** option. After pressing 'Redact text/data files', you should see the text "REDACTED is a person" in the output textbox, and a csv file with the redacted text in the output file box.
+
+
+
+#### Redaction log outputs
+A list of the suggested redactions from tabular data, Word, or open text redaction is also provided in the output box, with the name '..._log.csv'. This file gives a tabular breakdown of the specific redactions applied: the entity type, position, and underlying text.
+
+# Advanced user guide
+
+If the default settings and options for redaction are not sufficient for your needs, you can adjust the settings following the guide below.
+
+## Identifying and redacting duplicate pages with custom settings
+
+Simple redaction of duplicate pages using the checkbox on the 'Redact PDFs/images' tab during initial redaction is described [here](#duplicate-page-redaction). If the default settings are not sufficient for your needs, or if you want to identify duplicate lines of text or passages, you can adjust the settings following the guide below.
+
+The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/).
+
+Some redaction tasks involve removing duplicate pages of text that may exist across multiple documents. This feature helps you find and remove duplicate content that may exist in single or multiple documents. It can identify everything from single identical pages to multi-page sections (subdocuments). The process involves three main steps: configuring the analysis, reviewing the results in the interactive interface, and then using the generated files to perform the redactions.
+
+### Duplicate page detection in documents
+
+This section covers finding duplicate pages across PDF documents using OCR output files. For a quick introduction to this functionality, you could run one of the following examples:
+
+**For duplicate page detection:** On the 'Identify duplicate pages' tab, you'll find examples for finding duplicate content in documents:
+
+- **Find duplicate pages of text in document OCR outputs** - Uses page-level analysis with a similarity threshold of 0.95 and minimum word count of 10
+- **Find duplicate text lines in document OCR outputs** - Uses line-level analysis with a similarity threshold of 0.95 and minimum word count of 3
+
+Once you have clicked on an example, you can click the 'Identify duplicate pages/subdocuments' button to load the example into the app and find duplicate content.
+
+
+
+**Step 1: Upload and Configure the Analysis**
+First, navigate to the **'Identify duplicate pages'** tab. In **'Step 1: Configure and run analysis'**, upload all the ocr_output.csv files you wish to compare. These files are generated every time you run a redaction task and contain the text for each page of a document.
+
+For our example, you can upload the four 'ocr_output.csv' files provided in the example folder. Open the **'Duplicate matching parameters'** accordion to set:
+
+- **Similarity threshold:** A score from 0 to 1. Pages or sequences with text similarity above this value are considered a match (default 0.95).
+- **Minimum word count:** Pages or lines with fewer words than this are ignored (default 10).
+- **Duplicate matching mode:** **'Find duplicates by page'** compares full-page text; **'Find duplicates by text line'** compares individual lines.
+
+**Matching strategy** (below the parameters):
+- **'Combine consecutive matches into a single match (subdocument match)'** (default: checked): Finds the longest possible sequence of matching pages (subdocuments). Uncheck to use the next option.
+- **Minimum consecutive matches** (slider, shown when subdocument matching is unchecked): Only report sequences of at least this many consecutive matches (e.g. 3 for at least 3-page runs). Set to 1 for single-page matching.
+
+Once your parameters are set, click **'Identify duplicate pages/subdocuments'**.
+
+In case you want to see the original PDFs for the example, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
+
+
+
+**Step 2: Review Results in the Interface**
+After the analysis is complete, the results will be displayed directly in the interface.
+
+*Analysis Summary:* A table will appear showing a summary of all the matches found. The columns will change depending on the matching strategy you chose. For subdocument matches, it will show the start and end pages of the matched sequence.
+
+*Interactive Preview:* This is the most important part of the review process. Click on any row in the summary table. The full text of the matching page(s) will appear side-by-side in the "Full Text Preview" section below, allowing you to instantly verify the accuracy of the match.
+
+
+
+**Step 3: Download and Use the Output Files**
+The analysis also generates a set of downloadable files for your records and for performing redactions.
+
+- page_similarity_results.csv: This is a detailed report of the analysis you just ran. It shows a breakdown of the pages from each file that are most similar to each other above the similarity threshold. You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'. For single-page matches, it will list each pair of matching pages. For subdocument matches, it will list the start and end pages of each matched sequence, along with the total length of the match.
+
+
+
+- [Original_Filename]_pages_to_redact.csv: For each input document that was found to contain duplicate content, a separate redaction list is created. This is a simple, one-column CSV file containing a list of all page numbers that should be removed. To use these files, you can either upload the original document (i.e. the PDF) on the 'Review redactions' tab, and then click on the 'Apply relevant duplicate page output to document currently under review' button. You should see the whole pages suggested for redaction on the 'Review redactions' tab. Alternatively, you can reupload the file into the whole page redaction section as described in the ['Full page redaction list example' section](#full-page-redaction-list-example).
+
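+For illustration, such a file is simply one page number per row (the page numbers below are made up):
+
+```
+2
+3
+7
+```
+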
+
+
+If you want to combine the results from this redaction process with previous redaction tasks for the same PDF, you could merge review file outputs following the steps described in [Merging previous redaction review files](#merging-previous-redaction-review-files) above.
+
+**Redact duplicate pages on the Redact PDFs/images tab**
+
+On the **'Redact PDFs/images'** tab, in the **'Redaction settings'** area (accordion **'Terms to always include or exclude in redactions, and whole page redaction'**), there is a **'Redact duplicate pages'** checkbox. When this is enabled, the app will identify duplicate pages and apply whole-page redaction to them as part of the same redaction run. This option is shown when PII identification options are enabled. You can use it instead of (or in addition to) running duplicate detection on the **'Identify duplicate pages'** tab and then applying the output to a document on **'Review redactions'**.
+
+### Duplicate detection in tabular data
+
+The app also includes functionality to find duplicate cells or rows in CSV, Excel, or Parquet files. This is particularly useful for cleaning datasets where you need to identify and remove duplicate entries.
+
+**Step 1: Upload files and configure analysis**
+
+Navigate to the 'Word or Excel/CSV files' tab and scroll down to the "Find duplicate cells in tabular data" section. Upload your tabular files (CSV, Excel, or Parquet) and configure the analysis parameters:
+
+- **Similarity threshold**: Score (0-1) to consider cells a match. 1 = perfect match
+- **Minimum word count**: Cells with fewer words than this value are ignored
+- **Do initial clean of text**: Remove URLs, HTML tags, and non-ASCII characters
+- **Remove duplicate rows**: Automatically remove duplicate rows from deduplicated files
+- **Select Excel sheet names**: Choose which sheets to analyze (for Excel files)
+- **Select text columns**: Choose which columns contain text to analyze
+
+
+
+**Step 2: Review results**
+
+After clicking "Find duplicate cells/rows", the results will be displayed in a table showing:
+- File1, Row1, File2, Row2
+- Similarity_Score
+- Text1, Text2 (the actual text content being compared)
+
+Click on any row to see more details about the duplicate match in the preview boxes below.
+
+**Step 3: Remove duplicates**
+
+The deduplicated output files will be available for download in the output box. If you have selected 'Remove duplicate rows from deduplicated files', the duplicate rows will be removed from the deduplicated files. If not, then the duplicate rows will be indicated in the output files in the column named 'duplicated', which can be TRUE or FALSE.
+
+
+
+## Export redacted document files to Adobe Acrobat
+
+Files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/export_to_adobe/).
+
+The Document Redaction app has enhanced features for working with Adobe Acrobat. You can now export suggested redactions to Adobe, import Adobe comment files into the app, and use the new `redactions_for_review.pdf` files directly in Adobe Acrobat.
+
+### Using redactions_for_review.pdf files with Adobe Acrobat
+
+The app now generates `redactions_for_review.pdf` files that contain the original PDF with redaction boxes overlaid but the original text still visible underneath. These files are specifically designed for use in Adobe Acrobat and other PDF viewers where you can:
+
+- See the suggested redactions without the text being permanently removed
+- Review redactions before finalising them
+- Use Adobe Acrobat's built-in redaction tools to modify or apply the redactions
+- Export the final redacted version directly from Adobe
+
+Simply open the `redactions_for_review.pdf` file in Adobe Acrobat to begin reviewing and modifying the suggested redactions.
+
+### Exporting comment files to Adobe Acrobat
+
+To convert suggested redactions to Adobe's .xfdf comment file format, you need to have the original PDF and a review file csv in the input box at the top of the Review redactions page.
+
+
+
+Then, you can find the export to Adobe option at the bottom of the Review redactions tab. Adobe comment files will be output here.
+
+
+
+Once the input files are ready, you can click on the 'Convert review file to Adobe comment format' button. You should see a file appear in the output box with a '.xfdf' file type. To use this in Adobe, download it to your computer and double click on it; a pop-up box will appear asking you to find the PDF file associated with it. Find the original PDF file used for your redaction task, and the file should open in Adobe Acrobat with the suggested redactions.
+
+
+
+### Importing comment files from Adobe Acrobat
+
+The app also allows you to import .xfdf files from Adobe Acrobat. To do this, go to the same Adobe import/export area as described above at the bottom of the Review Redactions tab. In this box, you need to upload a .xfdf Adobe comment file, along with the relevant original PDF for redaction.
+
+
+
+When you click the 'convert .xfdf comment file to review_file.csv' button, the app should take you up to the top of the screen where the new review file has been created and can be downloaded.
+
+
+
+## Submit documents to the AWS Textract API service for faster OCR
+
+If enabled by your administrator, an accordion **'Submit whole document to AWS Textract API (quickest text extraction for large documents)'** appears on the **'Redact PDFs/images'** tab. This sends whole documents to Textract for fast extraction, separate from the main redaction flow. For very large documents, this can be a significant time saver in getting your OCR results, which you can then use for redaction tasks in the app.
+
+
+
+### Starting a new Textract API job
+
+To use this feature, first upload a document file [in the usual way](#upload-files-to-the-app) on the **'Redact PDFs/images'** tab. Under **'Enable AWS Textract signature detection'** (inside **'Change default redaction settings'**) you can choose whether to analyse signatures or not (with a [cost implication](#aws-textract-signature-extraction)).
+
+Then, open the section under the heading 'Submit whole document to AWS Textract API...'.
+
+
+
+Click 'Analyse document with AWS Textract API call'. After a few seconds, the job should be submitted to the AWS Textract service. The box 'Job ID to check status' should now have an ID filled in, and the table should gain a row with details of the new API job, alongside any previous jobs submitted within the last seven days.
+
+Click the button underneath, 'Check status of Textract job and download', to see progress on the job. Processing will continue in the background until the job is ready, so it is worth periodically clicking this button to see if the outputs are ready. In testing, and as a rough estimate, it seems like this process takes about five seconds per page. However, this has not been tested with very large documents. Once ready, the '_textract.json' output should appear below.
+
+### Textract API job outputs
+
+The '_textract.json' output can be used to speed up further redaction tasks as [described previously](#cost-and-time-estimation); the 'Existing Textract output file found' flag should now be ticked.
+
+
+
+You can now easily get the '..._ocr_output.csv' redaction output based on this '_textract.json' (described in [Redaction outputs](#redaction-outputs)) by clicking on the button 'Convert Textract job outputs to OCR results'. You can then use this file e.g. for [identifying duplicate pages](#identifying-and-redacting-duplicate-pages-with-custom-settings), or for redaction review.
+
+## Advanced OCR settings - Efficient OCR, overwrite existing OCR
+
+On the Settings tab you may see a section called 'Advanced OCR settings'. Here you may see a few options.
+
+### Efficient OCR
+
+When checked, redaction tasks will first check all pages for text that can be extracted simply without needing to analyse with the OCR model. If there are more words on the page than the value shown in 'Minimum words for efficient OCR', the app will follow this simple extraction process, and only run the more expensive OCR process for the remaining pages.
+
+The field 'Min. page-area fraction for an embedded image to force OCR' is shown when Efficient OCR is enabled. It sets the smallest fraction of the PDF page area that a **single** embedded image must cover, in one placement on the page, for that page to be sent through the full OCR step **even if** the page already has enough extractable words for the text-only path. That way, pages with plenty of selectable text but a large image (for example a photo or embedded scan) can still be analysed by OCR, which can pick up text that exists only inside the image and is not separate selectable text.
+
+Values are typically small decimals (for example `0.01` for 1% of the page area). Enter **0** to turn off this rule: routing then depends only on the minimum word count (the label in the app notes this as “word count only”). To disable Efficient OCR entirely, uncheck the 'Efficient OCR' checkbox and run your redaction task.
+
+**Note:** *AWS Textract* with *Extract signatures* still analyses **all** pages and disables Efficient OCR for that workflow.
+
+### Overwrite existing OCR results
+
+When checked, redaction tasks will overwrite any existing OCR results with the new results. This is useful if you have already run a redaction task and you want to re-run it with a different method or settings. To disable this feature, simply uncheck the 'Overwrite existing OCR results' checkbox and run your redaction task.
+
+### High-quality Textract OCR
+
+This option may not be visible unless your system administrator has enabled it. When checked, redaction tasks will use a hybrid approach to OCR. This will first use the relatively efficient Textract OCR process to extract text, and then use the more expensive Vision Language Model (VLM) OCR process to extract any text that was missed by the efficient OCR process. This is useful if you have a document that is a mix of text and images, and you want to ensure that all text is extracted. To disable this feature, simply uncheck the 'High quality Textract OCR' checkbox and run your redaction task.
+
+### Save page OCR visualisations
+When checked, redaction tasks will save page OCR visualisations to the output folder. This is useful for debugging and to see the OCR results for each page. The visualisations will be saved as '_page_ocr_visualisations.png' files in the output folder. To disable this feature, simply uncheck the 'Save page OCR visualisations' checkbox and run your redaction task.
+
+## Modifying existing redaction review files
+You can find the folder containing the files discussed in this section [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/).
+
+As well as serving as an input to the document redaction app's review function, the 'review_file.csv' output can be modified inside or outside of the app, giving you the flexibility to adjust redaction details in other tools such as a spreadsheet editor.
+
+If you open up a 'review_file' csv output using a spreadsheet software program such as Microsoft Excel you can easily modify redaction properties. Open the file '[Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv)', and you should see a spreadsheet with just four suggested redactions (see below). The following instructions are for using Excel.
+
+
+
+The first thing we can do is remove the first row - 'et' is suggested as a person, but is obviously not a genuine instance of personal information. Right click on the row number and select 'Delete' from the menu. Next, let's imagine that what the app identified as a 'phone number' was in fact another type of number and we wanted to change the label. Simply click on the relevant label cell and change it, say to 'SECURITY_NUMBER'. You could also use 'Find & Select' -> 'Replace' from the top ribbon menu if you wanted to change a number of labels simultaneously.
+
+What if we wanted to change the colour of the 'email address' entry on the redaction review tab of the redaction app? The colours in a review file are based on an RGB scale with three numbers ranging from 0 to 255. [You can find suitable colours here](https://rgbcolorpicker.com). Using this scale, if I wanted my review box to be pure blue, I can change the cell value to (0,0,255).
+
+Imagine that a redaction box was slightly too small, and I didn't want to use the in-app options to change the size. In the review file csv, we can modify e.g. the ymin and ymax values for any box to increase the extent of the redaction box. For the 'email address' entry, let's decrease ymin by 5, and increase ymax by 5.
+
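+If you prefer scripting these edits rather than making them by hand in Excel, the same changes can be sketched with pandas. This is only a sketch under assumptions: the column names ('label', 'color', 'ymin', 'ymax') and the label/colour value formats are guesses based on the description above, so check them against the headers in your own review file before running anything like this.
+
+```python
+import pandas as pd
+
+# Load the review file produced by the redaction run
+df = pd.read_csv("Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv")
+
+# Drop the spurious 'et' person redaction (assumed to be the first row)
+df = df.drop(index=0).reset_index(drop=True)
+
+# Relabel the phone number entry (label values here are assumptions)
+df.loc[df["label"] == "PHONE_NUMBER", "label"] = "SECURITY_NUMBER"
+
+# Make the email address box pure blue (assumes colour is stored as an RGB string)
+df.loc[df["label"] == "EMAIL_ADDRESS", "color"] = "(0, 0, 255)"
+
+# Grow the email address box vertically by 5 units in each direction
+is_email = df["label"] == "EMAIL_ADDRESS"
+df.loc[is_email, "ymin"] = df.loc[is_email, "ymin"] - 5
+df.loc[is_email, "ymax"] = df.loc[is_email, "ymax"] + 5
+
+df.to_csv("Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv", index=False)
+```
+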
+I have saved an output file following the above steps as '[Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv)' in the same folder as the original. Let's upload this file to the app along with the original pdf to see how the redactions look now.
+
+
+
+We can see from the above that we have successfully removed a redaction box, changed labels, colours, and redaction box sizes.
+
+# Features for expert users/system administrators
+This advanced user guide covers features that require system administrator access or command-line usage. These options are not enabled by default and are therefore not available to users who are only using the default graphical user interface; they must be configured by your system administrator. They are typically used by system administrators or advanced users who need more control over the redaction process.
+
+## Using AWS Textract and Comprehend when not running in an AWS environment
+
+AWS Textract and Comprehend give much better results for text extraction and document redaction than the local model options in the app. The most secure way to access them in the Redaction app is to run the app in a secure AWS environment with relevant permissions. Alternatively, you could run the app on your own system while logged in to AWS SSO with relevant permissions.
+
+However, it is possible to access these services directly via API from outside an AWS environment by creating IAM users and access keys with relevant permissions to access AWS Textract and Comprehend services. Please check with your IT and data security teams that this approach is acceptable for your data before trying the following approaches.
+
+To use either of the approaches below, you will need to create a new IAM user in your AWS environment with permissions for "textract:AnalyzeDocument", "textract:DetectDocumentText", and "comprehend:DetectPiiEntities". Under the user's security credentials, create new access keys and note down the access key and secret key.
+
+### Direct access by passing AWS access keys through app
+The Redaction Settings tab now has boxes for entering the AWS access key and secret key. If you paste the relevant keys into these boxes before performing redaction, you should be able to use these services in the app.
+
+### Picking up AWS access keys through an .env file
+The app can also pick up AWS access key details from a .env file, by default located at '/config/aws_config.env', or from an alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The .env file should contain just two lines:
+
+```
+AWS_ACCESS_KEY=your-access-key
+AWS_SECRET_KEY=your-secret-key
+```
+
+The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
+
+## Advanced OCR model options
+
+The app supports advanced OCR options that combine multiple OCR engines for improved accuracy. These options are not enabled by default but can be configured by changing the app_config.env file in your '/config' folder, or by setting the equivalent system environment variables.
+
+### Available OCR models
+
+A range of local and cloud OCR models are available for text extraction. The options shown in the app depend on which environment variables are enabled in your `app_config.env` file (see [Enabling advanced OCR options](#enabling-advanced-ocr-options) below). Once enabled, the models appear under **'Change default redaction settings'** on the **'Redact PDFs/images'** tab.
+
+
+
+- **Tesseract** (`tesseract`, default, requires `SHOW_LOCAL_OCR_MODEL_OPTIONS=True`): Best for documents with clear, well-formatted text. Provides a good balance of speed and accuracy with precise word-level bounding boxes, but struggles with handwriting or noisy/scanned documents.
+- **PaddleOCR** (`paddle`, requires `SHOW_PADDLE_MODEL_OPTIONS=True`): More powerful than Tesseract and handles unclear typed text on scanned documents reasonably well, but is slower. Word-level bounding boxes are estimated from line-level output so may be less precise.
+- **VLM** (`vlm`, requires `SHOW_VLM_MODEL_OPTIONS=True`): Uses a Vision Language Model locally (recommended: Qwen 3.5, configurable via the `SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL` environment variable). Excellent for difficult handwriting and noisy documents, but significantly slower than the options above. Inference runs via the `transformers` package and can be quantised with `bitsandbytes` if `QUANTISE_VLM_MODELS=True`; for faster inference, use an [inference server](#inference-server-options) instead.
+- **Inference server** (`inference-server`, requires `SHOW_INFERENCE_SERVER_VLM_OPTIONS=True`): Uses an OpenAI-compatible API endpoint such as [llama-cpp (llama-server)](https://github.com/ggml-org/llama.cpp) or [vLLM](https://docs.vllm.ai/en/stable). Produces results comparable to the in-app VLM but is much faster, and supports GGUF or AWQ/GPTQ quantised models. Requires a separately configured server. See [below for details](#inference-server-options).
+- **AWS Bedrock VLM** (`AWS Bedrock VLM OCR - all PDF types`, requires `SHOW_BEDROCK_VLM_MODELS=True`): Cloud-based OCR using a Bedrock vision model (e.g. Qwen VL). Extracts text and optionally detects people and signatures from PDFs and images without a local GPU. Requires AWS credentials and Bedrock model access, and is subject to Bedrock quotas and pricing.
+
+#### Hybrid options
+
+Hybrid models (enabled with `SHOW_HYBRID_MODELS=True`) use PaddleOCR first to identify bounding boxes and extract text, then pass low-confidence regions to a more powerful model for correction. **Note:** In practice, results are not significantly better than using Paddle or VLM/inference server analysis alone (particularly with Qwen 3 VL), but the options are provided for comparison.
+
+- **Hybrid-paddle-vlm** (`hybrid-paddle-vlm`): Combines PaddleOCR's bounding box detection with an in-app VLM for low-confidence regions. PaddleOCR's stronger bounding box identification makes this the most practical hybrid option, provided both Paddle and the VLM model can run in the same environment.
+- **Hybrid-paddle-inference-server** (`hybrid-paddle-inference-server`): The same as above but uses an inference server instead of an in-app VLM, allowing the use of GGUF or AWQ/GPTQ quantised models via llama.cpp or vLLM. See [below for details](#inference-server-options).
+
+### Enabling advanced OCR options
+
+To enable these options, you need to change your system environment variables, or modify the app_config.env file in your '/config' folder and set the following environment variables:
+
+**Basic OCR model selection:**
+```
+SHOW_LOCAL_OCR_MODEL_OPTIONS=True
+```
+
+**To enable PaddleOCR options (paddle):**
+```
+SHOW_PADDLE_MODEL_OPTIONS=True
+```
+
+**To enable Vision Language Model options (vlm):**
+```
+SHOW_VLM_MODEL_OPTIONS=True
+```
+
+**To enable AWS Bedrock VLM OCR (cloud-based VLM text extraction):**
+```
+SHOW_BEDROCK_VLM_MODELS=True
+```
+
+**To enable Inference Server options (inference-server):**
+```
+SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
+```
+
+**To enable Hybrid OCR options (hybrid-paddle-vlm, hybrid-paddle-inference-server):**
+```
+SHOW_HYBRID_MODELS=True
+```
+
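+For illustration, a minimal `app_config.env` addition that enables PaddleOCR, the inference server option, and the hybrid routes, and makes the paddle + inference-server hybrid the default, might look like the following sketch (enable only the options you actually need):
+
+```
+SHOW_LOCAL_OCR_MODEL_OPTIONS=True
+SHOW_PADDLE_MODEL_OPTIONS=True
+SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
+SHOW_HYBRID_MODELS=True
+DEFAULT_LOCAL_OCR_MODEL=hybrid-paddle-inference-server
+```
+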
+See the [app settings documentation](https://seanpedrick-case.github.io/doc_redaction/src/app_settings.html), or below, for more details on these options. Once enabled, when running the app you will see a "Change default local OCR model" (or text extraction method) section in the redaction settings, where you can choose between the available models based on what has been enabled, including Bedrock VLM analysis when configured.
+
+### OCR configuration parameters
+
+The following parameters can be configured to fine-tune OCR behaviour. Set them in your `app_config.env` file or as system environment variables. See the [app settings documentation](https://seanpedrick-case.github.io/doc_redaction/src/app_settings.html) for a full reference.
+
+#### General OCR settings
+
+- **DEFAULT_LOCAL_OCR_MODEL** (default: `"tesseract"`): Sets the default OCR engine without requiring a UI selection. Valid values: `"tesseract"`, `"paddle"`, `"vlm"`, `"inference-server"`, `"hybrid-paddle-vlm"`, `"hybrid-paddle-inference-server"`.
+- **SHOW_LOCAL_OCR_MODEL_OPTIONS** (default: False): If enabled, users can select the local OCR model from the UI.
+- **SHOW_OCR_GUI_OPTIONS** (default: True): If enabled, OCR-related options (e.g. model selection, Paddle options) are shown in the UI.
+- **EFFICIENT_OCR** (default: False): If enabled, uses a two-step process for PDFs — tries selectable text extraction per page first and only runs OCR on pages where no text could be extracted. Saves time and cost.
+- **EFFICIENT_OCR_MIN_WORDS** (default: 20): Minimum extractable words on a page to use the text-only route when `EFFICIENT_OCR` is enabled; pages below this threshold go through OCR.
+- **EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION** (default: 0.03): When `EFFICIENT_OCR` is enabled, the minimum fraction of the page area a single embedded image must cover for that page to be routed through OCR regardless of word count. Set to `0` to rely on word count only.
+- **MERGE_BOUNDING_BOXES** (default: True): When enabled, merges nearby bounding boxes in OCR/Textract results (reconstruction, grouping, and horizontal merge).
+- **MODEL_CACHE_PATH** (default: `"./model_cache"`): Directory where OCR models are cached locally.
+- **OVERWRITE_EXISTING_OCR_RESULTS** (default: False): If enabled, always runs OCR fresh instead of loading from existing JSON result files.
+- **HF_TOKEN** (default: `''`): Hugging Face token. Required for downloading gated or private VLM models from the Hub.
+
+#### Tesseract settings
+
+- **TESSERACT_SEGMENTATION_LEVEL** (default: 11): Tesseract PSM (Page Segmentation Mode) level. Valid values are 0–13. Higher values provide more detailed segmentation but may be slower.
+- **TESSERACT_WORD_LEVEL_OCR** (default: True): If enabled, uses Tesseract word-level OCR rather than line-level.
+- **TESSERACT_MAX_WORKERS** (default: 4): Maximum number of worker threads for running Tesseract on multiple pages in parallel. Keep lower than `MAX_WORKERS` to avoid saturating CPU/RAM.
+- **TESSERACT_FOLDER** (default: `''`): Path to the local Tesseract installation folder. On Windows, install Tesseract 5.5.0 from [UB-Mannheim/tesseract](https://github.com/UB-Mannheim/tesseract/wiki) and point this variable at the folder (e.g. `tesseract/`).
+- **TESSERACT_DATA_FOLDER** (default: `"/usr/share/tessdata"`): Path to the Tesseract trained data files (`tessdata`).
+
+#### PaddleOCR settings
+
+- **SHOW_PADDLE_MODEL_OPTIONS** (default: False): If enabled, PaddleOCR options will be shown in the UI.
+- **PADDLE_USE_TEXTLINE_ORIENTATION** (default: False): If enabled, PaddleOCR will detect and correct text line orientation.
+- **PADDLE_DET_DB_UNCLIP_RATIO** (default: 1.2): Controls the expansion ratio of detected text regions. Higher values expand the detection area more.
+- **CONVERT_LINE_TO_WORD_LEVEL** (default: False): If enabled, converts PaddleOCR line-level results to approximate word-level bounding boxes for better precision.
+- **LOAD_PADDLE_AT_STARTUP** (default: False): If enabled, loads the PaddleOCR model when the application starts, reducing latency for first use but increasing startup time.
+- **PADDLE_MAX_WORKERS** (default: 2): Maximum number of worker threads for running PaddleOCR on multiple pages in parallel. Paddle is often GPU-bound; keep this low (e.g. 1–2) to avoid VRAM contention.
+- **PADDLE_MODEL_PATH** (default: `''`): Custom directory for PaddleOCR model storage, useful for environments like AWS Lambda.
+- **PADDLE_FONT_PATH** (default: `''`): Custom font path for PaddleOCR. If empty, the app uses system fonts.
+- **POPPLER_FOLDER** (default: `''`): Path to the local Poppler `bin` folder, required for PDF-to-image conversion. On Windows, install from [oschwartz10612/poppler-windows](https://github.com/oschwartz10612/poppler-windows) and point this at the `bin` folder (e.g. `poppler/poppler-24.02.0/Library/bin/`).
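+
+On Windows, for example, local Tesseract and Poppler installs might be pointed at like this in `app_config.env` (a sketch using the example paths above; adjust to your actual install locations):
+
+```
+TESSERACT_FOLDER=tesseract/
+POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/
+```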
+
+#### Image preprocessing
+
+- **PREPROCESS_LOCAL_OCR_IMAGES** (default: True): If enabled, images are preprocessed before local OCR. Testing has shown this doesn't always improve results and can slow processing — consider setting to `False` if speed is a priority.
+- **SAVE_PREPROCESS_IMAGES** (default: False): If enabled, saves the preprocessed images for debugging.
+- **SAVE_PAGE_OCR_VISUALISATIONS** (default: False): If enabled, saves images with detected bounding boxes overlaid on page images for debugging.
+- **INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES** (default: False): If enabled, includes OCR visualisation files (created when `SAVE_PAGE_OCR_VISUALISATIONS` is True) in the final output file list shown in the Gradio interface.
+
+#### Vision Language Model (VLM) settings
+
+- **SHOW_VLM_MODEL_OPTIONS** (default: False): If enabled, VLM OCR options will be shown in the UI.
+- **SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL** (default: `"Qwen3-VL-8B-Instruct"`): The local transformers VLM model to use for OCR. Available options: `"Nanonets-OCR2-3B"`, `"Dots.OCR"`, `"Qwen3-VL-2B-Instruct"`, `"Qwen3-VL-4B-Instruct"`, `"Qwen3-VL-8B-Instruct"`, `"Qwen3-VL-30B-A3B-Instruct"`, `"Qwen3-VL-235B-A22B-Instruct"`, `"PaddleOCR-VL"`. Generally the Qwen 3 VL range offers the best accuracy.
+- **OVERRIDE_VLM_REPO_ID** (default: `''`): If non-empty, overrides the Hugging Face repo ID or local path used for the selected VLM. Useful for pointing at a custom checkpoint or local folder.
+- **LOAD_TRANSFORMERS_VLM_MODEL_AT_START** (default: True): If enabled, loads the VLM weights at application startup. If disabled, weights load on the first VLM OCR call — reducing startup memory at the cost of a slower first run.
+- **QUANTISE_VLM_MODELS** (default: False): If enabled, quantises VLM models to 4-bit using `bitsandbytes` to reduce VRAM requirements.
+- **MAX_NEW_TOKENS** (default: 4096): Maximum number of tokens the VLM may generate per response.
+- **MAX_INPUT_TOKEN_LENGTH** (default: 8192): Maximum number of tokens that can be input to the VLM.
+- **VLM_MAX_IMAGE_SIZE** (default: 819200): Upper bound on total pixels (width × height) for images sent to the VLM. Larger images are downscaled while preserving aspect ratio.
+- **VLM_MIN_IMAGE_SIZE** (default: 614400): Minimum total pixels for full-page VLM OCR. Smaller pages are upscaled.
+- **VLM_HYBRID_MIN_IMAGE_SIZE** (default: 153600): Minimum total pixels for hybrid VLM line/crop passes. Smaller crops are upscaled.
+- **VLM_MIN_DPI** (default: 200.0): Minimum effective DPI after image preparation. Images below this DPI are upscaled.
+- **VLM_MAX_DPI** (default: 300.0): Maximum effective DPI after image preparation. High-DPI scans are downscaled accordingly.
+- **VLM_MAX_ASPECT_RATIO** (default: 10.0): Upper bound on image aspect ratio sent to the VLM. Very long/thin crops (e.g. hybrid line regions) are white-padded to stay within this limit.
+- **USE_FLASH_ATTENTION** (default: False): If enabled, uses flash attention for improved VLM performance. Not available on Windows.
+- **VLM_DISABLE_QWEN3_5_THINKING** (default: False): If enabled, disables the Qwen3.5 "thinking" chain for local transformers VLM calls, making responses faster by skipping the reasoning step.
+- **MAX_SPACES_GPU_RUN_TIME** (default: 60): Maximum seconds to run GPU operations on Hugging Face Spaces.
+- **ADD_VLM_BOUNDING_BOX_RULES** (default: False): If enabled, adds bounding box rules to the VLM prompt (e.g. coordinate format constraints for OCR output).
+- **REPORT_VLM_OUTPUTS_TO_GUI** (default: False): If enabled, reports VLM outputs to the GUI with info boxes as they are processed — useful for monitoring long OCR jobs.
+- **SAVE_VLM_INPUT_IMAGES** (default: False): If enabled, saves input images sent to the VLM for debugging.
+- **VLM_SEED** (default: `''`): Random seed for VLM generation. If empty, generation is non-deterministic.
+- **VLM generation parameters** (`VLM_DEFAULT_TEMPERATURE`, `VLM_DEFAULT_TOP_P`, `VLM_DEFAULT_MIN_P`, `VLM_DEFAULT_TOP_K`, `VLM_DEFAULT_REPETITION_PENALTY`, `VLM_DEFAULT_DO_SAMPLE`, `VLM_DEFAULT_PRESENCE_PENALTY`): Control sampling behaviour for VLM generation. All default to `''` (model-specific defaults are used when empty).
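+
+As an illustration, one possible lower-VRAM local VLM configuration in `app_config.env` might be the following (the model choice and quantisation shown are example values, not recommendations):
+
+```
+SHOW_VLM_MODEL_OPTIONS=True
+SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL=Qwen3-VL-4B-Instruct
+QUANTISE_VLM_MODELS=True
+LOAD_TRANSFORMERS_VLM_MODEL_AT_START=False
+```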
+
+#### Inference server settings
+
+- **SHOW_INFERENCE_SERVER_VLM_OPTIONS** (default: False): If enabled, inference server OCR options will be shown in the UI.
+- **INFERENCE_SERVER_API_URL** (default: `"http://localhost:8080"`): Base URL of the inference server API for remote VLM OCR processing.
+- **INFERENCE_SERVER_MODEL_NAME** (default: `''`): Optional model name to send in inference server API requests. If empty, the server's default model is used.
+- **DEFAULT_INFERENCE_SERVER_VLM_MODEL** (default: `"qwen_3_vl_30b_a3b_it"`): Default model name for inference server VLM OCR calls.
+- **INFERENCE_SERVER_TIMEOUT** (default: 300): Timeout in seconds for inference server API requests.
+- **INFERENCE_SERVER_DISABLE_THINKING** (default: False): If enabled, disables chain-of-thought "thinking" for inference server VLM calls (adds `{"enable_thinking": false}` to request payloads). Useful when running a Qwen3.5 reasoning model and thinking tokens are unnecessary, such as in hybrid line-crop OCR — eliminates overhead and avoids needing a large `HYBRID_OCR_MAX_NEW_TOKENS` budget.
+- **SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS** (default: False): If enabled, allows users to select from available inference server VLM models in the UI.
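+
+A minimal inference server configuration in `app_config.env` might look like this sketch (the URL and model name are examples and should match your own server):
+
+```
+SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
+INFERENCE_SERVER_API_URL=http://localhost:8080
+DEFAULT_INFERENCE_SERVER_VLM_MODEL=qwen_3_vl_30b_a3b_it
+INFERENCE_SERVER_DISABLE_THINKING=True
+```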
+
+#### Hybrid OCR settings
+
+- **SHOW_HYBRID_MODELS** (default: False): If enabled, hybrid OCR options (`hybrid-paddle-vlm`, `hybrid-paddle-inference-server`) will be shown in the UI.
+- **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 95): Confidence score below which the secondary OCR engine (VLM or inference server) will be used for re-extraction. Lower values mean more text will be re-extracted by the secondary model.
+- **HYBRID_OCR_PADDING** (default: 1): Padding in pixels added to word bounding boxes before re-extraction with the secondary engine.
+- **HYBRID_OCR_MAX_NEW_TOKENS** (default: 1024): Maximum tokens the inference server (or local VLM) may generate per hybrid line-crop OCR call. For reasoning models like Qwen3.5, thinking tokens count against this budget — increase to 2048 or higher if you see "Inference server returned no results" when using `hybrid-paddle-inference-server` with a reasoning model.
+- **HYBRID_OCR_MAX_WORDS** (default: 50): Maximum words allowed in a hybrid OCR result for a single text line. Results exceeding this are discarded as likely hallucinations, and the original OCR result is kept instead.
+- **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): If enabled, saves comparison images showing Tesseract vs. secondary engine results when using hybrid modes.
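+
+For example, a hybrid setup tuned for a reasoning model behind an inference server might look like this (the threshold and token budget are illustrative values, not defaults):
+
+```
+SHOW_HYBRID_MODELS=True
+HYBRID_OCR_CONFIDENCE_THRESHOLD=90
+HYBRID_OCR_MAX_NEW_TOKENS=2048
+```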
+
+### Inference server options
+
+If using a local inference server, I would suggest [llama.cpp](https://github.com/ggml-org/llama.cpp) or [vLLM](https://docs.vllm.ai/en/stable/), as they are much faster than the transformers/torch inference provided in the app through the 'vlm' OCR route or the 'Local transformers LLM' route. llama.cpp is more flexible than vLLM on low-VRAM systems, as it will offload to CPU/system RAM automatically rather than failing as vLLM tends to do.
+
+To help with running the Redaction app alongside llama.cpp and vLLM, I have created Docker Compose files for both. These can be found in the doc_redaction repo: use [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) for llama.cpp and [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) for vLLM. Docker / Docker Desktop should be installed, and you can then run the commands suggested at the top of each file to start the servers.
+
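+The exact commands to use are given at the top of each Compose file; assuming Docker Compose v2 is installed, they will typically look something like this:
+
+```bash
+# Start the llama.cpp inference server defined in the compose file
+docker compose -f docker-compose_llama.yml up -d
+
+# Or start the vLLM server instead
+docker compose -f docker-compose_vllm.yml up -d
+```
+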
+You will need roughly 30-50 GB of disk space to run everything, depending on the model chosen in the Compose file. The vLLM server needs 24 GB of VRAM. The llama.cpp server also needs 24 GB of VRAM to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Compose file can be adjusted to fit your system; I would suggest 8 GB of VRAM as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
+
+### Identifying people and signatures with VLMs
+
+If VLM or inference server options are enabled, you can also use the VLM to identify photos of people's faces and signatures in the document, and redact them accordingly.
+
+On the 'Redaction Settings' tab, select the CUSTOM_VLM_FACES and CUSTOM_VLM_SIGNATURE entities. When you conduct an OCR task with the VLM or inference server, it will identify the bounding boxes for photos of people's faces and signatures in the document, and redact them accordingly if a redaction option is selected.
+
+With **Efficient OCR** enabled, **CUSTOM_VLM_FACES** follows the same page rules as OCR (VLM only on OCR-classified pages); **CUSTOM_VLM_SIGNATURE** still scans all pages. Textract **Extract signatures** uses full-document Textract OCR and disables Efficient OCR.
+
+### PII identification with LLMs
+
+In addition to rule-based (Local) and AWS Comprehend PII detection, the app can use **Large Language Models (LLMs)** to identify and label personal information. This is useful for entity types that are context-dependent (e.g. job titles, organisation names) or when you want custom instructions (e.g. "do not redact the name of the university"). LLM-based PII can be run via **AWS Bedrock**, a **local transformers** model, or a **local inference server** (e.g. llama.cpp, vLLM).
+
+
+
+**Options (when enabled by your administrator):**
+
+- **LLM (AWS Bedrock)**: Uses a Bedrock model (e.g. Claude, Nova) for PII detection. Requires AWS credentials and Bedrock model access. No local GPU needed.
+- **Local transformers LLM**: Runs a Hugging Face transformers model on your machine for PII detection. Requires sufficient RAM/VRAM. The default model is Qwen 3.5 9B; other supported models include Qwen 3.5 (0.8B–122B), Gemma 3 (12B/27B), GPT-OSS 20B, Ministral 3 14B, and NVIDIA Nemotron 3 30B (see the [config.py file for the updated list](https://github.com/seanpedrick-case/doc_redaction/blob/main/tools/config.py)).
+- **Local inference server**: Sends text to an OpenAI-compatible API (e.g. llama.cpp, vLLM) for PII detection. You run the server separately; the app only calls the API. Note: this uses `INFERENCE_SERVER_API_URL`, the same URL endpoint used for VLM OCR. See [Inference server options](#inference-server-options) for installation, as calls to the Local inference server will use the same model as the OCR inference-server model by default.
+
+**Using LLM PII detection**
+
+On the **'Redact PDFs/images'** tab, under **'Redaction settings'**, choose the desired **PII identification** method. You can then:
+
+- Select which **LLM entities** to detect (e.g. NAME, EMAIL_ADDRESS, PHONE_NUMBER, ADDRESS, CUSTOM). Custom entity types can also be added directly in the dropdown box.
+- Optionally add **custom instructions** to guide the LLM — for example: `"Do not redact company names"` or `"Redact all organisation names with the label ORGANISATION"`. When custom instructions are provided, the app can optionally use a more capable model (see `CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE` below).
+
+The LLM processes text page by page. Results include entity type, text span, and confidence score, which are then used to place redaction boxes in the same way as the local or Comprehend methods.
+
+**Enabling LLM PII options**
+
+Visibility of these methods is controlled by environment variables in `app_config.env`:
+
+- **SHOW_AWS_BEDROCK_LLM_MODELS** (default: False): Show "LLM (AWS Bedrock)" in the PII identification dropdown.
+- **SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS** (default: False): Show "Local transformers LLM".
+- **SHOW_INFERENCE_SERVER_PII_OPTIONS** (default: False): Show "Local inference server".
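+
+For example, to expose only the Bedrock-based option and make it the default method, an `app_config.env` addition might look like this (the model and method variables are described in the next section; the values shown are the documented defaults):
+
+```
+SHOW_AWS_BEDROCK_LLM_MODELS=True
+CHOSEN_LLM_PII_INFERENCE_METHOD=aws-bedrock
+CLOUD_LLM_PII_MODEL_CHOICE=amazon.nova-pro-v1:0
+```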
+
+#### LLM PII configuration variables
+
+##### Model selection
+
+- **CLOUD_LLM_PII_MODEL_CHOICE** (default: `"amazon.nova-pro-v1:0"`): Default Bedrock (or cloud) model ID for LLM-based PII detection.
+- **CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE** (default: `"amazon.nova-pro-v1:0"`): If non-empty, overrides `CLOUD_LLM_PII_MODEL_CHOICE` when custom instructions are provided. Leave empty to always use `CLOUD_LLM_PII_MODEL_CHOICE`.
+- **LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE** (default: `"Qwen 3.5 9B"`): The local transformers model for PII detection. Supported values: `"Qwen 3.5 0.8B"`, `"Qwen 3.5 2B"`, `"Qwen 3.5 4B"`, `"Qwen 3.5 9B"`, `"Qwen 3.5 27B"`, `"Qwen 3.5 35B-A3B"`, `"Qwen 3.5 122B-A10B"`, `"Gemma 3 12B"`, `"Gemma 3 27B"`, `"GPT-OSS 20B"`, `"Ministral 3 14B Instruct"`, `"NVIDIA Nemotron 3 Nano 30B A3B NVFP4"`.
+- **USE_TRANSFORMERS_VLM_MODEL_AS_LLM** (default: False): If enabled, reuses the already-loaded VLM model (`SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL`) for LLM PII detection instead of downloading a separate model. The VLM weights must already be loaded (e.g. from a prior VLM OCR run, or `LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True`).
+- **LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START** (default: False): If enabled, loads the local transformers PII model at application startup to reduce latency on first use.
+- **CHOSEN_LLM_PII_INFERENCE_METHOD** (default: `"aws-bedrock"`): Sets the default inference method programmatically. Options: `"aws-bedrock"`, `"local"`, `"inference-server"`, `"azure-openai"`, `"gemini"`.
+- **DEFAULT_INFERENCE_SERVER_PII_MODEL** (default: `"gemma_3_12b"`): Default model name for inference-server LLM PII detection calls.
+- **INFERENCE_SERVER_API_URL** (default: `"http://localhost:8080"`): Base URL for the inference server used for LLM PII detection. This is the same variable (and therefore the same endpoint) used for VLM OCR, so by default PII detection and VLM OCR calls go to the same server.
+- **DEFAULT_PII_DETECTION_MODEL**: Default PII detection method shown selected in the UI on startup. Automatically defaults to AWS Comprehend if `SHOW_AWS_PII_DETECTION_OPTIONS` is enabled, otherwise defaults to the local model.
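+
+Similarly, a sketch of a local transformers PII setup might look like the following (quoting of values containing spaces may be needed depending on how your environment loads .env files):
+
+```
+SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS=True
+CHOSEN_LLM_PII_INFERENCE_METHOD=local
+LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE=Qwen 3.5 9B
+LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START=True
+```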
+
+##### Entity configuration
+
+- **CHOSEN_LLM_ENTITIES**: Default entity types pre-selected for LLM detection (e.g. `EMAIL_ADDRESS`, `ADDRESS`, `NAME`, `PHONE_NUMBER`, `CUSTOM`). Configurable as a comma-separated list.
+- **FULL_LLM_ENTITY_LIST**: Full list of entity types available in the LLM entity selection dropdown.
+
+##### Generation parameters
+
+- **LLM_TEMPERATURE** (default: 0.1): Sampling temperature for LLM generation. Lower values produce more deterministic output — recommended for PII detection.
+- **LLM_MAX_NEW_TOKENS** (default: 8192): Maximum tokens the LLM may generate per PII detection call.
+- **LLM_SEED** (default: 42): Random seed for reproducible results.
+- **LLM_CONTEXT_LENGTH** (default: 32768): Maximum context length for the local transformers LLM.
+- **REASONING_SUFFIX**: Suffix appended to prompts for reasoning-capable models to control chain-of-thought behaviour. Examples: `"/nothink"` for Qwen 3.5 (disables thinking), `"Reasoning: low"` for GPT-OSS 20B, `""` for models without reasoning modes. Set automatically based on `LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE`.
+- **MULTIMODAL_PROMPT_FORMAT** (default: False): If enabled, uses a multimodal prompt format. Auto-set to True for some model choices (e.g. Gemma 3 12B/27B).
+- **PRINT_TRANSFORMERS_USER_PROMPT** (default: False): If enabled, prints the system and user prompts to the console when calling the local transformers LLM. Useful for debugging; avoid in production if logs are sensitive.
+- **Additional LLM generation parameters**: `LLM_TOP_K` (default: 64), `LLM_MIN_P` (default: 0), `LLM_TOP_P` (default: 0.95), `LLM_REPETITION_PENALTY` (default: 1.0), `LLM_STREAM` (default: True), `LLM_RESET` (default: False), `SPECULATIVE_DECODING` (default: False), `ASSISTANT_MODEL` (empty), `LLM_MODEL_DTYPE` (default: `"bfloat16"`).
+
+## Command Line Interface (CLI)
+
+The app includes a comprehensive command-line interface (`cli_redact.py`) that allows you to perform redaction, deduplication, AWS Textract batch operations, and document summarisation directly from the terminal. This is particularly useful for batch processing, automation, and integration with other systems.
+
+### Getting started with the CLI
+
+To use the CLI, you need to:
+
+1. Open a terminal window
+2. Navigate to the app folder containing `cli_redact.py`
+3. Activate your virtual environment (conda or venv)
+4. Run commands using `python cli_redact.py` followed by your options
+
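+For example, on Linux/macOS this might look like the following (the checkout folder and environment names are placeholders for your own setup):
+
+```bash
+cd doc_redaction              # folder containing cli_redact.py (placeholder path)
+source .venv/bin/activate     # or: conda activate your-env-name
+python cli_redact.py --help   # list all available options
+```
+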
+### Basic CLI syntax
+
+```bash
+python cli_redact.py --task [redact|deduplicate|textract|summarise] --input_file [file_path] [additional_options]
+```
+
+Default task is `redact` if `--task` is omitted.
+
+### Redaction examples
+
+**Basic PDF redaction with default settings:**
+```bash
+python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+```
+
+**Extract text only (no PII detection) while redacting specified whole pages:**
+```bash
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None
+```
+
+**Redact with custom entities and allow list:**
+```bash
+python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME
+```
+
+**Redact with fuzzy matching and custom deny list:**
+```bash
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --fuzzy_mistakes 3
+```
+
+**Redact with AWS services:**
+```bash
+python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend"
+```
+
+**Redact specific pages with signature extraction:**
+```bash
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
+```
+
+**Redact with LLM PII (entity subset and custom instructions):**
+When your deployment uses an LLM-based PII method (e.g. via config/defaults), you can pass LLM entities and instructions:
+```bash
+python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --llm_redact_entities NAME EMAIL_ADDRESS PHONE_NUMBER ADDRESS CUSTOM --custom_llm_instructions "Do not redact the name of the university."
+```
+
+### Tabular data redaction
+
+**Anonymize CSV file with specific columns:**
+```bash
+python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted
+```
+
+**Anonymize Excel file:**
+```bash
+python cli_redact.py --input_file example_data/combined_case_notes.xlsx --text_columns "Case Note" "Client" --excel_sheets combined_case_notes --anon_strategy redact
+```
+
+**Anonymize Word document:**
+```bash
+python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted
+```
+
+### Duplicate detection
+
+**Find duplicate pages in OCR files:**
+```bash
+python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
+```
+
+**Find duplicates at line level:**
+```bash
+python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3
+```
+
+**Find duplicate rows in tabular data:**
+```bash
+python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95
+```
+
+### AWS Textract operations
+
+**Submit document for analysis:**
+```bash
+python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+```
+
+**Submit with signature extraction:**
+```bash
+python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures
+```
+
+**Retrieve results by job ID:**
+```bash
+python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012
+```
+
+**List recent jobs:**
+```bash
+python cli_redact.py --task textract --textract_action list
+```
+
+### Document summarisation
+
+**Summarise OCR output CSV(s) with AWS Bedrock:**
+```bash
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "LLM (AWS Bedrock)"
+```
+
+**Summarise with local LLM and detailed format:**
+```bash
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "Local transformers LLM" --summarisation_format detailed
+```
+
+**Summarise with context and extra instructions (concise format):**
+```bash
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_context "This is a partnership agreement" --summarisation_additional_instructions "Focus on key obligations and termination clauses" --summarisation_format concise
+```
+
+**Summarise multiple OCR CSV files:**
+```bash
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv --summarisation_inference_method "LLM (AWS Bedrock)"
+```
+
+### Common CLI options
+
+#### General options
+
+- `--task`: Task to perform: "redact", "deduplicate", "textract", or "summarise" (default: redact)
+- `--input_file`: Path to input file(s); multiple files separated by spaces
+- `--output_dir`: Directory for output files (default: output/)
+- `--input_dir`: Directory for input files (default: input/)
+- `--language`: Language of document content (e.g. "en", "es", "fr")
+- `--username`: Username for session tracking
+- `--save_to_user_folders`: Save outputs under username-based subfolders (True/False)
+- `--allow_list`: Path to CSV of terms to exclude from redaction (default from config)
+- `--pii_detector`: PII detection method: "Local", "AWS Comprehend", or "None"
+- `--local_redact_entities`: Local entities to redact (space-separated list)
+- `--aws_redact_entities`: AWS Comprehend entities to redact (space-separated list)
+- `--aws_access_key` / `--aws_secret_key`: AWS credentials for cloud services
+- `--aws_region`: AWS region for cloud services
+- `--s3_bucket`: S3 bucket name for cloud operations
+- `--cost_code`: Cost code for tracking usage
+- `--save_outputs_to_s3`: Upload output files to S3 after processing (True/False)
+- `--s3_outputs_folder`: S3 key prefix for output files
+- `--s3_outputs_bucket`: S3 bucket for outputs (defaults to --s3_bucket if not set)
+- `--do_initial_clean`: Perform initial text cleaning for tabular data (True/False)
+- `--save_logs_to_csv`: Save processing logs to CSV (True/False)
+- `--save_logs_to_dynamodb`: Save processing logs to DynamoDB (True/False)
+- `--display_file_names_in_logs`: Include file names in log output (True/False)
+- `--upload_logs_to_s3`: Upload log files to S3 after processing (True/False)
+- `--s3_logs_prefix`: S3 prefix for usage log files
+- `--feedback_logs_folder`: Directory for feedback log files
+- `--access_logs_folder`: Directory for access log files
+- `--usage_logs_folder`: Directory for usage log files
+- `--paddle_model_path`: Directory for PaddleOCR model storage
+- `--spacy_model_path`: Directory for spaCy model storage
+
+#### PDF/Image redaction options
+
+- `--ocr_method`: Text extraction method: "AWS Textract", "Local OCR", or "Local text"
+- `--chosen_local_ocr_model`: Local OCR model (e.g. "tesseract", "paddle", "vlm", "inference-server", "hybrid-paddle-vlm", "hybrid-paddle-inference-server")
+- `--page_min` / `--page_max`: Page range to process (0 for page_max means all pages)
+- `--images_dpi`: DPI for image processing (default: 300.0)
+- `--preprocess_local_ocr_images`: Preprocess images before OCR (True/False)
+- `--compress_redacted_pdf`: Compress the final redacted PDF (True/False)
+- `--return_pdf_end_of_redaction`: Return PDF at end of redaction (True/False)
+- `--allow_list_file` / `--deny_list_file`: Paths to custom allow/deny list CSV files
+- `--redact_whole_page_file`: Path to CSV listing pages to redact completely
+- `--handwrite_signature_extraction`: Textract options ("Extract handwriting", "Extract signatures")
+- `--extract_forms`: Extract forms during Textract analysis (flag)
+- `--extract_tables`: Extract tables during Textract analysis (flag)
+- `--extract_layout`: Extract layout during Textract analysis (flag)
+- `--vlm_model_choice`: VLM model for OCR (e.g. Bedrock model ID when using cloud VLM)
+- `--inference_server_vlm_model`: Inference server VLM model name for OCR
+- `--inference_server_api_url`: Inference server API URL for VLM OCR
+- `--gemini_api_key`: Google Gemini API key for VLM OCR
+- `--azure_openai_api_key`: Azure OpenAI API key for VLM OCR
+- `--azure_openai_endpoint`: Azure OpenAI endpoint URL for VLM OCR
+- `--efficient_ocr`: Use efficient OCR: try selectable text first per page, run OCR only when needed (flag)
+- `--no_efficient_ocr`: Disable efficient OCR (flag)
+- `--efficient_ocr_min_words`: Min words on page to use text-only route; below this use OCR (default from config)
+
+#### LLM PII detection options
+
+Used when the PII method is LLM-based (e.g. via config/defaults). Model used depends on inference method.
+
+- `--llm_model_choice`: LLM model for PII (e.g. Bedrock model ID); defaults to config CLOUD_LLM_PII_MODEL_CHOICE for Bedrock
+- `--llm_inference_method`: "aws-bedrock", "local", "inference-server", "azure-openai", or "gemini"
+- `--inference_server_pii_model`: Inference server PII detection model name
+- `--llm_temperature`: Temperature for LLM PII (lower = more deterministic)
+- `--llm_max_tokens`: Max tokens in LLM response for PII detection
+- `--llm_redact_entities`: LLM entities to detect (space-separated, e.g. NAME, EMAIL_ADDRESS, PHONE_NUMBER, ADDRESS, CUSTOM)
+- `--custom_llm_instructions`: Custom instructions for LLM entity detection (e.g. "Do not redact company names")
+
+#### Tabular/Word anonymization options
+
+- `--anon_strategy`: One of "redact", "redact completely", "replace_redacted", "entity_type", "encrypt", "hash", "replace with 'REDACTED'", "replace with ", "mask", "fake_first_name"
+- `--text_columns`: Column names to anonymise or use for deduplication (space-separated)
+- `--excel_sheets`: Excel sheet names to process (space-separated)
+- `--fuzzy_mistakes`: Allowed spelling mistakes for fuzzy matching (default: 0)
+- `--match_fuzzy_whole_phrase_bool`: Match fuzzy whole phrase (True/False)
+- `--do_initial_clean`: Initial text cleaning for tabular data (True/False)
+
+#### Duplicate detection options
+
+- `--duplicate_type`: Type of duplicate detection ("pages" for OCR files or "tabular" for CSV/Excel)
+- `--similarity_threshold`: Similarity threshold (0-1) to consider content as duplicates (default: 0.95)
+- `--min_word_count`: Minimum word count for text to be considered (default: 10)
+- `--min_consecutive_pages`: Minimum number of consecutive pages to consider as a match (default: 1)
+- `--greedy_match`: Use greedy matching strategy for consecutive pages (True/False)
+- `--combine_pages`: Combine text from same page number within a file (True/False)
+- `--remove_duplicate_rows`: Remove duplicate rows from output (True/False)
+
+#### Document summarisation options
+
+- `--summarisation_inference_method`: "LLM (AWS Bedrock)", "Local transformers LLM", or "Local inference server"
+- `--summarisation_temperature`: Temperature for summarisation (0.0–2.0; default: 0.6)
+- `--summarisation_max_pages_per_group`: Max pages per page-group summary (default: 30)
+- `--summarisation_api_key`: API key if required by the chosen LLM
+- `--summarisation_context`: Additional context (e.g. "This is a consultation response document")
+- `--summarisation_format`: "concise" (key themes) or "detailed" (default)
+- `--summarisation_additional_instructions`: Extra instructions (e.g. "Focus on key decisions and recommendations")
+
+#### Textract batch operations options
+
+- `--textract_action`: "submit", "retrieve", or "list"
+- `--job_id`: Textract job ID for retrieve action
+- `--extract_signatures`: Extract signatures during Textract analysis (flag)
+- `--textract_bucket`: S3 bucket for Textract operations
+- `--textract_input_prefix`: S3 prefix for input files in Textract operations
+- `--textract_output_prefix`: S3 prefix for output files in Textract operations
+- `--s3_textract_document_logs_subfolder`: S3 prefix for Textract job logs
+- `--local_textract_document_logs_subfolder`: Local path for Textract job logs
+- `--poll_interval`: Polling interval in seconds for job status (default: 30)
+- `--max_poll_attempts`: Max polling attempts before timeout (default: 120)
+
+### Output files
+
+The CLI generates the same output files as the GUI:
+- `...redacted.pdf`: Final redacted document
+- `...redactions_for_review.pdf`: Document with redaction boxes for review
+- `...review_file.csv`: Detailed redaction information
+- `...ocr_results.csv`: Extracted text results
+- `..._textract.json`: AWS Textract results (if applicable)
+
+For more advanced options and configuration, refer to the help text by running:
+```bash
+python cli_redact.py --help
+```
\ No newline at end of file
diff --git a/_quarto.yml b/_quarto.yml
new file mode 100644
index 0000000000000000000000000000000000000000..5262f1997fe348465816815d893ba856ee1df21d
--- /dev/null
+++ b/_quarto.yml
@@ -0,0 +1,33 @@
+project:
+ type: website
+ output-dir: docs
+ render:
+ - "*.qmd"
+
+website:
+ title: "Document Redaction App"
+ page-navigation: true
+ back-to-top-navigation: true
+ search: true
+ google-analytics: G-9JNEKNN14K
+ navbar:
+ left:
+ - href: index.qmd
+ text: Home
+ - href: src/user_guide.qmd
+ text: User guide
+ - href: src/faq.qmd
+ text: User FAQ
+ - href: src/installation_guide.qmd
+ text: App installation guide (with CDK)
+ - href: src/app_settings.qmd
+ text: App settings management guide
+ - href: src/redaction_with_vlm_and_llms.qmd
+ text: Redaction with local VLM and LLMs (Qwen 3)
+ - href: src/ocr_and_redaction_with_qwen35.qmd
+ text: OCR and redaction with Qwen 3.5 (Mar 2026)
+
+format:
+ html:
+ theme: cosmo
+ css: styles.css
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..46be2fe86134b4cb7590f95adce1aa44910a8bce
--- /dev/null
+++ b/app.py
@@ -0,0 +1,9984 @@
+import os
+from pathlib import Path
+
+import gradio as gr
+import pandas as pd
+import spaces
+from fastapi import FastAPI, status
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.middleware.trustedhost import TrustedHostMiddleware
+from gradio_image_annotation import image_annotator
+
+from tools.auth import authenticate_user
+from tools.aws_functions import (
+ download_file_from_s3,
+ export_outputs_to_s3,
+ upload_log_file_to_s3,
+)
+from tools.config import (
+ ACCESS_LOG_DYNAMODB_TABLE_NAME,
+ ACCESS_LOGS_FOLDER,
+ ALLOW_LIST_PATH,
+ ALLOWED_HOSTS,
+ ALLOWED_ORIGINS,
+ AWS_ACCESS_KEY,
+ AWS_LLM_PII_OPTION,
+ AWS_PII_OPTION,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ AZURE_OPENAI_API_KEY,
+ AZURE_OPENAI_INFERENCE_ENDPOINT,
+ BEDROCK_VLM_TEXT_EXTRACT_OPTION,
+ CHOSEN_COMPREHEND_ENTITIES,
+ CHOSEN_LLM_ENTITIES,
+ CHOSEN_LLM_PII_INFERENCE_METHOD,
+ CHOSEN_LOCAL_MODEL_INTRO_TEXT,
+ CHOSEN_REDACT_ENTITIES,
+ CLOUD_LLM_PII_MODEL_CHOICE,
+ CLOUD_VLM_MODEL_CHOICE,
+ COGNITO_AUTH,
+ CONFIG_FOLDER,
+ COST_CODES_PATH,
+ CSV_ACCESS_LOG_HEADERS,
+ CSV_FEEDBACK_LOG_HEADERS,
+ CSV_USAGE_LOG_HEADERS,
+ CUSTOM_BOX_COLOUR,
+ DEFAULT_CONCURRENCY_LIMIT,
+ DEFAULT_COST_CODE,
+ DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
+ DEFAULT_EXCEL_SHEETS,
+ DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
+ DEFAULT_INFERENCE_SERVER_PII_MODEL,
+ DEFAULT_INFERENCE_SERVER_VLM_MODEL,
+ DEFAULT_LANGUAGE,
+ DEFAULT_LANGUAGE_FULL_NAME,
+ DEFAULT_LOCAL_OCR_MODEL,
+ DEFAULT_MIN_CONSECUTIVE_PAGES,
+ DEFAULT_MIN_WORD_COUNT,
+ DEFAULT_PAGE_MAX,
+ DEFAULT_PAGE_MIN,
+ DEFAULT_PII_DETECTION_MODEL,
+ DEFAULT_SEARCH_QUERY,
+ DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
+ DEFAULT_TEXT_COLUMNS,
+ DEFAULT_TEXT_EXTRACTION_MODEL,
+ DENY_LIST_PATH,
+ DIRECT_MODE_ANON_STRATEGY,
+ DIRECT_MODE_CHOSEN_LOCAL_OCR_MODEL,
+ DIRECT_MODE_COMBINE_PAGES,
+ DIRECT_MODE_COMPRESS_REDACTED_PDF,
+ DIRECT_MODE_DEFAULT_USER,
+ DIRECT_MODE_DUPLICATE_TYPE,
+ DIRECT_MODE_EXTRACT_FORMS,
+ DIRECT_MODE_EXTRACT_LAYOUT,
+ DIRECT_MODE_EXTRACT_SIGNATURES,
+ DIRECT_MODE_EXTRACT_TABLES,
+ DIRECT_MODE_FUZZY_MISTAKES,
+ DIRECT_MODE_GREEDY_MATCH,
+ DIRECT_MODE_IMAGES_DPI,
+ DIRECT_MODE_INPUT_FILE,
+ DIRECT_MODE_JOB_ID,
+ DIRECT_MODE_LANGUAGE,
+ DIRECT_MODE_MATCH_FUZZY_WHOLE_PHRASE_BOOL,
+ DIRECT_MODE_MIN_CONSECUTIVE_PAGES,
+ DIRECT_MODE_MIN_WORD_COUNT,
+ DIRECT_MODE_OCR_FIRST_PASS_MAX_WORKERS,
+ DIRECT_MODE_OCR_METHOD,
+ DIRECT_MODE_OUTPUT_DIR,
+ DIRECT_MODE_PAGE_MAX,
+ DIRECT_MODE_PAGE_MIN,
+ DIRECT_MODE_PII_DETECTOR,
+ DIRECT_MODE_PREPROCESS_LOCAL_OCR_IMAGES,
+ DIRECT_MODE_REMOVE_DUPLICATE_ROWS,
+ DIRECT_MODE_RETURN_PDF_END_OF_REDACTION,
+ DIRECT_MODE_SIMILARITY_THRESHOLD,
+ DIRECT_MODE_SUMMARY_PAGE_GROUP_MAX_WORKERS,
+ DIRECT_MODE_TASK,
+ DIRECT_MODE_TEXTRACT_ACTION,
+ DISPLAY_FILE_NAMES_IN_LOGS,
+ DO_INITIAL_TABULAR_DATA_CLEAN,
+ DOCUMENT_REDACTION_BUCKET,
+ DYNAMODB_ACCESS_LOG_HEADERS,
+ DYNAMODB_FEEDBACK_LOG_HEADERS,
+ DYNAMODB_USAGE_LOG_HEADERS,
+ EFFICIENT_OCR,
+ EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION,
+ EFFICIENT_OCR_MIN_WORDS,
+ ENFORCE_COST_CODES,
+ EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
+ FASTAPI_ROOT_PATH,
+ FAVICON_PATH,
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ FEEDBACK_LOG_FILE_NAME,
+ FEEDBACK_LOGS_FOLDER,
+ FILE_INPUT_HEIGHT,
+ FULL_COMPREHEND_ENTITY_LIST,
+ FULL_ENTITY_LIST,
+ FULL_LLM_ENTITY_LIST,
+ GEMINI_API_KEY,
+ GET_COST_CODES,
+ GET_DEFAULT_ALLOW_LIST,
+ GRADIO_SERVER_NAME,
+ GRADIO_SERVER_PORT,
+ GRADIO_TEMP_DIR,
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS,
+ HOST_NAME,
+ HYBRID_TEXTRACT_BEDROCK_VLM,
+ INFERENCE_SERVER_API_URL,
+ INFERENCE_SERVER_PII_OPTION,
+ INPUT_FOLDER,
+ INTRO_TEXT,
+ LANGUAGE_CHOICES,
+ LLM_MAX_NEW_TOKENS,
+ LLM_TEMPERATURE,
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+ LOCAL_OCR_MODEL_OPTIONS,
+ LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+ LOCAL_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ LOG_FILE_NAME,
+ MAPPED_LANGUAGE_CHOICES,
+ MAX_FILE_SIZE,
+ MAX_OPEN_TEXT_CHARACTERS,
+ MAX_QUEUE_SIZE,
+ MPLCONFIGDIR,
+ NO_REDACTION_PII_OPTION,
+ OUTPUT_COST_CODES_PATH,
+ OUTPUT_FOLDER,
+ OVERWRITE_EXISTING_OCR_RESULTS,
+ PADDLE_MODEL_PATH,
+ PII_DETECTION_MODELS,
+ REMOVE_DUPLICATE_ROWS,
+ ROOT_PATH,
+ RUN_ALL_EXAMPLES_THROUGH_AWS,
+ RUN_AWS_FUNCTIONS,
+ RUN_DIRECT_MODE,
+ RUN_FASTAPI,
+ RUN_MCP_SERVER,
+ S3_ACCESS_LOGS_FOLDER,
+ S3_ALLOW_LIST_PATH,
+ S3_COST_CODES_PATH,
+ S3_FEEDBACK_LOGS_FOLDER,
+ S3_OUTPUTS_FOLDER,
+ S3_USAGE_LOGS_FOLDER,
+ SAVE_LOGS_TO_CSV,
+ SAVE_LOGS_TO_DYNAMODB,
+ SAVE_OUTPUTS_TO_S3,
+ SAVE_PAGE_OCR_VISUALISATIONS,
+ SESSION_OUTPUT_FOLDER,
+ SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER,
+ SHOW_AWS_API_KEYS,
+ SHOW_AWS_EXAMPLES,
+ SHOW_AWS_PII_DETECTION_OPTIONS,
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS,
+ SHOW_COSTS,
+ SHOW_DIFFICULT_OCR_EXAMPLES,
+ SHOW_EXAMPLES,
+ SHOW_HYBRID_TEXTRACT_BEDROCK_CHECKBOX,
+ SHOW_INFERENCE_SERVER_PII_OPTIONS,
+ SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS,
+ SHOW_LANGUAGE_SELECTION,
+ SHOW_LOCAL_OCR_MODEL_OPTIONS,
+ SHOW_OCR_GUI_OPTIONS,
+ SHOW_PII_IDENTIFICATION_OPTIONS,
+ SHOW_QUICKSTART,
+ SHOW_SUMMARISATION,
+ SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS,
+ SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS,
+ SPACY_MODEL_PATH,
+ TABULAR_PII_DETECTION_MODELS,
+ TEXT_EXTRACTION_MODELS,
+ TEXTRACT_JOBS_LOCAL_LOC,
+ TEXTRACT_JOBS_S3_INPUT_LOC,
+ TEXTRACT_JOBS_S3_LOC,
+ TEXTRACT_TEXT_EXTRACT_OPTION,
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
+ USAGE_LOG_FILE_NAME,
+ USAGE_LOGS_FOLDER,
+ USE_GREEDY_DUPLICATE_DETECTION,
+ WHOLE_PAGE_REDACTION_LIST_PATH,
+)
+from tools.custom_csvlogger import CSVLogger_custom
+from tools.data_anonymise import anonymise_files_with_open_text
+from tools.file_conversion import (
+ combine_review_pdf_files,
+ get_document_file_names,
+ get_input_file_names,
+ is_pdf,
+ prepare_image_or_pdf,
+ prepare_image_or_pdf_with_efficient_ocr,
+)
+from tools.file_redaction import choose_and_run_redactor
+from tools.find_duplicate_pages import (
+ apply_whole_page_redactions_from_list,
+ create_annotation_objects_from_duplicates,
+ exclude_match,
+ handle_selection_and_preview,
+ run_duplicate_analysis,
+ run_search_with_regex_option,
+)
+from tools.find_duplicate_tabular import (
+ clean_tabular_duplicates,
+ handle_tabular_row_selection,
+ run_tabular_duplicate_detection,
+)
+from tools.helper_functions import (
+ _file_name_from_pdf_path,
+ all_outputs_file_download_fn,
+ apply_session_default_cost_code,
+ auto_set_local_ocr_for_bedrock_vlm,
+ calculate_aws_costs,
+ calculate_time_taken,
+ change_tab_to_review_redactions,
+ change_tab_to_tabular_or_document_redactions,
+ check_duplicate_pages_checkbox,
+ check_for_existing_textract_file,
+ check_for_relevant_ocr_output_with_words,
+ custom_regex_load,
+ enforce_cost_codes,
+ ensure_folder_exists,
+ get_connection_params,
+ lifespan,
+ load_all_output_files,
+ load_in_default_allow_list,
+ load_in_default_cost_codes,
+ merge_csv_files,
+ put_columns_in_df,
+ reset_aws_call_vars,
+ reset_base_dataframe,
+ reset_data_vars,
+ reset_ocr_base_dataframe,
+ reset_ocr_with_words_base_dataframe,
+ reset_review_vars,
+ reset_state_vars,
+ reveal_feedback_buttons,
+ save_default_cost_code_for_session,
+ show_duplicate_info_box_on_click,
+ show_info_box_on_click,
+ show_info_box_on_click_ocr_examples,
+ show_tabular_info_box_on_click,
+ update_cost_code_dataframe_from_dropdown_select,
+ update_language_dropdown,
+)
+from tools.load_spacy_model_custom_recognisers import custom_entities
+from tools.quickstart import (
+ handle_main_pii_method_selection,
+ handle_main_redaction_method_selection,
+ handle_main_text_extract_method_selection,
+ handle_pii_method_selection,
+ handle_pii_method_selection_tabular,
+ handle_redaction_method_selection,
+ handle_step_2_next,
+ handle_step_3_next,
+ handle_text_extract_method_selection,
+ route_walkthrough_files,
+ update_step_2_on_data_file_upload,
+ update_step_3_tabular_visibility,
+ update_step_4_visibility,
+)
+from tools.redaction_review import (
+ apply_redactions_to_review_df_and_files,
+ convert_df_to_xfdf,
+ convert_xfdf_to_dataframe,
+ create_annotation_objects_from_filtered_ocr_results_with_words,
+ decrease_page,
+ df_select_callback_cost,
+ df_select_callback_dataframe_row,
+ df_select_callback_dataframe_row_ocr_with_words,
+ df_select_callback_ocr,
+ df_select_callback_textract_api,
+ exclude_selected_items_from_redaction,
+ get_all_rows_with_same_text,
+ get_all_rows_with_same_text_redact,
+ get_and_merge_current_page_annotations,
+ increase_bottom_page_count_based_on_top,
+ increase_page,
+ reset_dropdowns,
+ undo_last_removal,
+ update_all_entity_df_dropdowns,
+ update_all_page_annotation_object_based_on_previous_page,
+ update_annotator_object_and_filter_df,
+ update_annotator_page_from_review_df,
+ update_entities_df_page,
+ update_entities_df_recogniser_entities,
+ update_entities_df_text,
+ update_other_annotator_number_from_current,
+ update_redact_choice_df_from_page_dropdown,
+ update_selected_review_df_row_colour,
+)
+from tools.summaries import (
+ _summarisation_upload_to_paths,
+ _upload_contains_pdf,
+ concise_summary_format_prompt,
+ detailed_summary_format_prompt,
+ summarise_document_wrapper,
+)
+from tools.textract_batch_call import (
+ analyse_document_with_textract_api,
+ check_for_provided_job_id,
+ check_textract_outputs_exist,
+ load_in_textract_job_details,
+ poll_whole_document_textract_analysis_progress_and_download,
+ replace_existing_pdf_input_for_whole_document_outputs,
+)
+
+# Ensure that output folders exist
+ensure_folder_exists(CONFIG_FOLDER)
+ensure_folder_exists(OUTPUT_FOLDER)
+ensure_folder_exists(INPUT_FOLDER)
+if GRADIO_TEMP_DIR:
+ ensure_folder_exists(GRADIO_TEMP_DIR)
+if MPLCONFIGDIR:
+ ensure_folder_exists(MPLCONFIGDIR)
+
+ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
+ensure_folder_exists(ACCESS_LOGS_FOLDER)
+ensure_folder_exists(USAGE_LOGS_FOLDER)
+
+# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
+CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
+FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
+FULL_LLM_ENTITY_LIST.extend(custom_entities)
+
+
+###
+# Load in FastAPI app
+###
+
+
+# 3. Initialize the App with the lifespan parameter
+# Clean the ROOT_PATH for FastAPI
+# Ensure it starts with / and has no trailing /
+CLEAN_ROOT = f"/{FASTAPI_ROOT_PATH.strip('/')}" if FASTAPI_ROOT_PATH.strip("/") else ""
+app = FastAPI(lifespan=lifespan, root_path=CLEAN_ROOT)
+
+# Added to pass lint check, no effect
+if 0 == 1:
+ print(f"spaces.__name__: {spaces.__name__}")
+
+###
+# Load in Gradio app components
+###
+
+# Check which example files exist and create examples only for available files
+example_files = [
+ "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
+ "example_data/example_complaint_letter.jpg",
+ "example_data/graduate-job-example-cover-letter.pdf",
+ "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
+ "example_data/partnership_toolkit_redact_custom_deny_list.csv",
+ "example_data/partnership_toolkit_redact_some_pages.csv",
+]
+
+ocr_example_files = [
+ "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
+ "example_data/Difficult handwritten note.jpg",
+ "example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf",
+]
+
+# Load some components outside of blocks context that are used for examples
+
+# Components for "Redact all PII" option (conditionally visible)
+# Set initial visibility based on default redaction method ("Redact all PII")
+initial_show_pii_method = SHOW_PII_IDENTIFICATION_OPTIONS # Default is "Redact all PII"
+default_pii_method = DEFAULT_PII_DETECTION_MODEL
+initial_show_local_entities = initial_show_pii_method and (
+ default_pii_method == LOCAL_PII_OPTION
+)
+initial_show_comprehend_entities = initial_show_pii_method and (
+ default_pii_method == AWS_PII_OPTION
+)
+initial_is_llm_method = initial_show_pii_method and (
+ default_pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ or default_pii_method == INFERENCE_SERVER_PII_OPTION
+ or default_pii_method == AWS_LLM_PII_OPTION
+)
+
+## Walkthrough / quickstart components
+walkthrough_file_input = gr.File(
+ label="Choose a PDF document, image file (PDF, JPG, PNG), tabular data file (Excel, CSV, Parquet), or Word document (DOCX)",
+ file_count="multiple",
+ file_types=[
+ ".pdf",
+ ".jpg",
+ ".png",
+ ".json",
+ ".zip",
+ ".xlsx",
+ ".xls",
+ ".csv",
+ ".parquet",
+ ".docx",
+ ],
+ height=FILE_INPUT_HEIGHT,
+)
+
+walkthrough_in_redact_entities = gr.Dropdown(
+ value=CHOSEN_REDACT_ENTITIES,
+ choices=FULL_ENTITY_LIST,
+ multiselect=True,
+ label="Local PII identification model (click empty space in box for full list)",
+ visible=initial_show_local_entities,
+ allow_custom_value=True,
+)
+
+walkthrough_in_redact_comprehend_entities = gr.Dropdown(
+ value=CHOSEN_COMPREHEND_ENTITIES,
+ choices=FULL_COMPREHEND_ENTITY_LIST,
+ multiselect=True,
+ label="AWS Comprehend PII identification model (click empty space in box for full list)",
+ visible=initial_show_comprehend_entities,
+ allow_custom_value=True,
+)
+
+# Set initial visibility for local OCR and AWS Textract based on default text extraction method
+initial_local_ocr_visible = (
+ DEFAULT_TEXT_EXTRACTION_MODEL == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+)
+initial_aws_textract_visible = (
+ DEFAULT_TEXT_EXTRACTION_MODEL == TEXTRACT_TEXT_EXTRACT_OPTION
+)
+
+text_extract_method_radio_message = """Choose text extraction method"""
+
+
+walkthrough_text_extract_method_radio = gr.Radio(
+ label=text_extract_method_radio_message,
+ value=DEFAULT_TEXT_EXTRACTION_MODEL,
+ choices=TEXT_EXTRACTION_MODELS,
+ visible=True,
+)
+
+# Set initial value for walkthrough local OCR method based on default text extraction method
+# If Bedrock VLM is the default, set to "bedrock-vlm", otherwise use DEFAULT_LOCAL_OCR_MODEL
+initial_walkthrough_local_ocr_value = DEFAULT_LOCAL_OCR_MODEL
+if (
+ DEFAULT_TEXT_EXTRACTION_MODEL == BEDROCK_VLM_TEXT_EXTRACT_OPTION
+ and "bedrock-vlm" in LOCAL_OCR_MODEL_OPTIONS
+):
+ initial_walkthrough_local_ocr_value = "bedrock-vlm"
+
+walkthrough_local_ocr_method_radio = gr.Radio(
+ label=CHOSEN_LOCAL_MODEL_INTRO_TEXT,
+ value=initial_walkthrough_local_ocr_value,
+ choices=LOCAL_OCR_MODEL_OPTIONS,
+ interactive=True,
+ visible=True,
+)
+
+walkthrough_handwrite_signature_checkbox = gr.CheckboxGroup(
+ label="AWS Textract extraction settings",
+ choices=HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS,
+ value=DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
+ visible=True,
+)
+
+walkthrough_pii_identification_method_drop = gr.Radio(
+ label="""Choose personal information detection model. Note that AWS Comprehend, if shown, has a cost of around £0.0075 ($0.01) per 10,000 characters.""",
+ value=DEFAULT_PII_DETECTION_MODEL,
+ choices=PII_DETECTION_MODELS,
+ visible=initial_show_pii_method,
+)
+
+walkthrough_deny_list_state = gr.Dropdown(
+ allow_custom_value=True,
+ label="Deny list (always redact these words)",
+ interactive=True,
+ multiselect=True,
+ visible=True,
+)
+
+walkthrough_allow_list_state = gr.Dropdown(
+ allow_custom_value=True,
+ label="Allow list (never redact these words)",
+ interactive=True,
+ multiselect=True,
+ visible=True,
+)
+
+walkthrough_fully_redacted_list_state = gr.Dropdown(
+ allow_custom_value=True,
+ label="Fully redacted pages (fully redact these page numbers)",
+ interactive=True,
+ multiselect=True,
+ visible=True,
+)
+
+# State variable to sync the checkbox value across both locations
+redact_duplicate_pages_state = gr.State(value=False)
+
+# Checkbox for automatically redacting duplicate pages
+redact_duplicate_pages_checkbox = gr.Checkbox(
+ info="Find and redact whole pages that contain duplicate text. See the 'Identify duplicate pages' tab for all settings and duplicate sentence/passage redaction.",
+ label="Redact duplicate pages",
+ value=False,
+ visible=SHOW_PII_IDENTIFICATION_OPTIONS,
+ elem_id="redact_duplicate_pages_checkbox",
+)
+if SHOW_AWS_PII_DETECTION_OPTIONS:
+ aws_comprehend_cost_message = (
+ ". AWS Comprehend has a cost of approximately $0.01 per 10,000 characters."
+ )
+else:
+ aws_comprehend_cost_message = ""
+walkthrough_pii_identification_method_drop_tabular = gr.Radio(
+ label="Choose PII detection method" + aws_comprehend_cost_message,
+ value=DEFAULT_PII_DETECTION_MODEL,
+ choices=TABULAR_PII_DETECTION_MODELS,
+ visible=False,
+)
+
+walkthrough_anon_strategy = gr.Radio(
+ choices=[
+ "replace with 'REDACTED'",
+ "replace with ",
+ "redact completely",
+ "hash",
+ "mask",
+ ],
+ label="Select an anonymisation method",
+ value=DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
+ visible=False,
+)
+
+walkthrough_do_initial_clean = gr.Checkbox(
+ label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)",
+ value=DO_INITIAL_TABULAR_DATA_CLEAN,
+ visible=False,
+)
+
+walkthrough_in_redact_llm_entities = gr.Dropdown(
+ value=CHOSEN_LLM_ENTITIES,
+ choices=FULL_LLM_ENTITY_LIST,
+ multiselect=True,
+ label="LLM PII identification model - subset of entities for LLM detection (click empty space in box for full list)",
+ visible=True,
+ allow_custom_value=True,
+)
+
+walkthrough_custom_llm_instructions_textbox = gr.Textbox(
+ label="Custom instructions for LLM-based entity detection",
+ placeholder="Specify new labels to redact with a description. E.g. 'Redact information related to Mark Wilson with the label MARK_WILSON' or 'redact all company names with the label COMPANY_NAME'.",
+ value="",
+ lines=3,
+ visible=True,
+)
+
+## Redaction examples
+in_doc_files = gr.File(
+ label="Choose a PDF document or image file (PDF, JPG, PNG)",
+ file_count="multiple",
+ file_types=[".pdf", ".jpg", ".png", ".json", ".zip"],
+ height=FILE_INPUT_HEIGHT,
+)
+
+total_pdf_page_count = gr.Number(
+ label="Total page count",
+ value=0,
+ visible=SHOW_COSTS,
+ interactive=False,
+)
+
+# Override options if OCR GUI is not shown
+if not SHOW_OCR_GUI_OPTIONS:
+ # SHOW_AWS_TEXT_EXTRACTION_OPTIONS = False
+ SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS = False
+ SHOW_LOCAL_OCR_MODEL_OPTIONS = False
+
+text_extract_method_radio = gr.Radio(
+ label=text_extract_method_radio_message,
+ value=DEFAULT_TEXT_EXTRACTION_MODEL,
+ choices=TEXT_EXTRACTION_MODELS,
+ visible=SHOW_OCR_GUI_OPTIONS,
+)
+
+# Set initial value for local OCR method based on default text extraction method
+# If Bedrock VLM is the default, set to "bedrock-vlm", otherwise use DEFAULT_LOCAL_OCR_MODEL
+initial_local_ocr_value = DEFAULT_LOCAL_OCR_MODEL
+if (
+ DEFAULT_TEXT_EXTRACTION_MODEL == BEDROCK_VLM_TEXT_EXTRACT_OPTION
+ and "bedrock-vlm" in LOCAL_OCR_MODEL_OPTIONS
+):
+ initial_local_ocr_value = "bedrock-vlm"
+
+local_ocr_method_radio = gr.Radio(
+ label=CHOSEN_LOCAL_MODEL_INTRO_TEXT,
+ value=initial_local_ocr_value,
+ choices=LOCAL_OCR_MODEL_OPTIONS,
+ interactive=True,
+ visible=SHOW_LOCAL_OCR_MODEL_OPTIONS,
+)
+
+handwrite_signature_checkbox = gr.CheckboxGroup(
+ label="AWS Textract extraction settings",
+ choices=HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS,
+ value=DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
+ visible=SHOW_AWS_TEXT_EXTRACTION_OPTIONS,
+)
+
+inference_server_vlm_model_textbox = gr.Textbox(
+ label="Inference Server VLM Model Name",
+ placeholder="e.g., 'qwen2-vl-7b-instruct' or leave empty to use default",
+ value=(
+ DEFAULT_INFERENCE_SERVER_VLM_MODEL if DEFAULT_INFERENCE_SERVER_VLM_MODEL else ""
+ ),
+ lines=1,
+ visible=SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS,
+)
+
+# PII identification components
+
+# Override options if PII identification is not shown
+if not SHOW_PII_IDENTIFICATION_OPTIONS:
+ SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS = False
+
+redaction_method_radio = gr.Radio(
+ label="Choose redaction method",
+ choices=[
+ "Extract text only",
+ "Redact all PII",
+ "Redact selected terms",
+ ],
+ value="Redact all PII",
+ interactive=True,
+)
+
+pii_identification_method_drop = gr.Radio(
+ label="""Choose personal information detection model. Note that AWS Comprehend, if shown, has a cost of around £0.0075 ($0.01) per 10,000 characters.""",
+ value=DEFAULT_PII_DETECTION_MODEL,
+ choices=PII_DETECTION_MODELS,
+ visible=SHOW_PII_IDENTIFICATION_OPTIONS,
+)
+
+in_redact_entities = gr.Dropdown(
+ value=CHOSEN_REDACT_ENTITIES,
+ choices=FULL_ENTITY_LIST,
+ multiselect=True,
+ label="Local PII identification model (click empty space in box for full list)",
+ visible=initial_show_local_entities,
+ allow_custom_value=True,
+)
+in_redact_comprehend_entities = gr.Dropdown(
+ value=CHOSEN_COMPREHEND_ENTITIES,
+ choices=FULL_COMPREHEND_ENTITY_LIST,
+ multiselect=True,
+ label="AWS Comprehend PII identification model (click empty space in box for full list)",
+ visible=initial_show_comprehend_entities,
+ allow_custom_value=True,
+)
+
+in_redact_llm_entities = gr.Dropdown(
+ value=CHOSEN_LLM_ENTITIES,
+ choices=FULL_LLM_ENTITY_LIST,
+ multiselect=True,
+ label="LLM PII identification model - subset of entities for LLM detection (click empty space in box for full list)",
+ visible=initial_is_llm_method,
+ allow_custom_value=True,
+)
+
+custom_llm_instructions_textbox = gr.Textbox(
+ label="Custom instructions for LLM-based entity detection",
+ placeholder="Specify new labels to redact with a description. E.g. 'Redact information related to Mark Wilson with the label MARK_WILSON' or 'redact all company names with the label COMPANY_NAME'.",
+ value="",
+ lines=3,
+ visible=True,
+)
+
+# Allow / deny / fully redacted lists
+
+in_deny_list_state = gr.Dropdown(
+ allow_custom_value=True,
+ label="Deny list (always redact these words)",
+ interactive=True,
+ multiselect=True,
+ visible=SHOW_PII_IDENTIFICATION_OPTIONS,
+)
+
+in_allow_list_state = gr.Dropdown(
+ allow_custom_value=True,
+ label="Allow list (never redact these words)",
+ interactive=True,
+ multiselect=True,
+ visible=SHOW_PII_IDENTIFICATION_OPTIONS,
+)
+
+in_fully_redacted_list_state = gr.Dropdown(
+ allow_custom_value=True,
+ label="Fully redact these pages",
+ interactive=True,
+ multiselect=True,
+ visible=SHOW_PII_IDENTIFICATION_OPTIONS,
+)
+
+in_deny_list = gr.File(
+ label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
+ file_count="multiple",
+ height=FILE_INPUT_HEIGHT,
+)
+
+in_fully_redacted_list = gr.File(
+ label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
+ file_count="multiple",
+ height=FILE_INPUT_HEIGHT,
+)
+
+max_fuzzy_spelling_mistakes_num = gr.Number(
+ label="Maximum spelling mistakes for matching deny list terms (slows down PII detection).",
+ value=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
+ minimum=0,
+ maximum=9,
+ precision=0,
+)
+
+## Cost codes
+cost_code_dataframe = gr.Dataframe(
+ value=pd.DataFrame(columns=["Cost code", "Description"]),
+ row_count=(0, "dynamic"),
+ label="Existing cost codes",
+ type="pandas",
+ interactive=True,
+ show_search="filter",
+ wrap=True,
+ max_height=200,
+ visible=GET_COST_CODES or ENFORCE_COST_CODES,
+)
+cost_code_choice_drop = gr.Dropdown(
+ value=DEFAULT_COST_CODE,
+ label="Choose cost code for analysis",
+ choices=[DEFAULT_COST_CODE],
+ allow_custom_value=False,
+ visible=GET_COST_CODES or ENFORCE_COST_CODES,
+)
+set_default_cost_code_button = gr.Button(
+ value="Set default cost code",
+ visible=GET_COST_CODES or ENFORCE_COST_CODES,
+)
+
+reset_cost_code_dataframe_button = gr.Button(
+ value="Reset code code table filter",
+ visible=GET_COST_CODES or ENFORCE_COST_CODES,
+)
+
+## Page options
+
+page_min = gr.Number(
+ value=DEFAULT_PAGE_MIN,
+ precision=0,
+ minimum=0,
+ maximum=9999,
+ label="Lowest page to redact (set to 0 to redact from the first page)",
+)
+
+page_max = gr.Number(
+ value=DEFAULT_PAGE_MAX,
+ precision=0,
+ minimum=0,
+ maximum=9999,
+ label="Highest page to redact (set to 0 to redact to the last page)",
+)
+
+## Deduplication examples
+in_duplicate_pages = gr.File(
+ label="Upload one or multiple 'ocr_output.csv' files to find duplicate pages and subdocuments",
+ file_count="multiple",
+ height=FILE_INPUT_HEIGHT,
+ file_types=[".csv"],
+)
+
+duplicate_threshold_input = gr.Number(
+ value=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
+ label="Similarity threshold",
+ info="Score (0-1) to consider pages/text lines a match.",
+)
+
+min_word_count_input = gr.Number(
+ value=DEFAULT_MIN_WORD_COUNT,
+ label="Minimum word count",
+ info="Pages/text lines with fewer words than this value are ignored.",
+)
+
+combine_page_text_for_duplicates_bool = gr.Radio(
+ label="Duplicate matching mode",
+ choices=[
+ ("Find duplicates by page", True),
+ ("Find duplicates by text line", False),
+ ],
+ value=True,
+ info="By page: compare full-page text. By text line: compare individual lines.",
+)
+
+## Tabular examples
+in_data_files = gr.File(
+ label="Choose Excel or csv files",
+ file_count="multiple",
+ file_types=[".xlsx", ".xls", ".csv", ".parquet", ".docx"],
+ height=FILE_INPUT_HEIGHT,
+)
+
+in_colnames = gr.Dropdown(
+ choices=["Choose columns to anonymise"],
+ multiselect=True,
+ allow_custom_value=True,
+ label="Select columns that you want to anonymise (showing columns present across all files).",
+)
+
+in_excel_sheets = gr.Dropdown(
+ choices=["Choose Excel sheets to anonymise"],
+ multiselect=True,
+ label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).",
+ visible=False,
+ allow_custom_value=True,
+)
+
+pii_identification_method_drop_tabular = gr.Radio(
+ label="Choose PII detection method. Specific entities for the chosen redaction model type can be chosen on the Redact PDF/image tab"
+ + aws_comprehend_cost_message,
+ value=DEFAULT_PII_DETECTION_MODEL,
+ choices=TABULAR_PII_DETECTION_MODELS,
+)
+
+anon_strategy = gr.Radio(
+ choices=[
+ "replace with 'REDACTED'",
+ "replace with ",
+ "redact completely",
+ "hash",
+ "mask",
+ ],
+ label="Select an anonymisation method.",
+ value=DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
+) # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
+
+do_initial_clean = gr.Checkbox(
+ label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)",
+ value=DO_INITIAL_TABULAR_DATA_CLEAN,
+)
+
+in_tabular_duplicate_files = gr.File(
+ label="Upload CSV, Excel, or Parquet files to find duplicate cells/rows. Note that the app will remove duplicates from later cells/files that are found in earlier cells/files and not vice versa.",
+ file_count="multiple",
+ file_types=[".csv", ".xlsx", ".xls", ".parquet"],
+ height=FILE_INPUT_HEIGHT,
+)
+
+tabular_text_columns = gr.Dropdown(
+ label="Choose columns to deduplicate",
+ multiselect=True,
+ allow_custom_value=True,
+)
+
+tabular_min_word_count = gr.Number(
+ value=DEFAULT_MIN_WORD_COUNT,
+ label="Minimum word count",
+ info="Cells with fewer words than this are ignored.",
+)
+
+### All output file components
+all_output_files_btn = gr.Button(
+ "Refresh files in output folder",
+ variant="secondary",
+ visible=SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER,
+)
+all_output_files = gr.FileExplorer(
+ root_dir=OUTPUT_FOLDER,
+ label="Choose output files for download",
+ file_count="multiple",
+ visible=SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER,
+ interactive=True,
+ max_height=400,
+)
+
+all_outputs_file_download = gr.File(
+ label="Download output files",
+ file_count="multiple",
+ file_types=[
+ ".pdf",
+ ".jpg",
+ ".jpeg",
+ ".png",
+ ".csv",
+ ".xlsx",
+ ".xls",
+ ".txt",
+ ".doc",
+ ".docx",
+ ".json",
+ ],
+ interactive=False,
+ visible=SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER,
+ height=200,
+)
+
+clean_path = f"/{ROOT_PATH.strip('/')}"
+base_href = f"{clean_path}/" if clean_path != "/" else "/"
+
+if ROOT_PATH:
+ print(f"Setting HTML base href for Gradio to: '{base_href}'")
+
+head_html = f"""
+
+"""
+
+css = """
+/* Target tab navigation buttons only - not buttons inside tab content */
+/* Gradio renders tab buttons with role="tab" in the navigation area */
+button[role="tab"] {
+ font-size: 1.2em !important;
+ padding: 0.75em 1.4em !important;
+}
+
+/* Alternative selectors for different Gradio versions */
+.tab-nav button,
+nav button[role="tab"],
+div[class*="tab-nav"] button {
+ font-size: 1.2em !important;
+ padding: 0.75em 1.4em !important;
+}
+"""
+
+# Create the gradio interface. The same Blocks configuration is used whether the app runs under FastAPI or standalone.
+blocks = gr.Blocks(
+    analytics_enabled=False,
+    title="Document Redaction App",
+    delete_cache=(43200, 43200),  # Temporary file cache deleted every 12 hours
+    fill_width=True,
+)
+
+with blocks:
+
+ ###
+ # STATE VARIABLES
+ ###
+
+    # PyMuPDF documents need to be stored as State objects as they do not have a standard Gradio component equivalent
+ pdf_doc_state = gr.State(list())
+ all_image_annotations_state = gr.State(list())
+
+ all_decision_process_table_state = gr.State(pd.DataFrame())
+
+ all_page_line_level_ocr_results = gr.State(list())
+ all_page_line_level_ocr_results_with_words = gr.State(list())
+
+ session_hash_state = gr.Textbox(label="session_hash_state", value="", visible=False)
+ host_name_textbox = gr.Textbox(
+ label="host_name_textbox", value=HOST_NAME, visible=False
+ )
+ s3_output_folder_state = gr.Textbox(
+ label="s3_output_folder_state", value=S3_OUTPUTS_FOLDER, visible=False
+ )
+ session_output_folder_textbox = gr.Textbox(
+ value=str(SESSION_OUTPUT_FOLDER),
+ label="session_output_folder_textbox",
+ visible=False,
+ )
+ output_folder_textbox = gr.Textbox(
+ value=OUTPUT_FOLDER, label="output_folder_textbox", visible=False
+ )
+ input_folder_textbox = gr.Textbox(
+ value=INPUT_FOLDER, label="input_folder_textbox", visible=False
+ )
+
+ first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
+ second_loop_state = gr.Checkbox(
+ label="second_loop_state", value=False, visible=False
+ )
+ do_not_save_pdf_state = gr.Checkbox(
+ label="do_not_save_pdf_state", value=False, visible=False
+ )
+ save_pdf_state = gr.Checkbox(label="save_pdf_state", value=True, visible=False)
+
+ prepared_pdf_state = gr.State(list())
+ document_cropboxes = gr.State(list())
+ page_sizes = gr.State(list())
+ images_pdf_state = gr.State(list())
+ all_img_details_state = gr.State(list())
+
+ output_image_files_state = gr.State(list())
+ output_file_list_state = gr.State(list())
+ text_output_file_list_state = gr.State(list())
+ log_files_output_list_state = gr.State(list())
+ duplication_file_path_outputs_list_state = gr.State(list())
+
+ # Backup versions of these objects in case you make a mistake
+ backup_review_state = gr.State(pd.DataFrame())
+ backup_image_annotations_state = gr.State(list())
+ backup_recogniser_entity_dataframe_base = gr.State(pd.DataFrame())
+ backup_all_page_line_level_ocr_results_with_words_df_base = gr.State(pd.DataFrame())
+
+ # Logging variables
+ access_logs_state = gr.State(value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME)
+ access_s3_logs_loc_state = gr.State(value=S3_ACCESS_LOGS_FOLDER)
+ feedback_logs_state = gr.State(value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME)
+ feedback_s3_logs_loc_state = gr.State(value=S3_FEEDBACK_LOGS_FOLDER)
+ usage_logs_state = gr.State(value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME)
+ usage_s3_logs_loc_state = gr.State(value=S3_USAGE_LOGS_FOLDER)
+
+ session_hash_textbox = gr.State(value="")
+ textract_metadata_textbox = gr.State(value="")
+ comprehend_query_number = gr.State(value=0)
+ textract_query_number = gr.State(value=0)
+
+ # VLM and LLM tracking components for usage logs
+ vlm_model_name_textbox = gr.State(value="")
+ vlm_total_input_tokens_number = gr.State(value=0)
+ vlm_total_output_tokens_number = gr.State(value=0)
+ llm_model_name_textbox = gr.State(value="")
+ llm_total_input_tokens_number = gr.State(value=0)
+ llm_total_output_tokens_number = gr.State(value=0)
+
+ # Document file name state
+ doc_full_file_name_textbox = gr.State(value="")
+ doc_file_name_no_extension_textbox = gr.State(value="")
+ doc_file_name_with_extension_textbox = gr.State(value="")
+ doc_file_name_textbox_list = gr.State(value="")
+
+ # Left blank for when user does not want to report file names
+ blank_doc_file_name_no_extension_textbox_for_logs = gr.State(value="")
+ placeholder_doc_file_name_no_extension_textbox_for_logs = gr.State(value="document")
+
+ # Tabular data file name state
+ data_full_file_name_textbox = gr.State(value="")
+ data_file_name_no_extension_textbox = gr.State(value="")
+ data_file_name_with_extension_textbox = gr.State(value="")
+ data_file_name_textbox_list = gr.State(value="")
+ blank_data_file_name_no_extension_textbox_for_logs = gr.State(value="")
+
+ placeholder_data_file_name_no_extension_textbox_for_logs = gr.State(
+ value="data_file"
+ )
+
+ latest_review_file_path = gr.State(
+ value=""
+ ) # Latest review file path output from redaction
+ latest_ocr_file_path = gr.State(
+ value=""
+ ) # Latest ocr file path output from text extraction
+
+    # Constants used with the review dropdowns for filtering by various columns
+ label_name_const = gr.State(value="label")
+ text_name_const = gr.State(value="text")
+ page_name_const = gr.State(value="page")
+
+ actual_time_taken_number = gr.State(
+ value=0.0
+ ) # This keeps track of the time taken to redact files for logging purposes.
+ annotate_previous_page = gr.State(
+ value=0
+ ) # Keeps track of the last page that the annotator was on
+ s3_logs_output_textbox = gr.State(value="")
+
+ ## Annotator zoom value
+ annotator_zoom_number = gr.Number(
+ label="Current annotator zoom level", value=100, precision=0, visible=False
+ )
+ zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False)
+ zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False)
+
+ clear_all_page_redactions = gr.Checkbox(
+ label="clear_all_page_redactions", value=True, visible=False
+ )
+ prepare_for_review_bool = gr.Checkbox(
+ label="prepare_for_review_bool", value=True, visible=False
+ )
+ prepare_for_review_bool_false = gr.Checkbox(
+ label="prepare_for_review_bool_false", value=False, visible=False
+ )
+ prepare_images_bool_false = gr.Checkbox(
+ label="prepare_images_bool_false", value=False, visible=False
+ )
+
+ ## Settings page variables
+ default_deny_list_file_name = "default_deny_list.csv"
+ default_deny_list_loc = OUTPUT_FOLDER + "/" + default_deny_list_file_name
+ in_deny_list_text_in = gr.Textbox(value="deny_list", visible=False)
+
+ fully_redacted_list_file_name = "default_fully_redacted_list.csv"
+ fully_redacted_list_loc = OUTPUT_FOLDER + "/" + fully_redacted_list_file_name
+ in_fully_redacted_text_in = gr.Textbox(
+ value="fully_redacted_pages_list", visible=False
+ )
+
+ # S3 settings for default allow list load
+ s3_default_bucket = gr.State(value=DOCUMENT_REDACTION_BUCKET)
+ s3_default_allow_list_file = gr.State(value=S3_ALLOW_LIST_PATH)
+ default_allow_list_output_folder_location = gr.State(value=ALLOW_LIST_PATH)
+
+ s3_whole_document_textract_default_bucket = gr.State(
+ value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET
+ )
+ s3_whole_document_textract_input_subfolder = gr.State(
+ value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER
+ )
+ s3_whole_document_textract_output_subfolder = gr.State(
+ value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER
+ )
+
+ successful_textract_api_call_number = gr.State(value=0)
+ no_redaction_method_drop = gr.State(value=NO_REDACTION_PII_OPTION)
+ textract_only_method_drop = gr.State(value=TEXTRACT_TEXT_EXTRACT_OPTION)
+ extract_text_only_tab_redaction_override = gr.State(value="Extract text only")
+
+ load_s3_whole_document_textract_logs_bool = gr.State(
+ value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
+ )
+ s3_whole_document_textract_logs_subfolder = gr.State(value=TEXTRACT_JOBS_S3_LOC)
+ local_whole_document_textract_logs_subfolder = gr.State(
+ value=TEXTRACT_JOBS_LOCAL_LOC
+ )
+
+ s3_default_cost_codes_file = gr.State(value=S3_COST_CODES_PATH)
+ default_cost_codes_output_folder_location = gr.State(value=OUTPUT_COST_CODES_PATH)
+ enforce_cost_code_bool = gr.State(value=ENFORCE_COST_CODES)
+ default_cost_code_textbox = gr.State(value=DEFAULT_COST_CODE)
+
+ # Base tables that are not modified subsequent to load
+ recogniser_entity_dataframe_base = gr.State(
+ pd.DataFrame(columns=["page", "label", "text", "id"])
+ )
+ all_page_line_level_ocr_results_df_base = gr.State(
+ pd.DataFrame(
+ columns=[
+ "page",
+ "text",
+ "left",
+ "top",
+ "width",
+ "height",
+ "line",
+ "conf",
+ ]
+ )
+ )
+ all_line_level_ocr_results_df_placeholder = gr.State(
+ pd.DataFrame(
+ columns=[
+ "page",
+ "text",
+ "left",
+ "top",
+ "width",
+ "height",
+ "line",
+ "conf",
+ ]
+ )
+ )
+
+ all_page_line_level_ocr_results_with_words_df_base = gr.State(
+ value=pd.DataFrame(
+ columns=[
+ "page",
+ "line",
+ "word_text",
+ "word_x0",
+ "word_y0",
+ "word_x1",
+ "word_y1",
+ "word_conf",
+ "line_text",
+ "line_x0",
+ "line_y0",
+ "line_x1",
+ "line_y1",
+ "line_conf",
+ ]
+ )
+ )
+
+ # Placeholder for selected entity dataframe row
+ selected_entity_id = gr.State(value="")
+ selected_entity_colour = gr.State(value="")
+ selected_entity_dataframe_row_text = gr.State(value="")
+ selected_entity_dataframe_row_text_redact = gr.State(value="")
+
+ # This is an invisible dataframe that holds all items from the redaction outputs that have the same text as the selected row
+ recogniser_entity_dataframe_same_text = gr.State(
+ value=pd.DataFrame(
+ data={"page": list(), "label": list(), "text": list(), "id": list()}
+ )
+ )
+
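+    # Word-level counterpart of the dataframe above: holds OCR word results that share the same text as the selected row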
+ to_redact_dataframe_same_text = gr.State(
+ pd.DataFrame(
+ data={
+ "page": list(),
+ "line": list(),
+ "word_text": list(),
+ "word_x0": list(),
+ "word_y0": list(),
+ "word_x1": list(),
+ "word_y1": list(),
+ "index": list(),
+ }
+ )
+ )
+
+ # Duplicate page detection
+ selected_duplicate_data_row_index = gr.State(value=None)
+ full_duplicate_data_by_file = gr.State(
+ value={}
+ ) # A dictionary of the full duplicate data indexed by file
+
+ # Tracking variables for current page (not visible)
+ current_loop_page_number = gr.State(value=0)
+ page_break_return = gr.State(value=False)
+ latest_file_completed_num = gr.State(value=0)
+
+ # Base cost code dataframe that is not modified
+ cost_code_dataframe_base = gr.State(value=pd.DataFrame())
+
+ # Spacy analyser state
+ updated_nlp_analyser_state = gr.State(list())
+ tesseract_lang_data_file_path = gr.State(value="")
+
+ flag_value_placeholder = gr.State(value="") # Placeholder for flag value
+
+    # Placeholders for elements that may be made visible further down, depending on environment variables
+
+ textract_output_found_checkbox = gr.Checkbox(
+ value=False,
+ label="Existing Textract output file found",
+ interactive=False,
+ visible=False,
+ )
+ relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(
+ value=False,
+ label="Existing local OCR output file found",
+ interactive=False,
+ visible=False,
+ )
+
+ estimated_aws_costs_number = gr.Number(
+ label="Approximate AWS services cost ($)",
+ value=0,
+ visible=False,
+ precision=2,
+ )
+ estimated_time_taken_number = gr.Number(
+ label="Approximate time for task (minutes)",
+ value=0,
+ visible=False,
+ precision=2,
+ )
+
+ only_extract_text_radio = gr.Checkbox(
+ value=False, label="Only extract text (no redaction)", visible=False
+ )
+
+    # Textract API call placeholders, in case the option is not enabled in config
+
+ job_name_textbox = gr.Textbox(
+ value="", label="whole_document Textract call", visible=False
+ )
+ send_document_to_textract_api_btn = gr.Button(
+ "Analyse document with AWS Textract", variant="primary", visible=False
+ )
+
+ job_id_textbox = gr.Textbox(
+ label="Latest job ID for whole_document document analysis",
+ value="",
+ visible=False,
+ )
+ check_state_of_textract_api_call_btn = gr.Button(
+ "Check state of Textract document job and download",
+ variant="secondary",
+ visible=False,
+ )
+ job_current_status = gr.Textbox(
+ value="", label="Analysis job current status", visible=False
+ )
+ job_type_dropdown = gr.Dropdown(
+ value="document_text_detection",
+ choices=["document_text_detection", "document_analysis"],
+ label="Job type of Textract analysis job",
+ allow_custom_value=False,
+ visible=False,
+ )
+ textract_job_detail_df = gr.Dataframe(
+ pd.DataFrame(
+ columns=[
+ "job_id",
+ "file_name",
+ "job_type",
+ "signature_extraction",
+ "job_date_time",
+ ]
+ ),
+ label="Previous job details",
+ visible=False,
+ type="pandas",
+ wrap=True,
+ )
+ selected_job_id_row = gr.Dataframe(
+ pd.DataFrame(
+ columns=[
+ "job_id",
+ "file_name",
+ "job_type",
+ "signature_extraction",
+ "job_date_time",
+ ]
+ ),
+ label="Selected job id row",
+ visible=False,
+ type="pandas",
+ wrap=True,
+ )
+ is_a_textract_api_call = gr.Checkbox(
+ value=False, label="is_this_a_textract_api_call", visible=False
+ )
+ task_textbox = gr.Textbox(
+ value="redact", label="task", visible=False
+ ) # Track the task being performed
+ job_output_textbox = gr.Textbox(
+ value="", label="Textract call outputs", visible=False
+ )
+ job_input_textbox = gr.Textbox(
+ value=TEXTRACT_JOBS_S3_INPUT_LOC,
+ label="Textract call outputs",
+ visible=False,
+ )
+
+ textract_job_output_file = gr.File(
+ label="Textract job output files", height=FILE_INPUT_HEIGHT, visible=False
+ )
+ convert_textract_outputs_to_ocr_results = gr.Button(
+ "Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)",
+ variant="secondary",
+ visible=False,
+ )
+
+ ## Duplicate search object
+ new_duplicate_search_annotation_object = gr.Dropdown(
+ value=None,
+ label="new_duplicate_search_annotation_object",
+ allow_custom_value=True,
+ visible=False,
+ )
+
+ ###
+ # UI DESIGN
+ ###
+
+ gr.Markdown(INTRO_TEXT)
+
+ # Examples for PDF/image redaction
+ if SHOW_EXAMPLES:
+ gr.Markdown(
+ "### Try out general redaction tasks - click on an example below and then the 'Extract text and redact document' button:"
+ )
+
+ available_examples = list()
+ example_labels = list()
+
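+        # Each example row is positional and must match the order of `inputs` in the gr.Examples call below:
+        # [doc files, text extraction method, PII method, handwriting/signature options, local entities,
+        #  Comprehend entities, prepared PDF state, full file name, file name without extension,
+        #  deny list file, deny list terms, fully redacted pages file, fully redacted pages, total page count]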
+ # Check each example file and add to examples if it exists
+ if os.path.exists(example_files[0]):
+ available_examples.append(
+ [
+ [example_files[0]],
+ "Local model - selectable text",
+ "Local",
+ [],
+ CHOSEN_REDACT_ENTITIES,
+ CHOSEN_COMPREHEND_ENTITIES,
+ [example_files[0]],
+ example_files[0],
+ os.path.splitext(os.path.basename(example_files[0]))[0],
+ [],
+ [],
+ [],
+ [],
+ 2,
+ ]
+ )
+ example_labels.append("PDF with selectable text redaction")
+
+ if os.path.exists(example_files[1]):
+ available_examples.append(
+ [
+ [example_files[1]],
+ "Local OCR model - PDFs without selectable text",
+ "Local",
+ [],
+ CHOSEN_REDACT_ENTITIES,
+ CHOSEN_COMPREHEND_ENTITIES,
+ [example_files[1]],
+ example_files[1],
+ os.path.splitext(os.path.basename(example_files[1]))[0],
+ [],
+ [],
+ [],
+ [],
+ 1,
+ ]
+ )
+ example_labels.append("Image redaction with local OCR")
+
+ if os.path.exists(example_files[2]):
+ available_examples.append(
+ [
+ [example_files[2]],
+ "Local OCR model - PDFs without selectable text",
+ "Local",
+ [],
+ ["TITLES", "PERSON", "DATE_TIME"],
+ ["TITLES", "NAME", "DATE_TIME"],
+ [example_files[2]],
+ example_files[2],
+ os.path.splitext(os.path.basename(example_files[2]))[0],
+ [],
+ [],
+ [],
+ [],
+ 1,
+ ]
+ )
+ example_labels.append(
+ "PDF redaction with custom entities (Titles, Person, Dates)"
+ )
+
+ if os.path.exists(example_files[3]):
+ if SHOW_AWS_EXAMPLES:
+ available_examples.append(
+ [
+ [example_files[3]],
+ "AWS Textract service - all PDF types",
+ "AWS Comprehend",
+ ["Extract handwriting", "Extract signatures"],
+ CHOSEN_REDACT_ENTITIES,
+ CHOSEN_COMPREHEND_ENTITIES,
+ [example_files[3]],
+ example_files[3],
+ os.path.splitext(os.path.basename(example_files[3]))[0],
+ [],
+ [],
+ [],
+ [],
+ 7,
+ ]
+ )
+ example_labels.append(
+ "PDF redaction with AWS services and signature detection"
+ )
+
+ # Add new example for custom deny list and whole page redaction
+ if (
+ os.path.exists(example_files[3])
+ and os.path.exists(example_files[4])
+ and os.path.exists(example_files[5])
+ ):
+ available_examples.append(
+ [
+ [example_files[3]],
+ "Local OCR model - PDFs without selectable text",
+ "Local",
+ [],
+ ["CUSTOM"], # Use CUSTOM entity to enable deny list functionality
+ ["CUSTOM"],
+ [example_files[3]],
+ example_files[3],
+ os.path.splitext(os.path.basename(example_files[3]))[0],
+ [example_files[4]],
+ [
+ "Sister",
+ "Sister City",
+ "Sister Cities",
+ "Friendship City",
+ ],
+ [example_files[5]],
+ [
+ 2,
+ 5,
+ ], # pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
+ 7,
+ ],
+ )
+ example_labels.append(
+ "PDF redaction with custom deny list and whole page redaction"
+ )
+
+        # When RUN_ALL_EXAMPLES_THROUGH_AWS is enabled, replace text extraction with AWS Textract and PII detection with AWS Comprehend (except "Only extract text")
+ if RUN_ALL_EXAMPLES_THROUGH_AWS:
+ for ex in available_examples:
+ ex[1] = TEXTRACT_TEXT_EXTRACT_OPTION
+ if ex[2] != NO_REDACTION_PII_OPTION:
+ ex[2] = AWS_PII_OPTION
+
+ # Only create examples if we have available files
+ if available_examples:
+
+ redaction_examples = gr.Examples(
+ examples=available_examples,
+ inputs=[
+ in_doc_files,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ handwrite_signature_checkbox,
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ prepared_pdf_state,
+ doc_full_file_name_textbox,
+ doc_file_name_no_extension_textbox,
+ in_deny_list,
+ in_deny_list_state,
+ in_fully_redacted_list,
+ in_fully_redacted_list_state,
+ total_pdf_page_count,
+ ],
+ outputs=[
+ walkthrough_file_input,
+ walkthrough_in_redact_entities,
+ walkthrough_in_redact_comprehend_entities,
+ walkthrough_text_extract_method_radio,
+ walkthrough_local_ocr_method_radio,
+ walkthrough_handwrite_signature_checkbox,
+ walkthrough_pii_identification_method_drop,
+ walkthrough_allow_list_state,
+ walkthrough_deny_list_state,
+ walkthrough_fully_redacted_list_state,
+ in_redact_entities, # Main component - update visibility
+ in_redact_comprehend_entities, # Main component - update visibility
+ in_redact_llm_entities, # Main component - update visibility
+ custom_llm_instructions_textbox, # Main component - update visibility
+ ],
+ example_labels=example_labels,
+ fn=show_info_box_on_click,
+ run_on_click=True,
+ cache_examples=False,
+ )
+
+ def _ocr_method_for_difficult_example(desired: str) -> str:
+ """Use *desired* for difficult-OCR examples if it exists on the local OCR radio; else first available fallback."""
+ if desired in LOCAL_OCR_MODEL_OPTIONS:
+ return desired
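+        # Ordered fallback chains: for each desired method, try candidates in turn until one is present in LOCAL_OCR_MODEL_OPTIONS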
+ chains = {
+ "vlm": (
+ "vlm",
+ "inference-server",
+ "paddle",
+ "tesseract",
+ ),
+ "hybrid-paddle-vlm": (
+ "hybrid-paddle-vlm",
+ "hybrid-paddle-inference-server",
+ "vlm",
+ "inference-server",
+ "paddle",
+ "tesseract",
+ ),
+ "paddle": ("paddle", "tesseract"),
+ "tesseract": ("tesseract",),
+ }
+ for candidate in chains.get(desired, ("tesseract",)):
+ if candidate in LOCAL_OCR_MODEL_OPTIONS:
+ return candidate
+ return "tesseract"
+
+ def _pii_method_for_difficult_example(desired: str) -> str:
+ """Map intended PII method to one present on pii_identification_method_drop (PII_DETECTION_MODELS)."""
+ if desired in PII_DETECTION_MODELS:
+ return desired
+ available = set(PII_DETECTION_MODELS)
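+        # Ordered fallback chains: for each intended method, try candidates in turn until one is present in PII_DETECTION_MODELS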
+ chains = {
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION: (
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ INFERENCE_SERVER_PII_OPTION,
+ AWS_LLM_PII_OPTION,
+ LOCAL_PII_OPTION,
+ NO_REDACTION_PII_OPTION,
+ ),
+ INFERENCE_SERVER_PII_OPTION: (
+ INFERENCE_SERVER_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ AWS_LLM_PII_OPTION,
+ LOCAL_PII_OPTION,
+ NO_REDACTION_PII_OPTION,
+ ),
+ AWS_LLM_PII_OPTION: (
+ AWS_LLM_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ INFERENCE_SERVER_PII_OPTION,
+ LOCAL_PII_OPTION,
+ NO_REDACTION_PII_OPTION,
+ ),
+ AWS_PII_OPTION: (
+ AWS_PII_OPTION,
+ LOCAL_PII_OPTION,
+ NO_REDACTION_PII_OPTION,
+ ),
+ LOCAL_PII_OPTION: (
+ LOCAL_PII_OPTION,
+ INFERENCE_SERVER_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ AWS_LLM_PII_OPTION,
+ AWS_PII_OPTION,
+ NO_REDACTION_PII_OPTION,
+ ),
+ NO_REDACTION_PII_OPTION: (
+ NO_REDACTION_PII_OPTION,
+ LOCAL_PII_OPTION,
+ ),
+ }
+ for candidate in chains.get(desired, ()):
+ if candidate in available:
+ return candidate
+ return PII_DETECTION_MODELS[0] if PII_DETECTION_MODELS else desired
+
+ if SHOW_DIFFICULT_OCR_EXAMPLES:
+ gr.Markdown(
+ "### Test out the different OCR methods available. Click on an example below and then the 'Extract text and redact document' button:"
+ )
+
+ available_ocr_examples = list()
+ ocr_example_labels = list()
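+        # Each OCR example row is positional and must match the order of `inputs` in the gr.Examples call below:
+        # [doc files, text extraction method, PII method, handwriting/signature options, prepared PDF state,
+        #  full file name, file name without extension, total page count, page min, page max,
+        #  local OCR method, local entities, LLM entities, custom LLM instructions]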
+ if os.path.exists(ocr_example_files[0]):
+ available_ocr_examples.append(
+ [
+ [ocr_example_files[0]],
+ "Local OCR model - PDFs without selectable text",
+ "Only extract text (no redaction)",
+ [],
+ [ocr_example_files[0]],
+ ocr_example_files[0],
+ os.path.splitext(os.path.basename(ocr_example_files[0]))[0],
+ 7,
+ 1,
+ 1,
+ _ocr_method_for_difficult_example("tesseract"),
+ CHOSEN_REDACT_ENTITIES,
+ CHOSEN_LLM_ENTITIES,
+ "",
+ ],
+ )
+ ocr_example_labels.append("Baseline 'easy' document page")
+
+ available_ocr_examples.append(
+ [
+ [ocr_example_files[0]],
+ "Local OCR model - PDFs without selectable text",
+ "Local",
+ ["Extract handwriting", "Extract signatures"],
+ [ocr_example_files[0]],
+ ocr_example_files[0],
+ os.path.splitext(os.path.basename(ocr_example_files[0]))[0],
+ 7,
+ 6,
+ 6,
+ _ocr_method_for_difficult_example("hybrid-paddle-vlm"),
+ CHOSEN_REDACT_ENTITIES + ["CUSTOM_VLM_SIGNATURE"],
+ CHOSEN_LLM_ENTITIES,
+ "",
+ ],
+ )
+ ocr_example_labels.append("Scanned document page with signatures")
+
+ if os.path.exists(ocr_example_files[1]):
+ available_ocr_examples.append(
+ [
+ [ocr_example_files[1]],
+ "Local OCR model - PDFs without selectable text",
+ "Only extract text (no redaction)",
+ ["Extract handwriting"],
+ [ocr_example_files[1]],
+ ocr_example_files[1],
+ os.path.splitext(os.path.basename(ocr_example_files[1]))[0],
+ 1,
+ 0,
+ 0,
+ _ocr_method_for_difficult_example("vlm"),
+ CHOSEN_REDACT_ENTITIES,
+ CHOSEN_LLM_ENTITIES,
+ "",
+ ],
+ )
+ ocr_example_labels.append("Unclear text on handwritten note")
+
+ if os.path.exists(ocr_example_files[2]):
+ available_ocr_examples.append(
+ [
+ [ocr_example_files[2]],
+ "Local OCR model - PDFs without selectable text",
+ "Local",
+ ["Extract handwriting"],
+ [ocr_example_files[2]],
+ ocr_example_files[2],
+ os.path.splitext(os.path.basename(ocr_example_files[2]))[0],
+ 1,
+ 0,
+ 0,
+ _ocr_method_for_difficult_example("hybrid-paddle-vlm"),
+ CHOSEN_REDACT_ENTITIES + ["CUSTOM_VLM_FACES"],
+ CHOSEN_LLM_ENTITIES,
+ "",
+ ],
+ )
+ ocr_example_labels.append("CV with photo - face identification")
+
+ if os.path.exists(example_files[0]):
+ available_ocr_examples.append(
+ [
+ [example_files[0]],
+ "Local model - selectable text",
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ [],
+ [example_files[0]],
+ example_files[0],
+ os.path.splitext(os.path.basename(example_files[0]))[0],
+ 1,
+ 0,
+ 0,
+ _ocr_method_for_difficult_example("paddle"),
+ ["CUSTOM"],
+ ["CUSTOM"],
+ "Redact Lauren's name (always cover the full name if available), email addresses, and phone numbers with the label LAUREN. Redact university names with the label UNIVERSITY. Always include the full university name if available.",
+ ],
+ )
+ ocr_example_labels.append("Example email LLM PII detection")
+
+        # When RUN_ALL_EXAMPLES_THROUGH_AWS is enabled, use AWS Textract for text extraction and the AWS Bedrock LLM for PII detection (except "Only extract text")
+ if RUN_ALL_EXAMPLES_THROUGH_AWS:
+ for ex in available_ocr_examples:
+ ex[1] = TEXTRACT_TEXT_EXTRACT_OPTION
+ if ex[2] != NO_REDACTION_PII_OPTION:
+ ex[2] = AWS_LLM_PII_OPTION
+
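+        # Map each example's intended PII method to one that is actually available in this deployment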
+ for ex in available_ocr_examples:
+ ex[2] = _pii_method_for_difficult_example(ex[2])
+
+ # Only create examples if we have available files
+ if available_ocr_examples:
+
+ ocr_examples = gr.Examples(
+ examples=available_ocr_examples,
+ inputs=[
+ in_doc_files,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ handwrite_signature_checkbox,
+ prepared_pdf_state,
+ doc_full_file_name_textbox,
+ doc_file_name_no_extension_textbox,
+ total_pdf_page_count,
+ page_min,
+ page_max,
+ local_ocr_method_radio,
+ in_redact_entities,
+ in_redact_llm_entities,
+ custom_llm_instructions_textbox,
+ ],
+ outputs=[
+ walkthrough_file_input,
+ walkthrough_in_redact_entities,
+ walkthrough_text_extract_method_radio,
+ walkthrough_local_ocr_method_radio,
+ walkthrough_handwrite_signature_checkbox,
+ walkthrough_pii_identification_method_drop,
+ walkthrough_in_redact_llm_entities,
+ walkthrough_custom_llm_instructions_textbox,
+ in_redact_llm_entities, # Main component
+ custom_llm_instructions_textbox, # Main component
+ ],
+ example_labels=ocr_example_labels,
+ fn=show_info_box_on_click_ocr_examples,
+ run_on_click=True,
+ cache_examples=False,
+ )
+
+ # Render walkthrough components in a hidden container when SHOW_QUICKSTART is False
+ # This ensures they exist for examples and event handlers even when Quickstart tab is hidden
+ if not SHOW_QUICKSTART:
+ walkthrough_is_data_file = gr.State(value=False)
+ with gr.Column(visible=False):
+ walkthrough_list_accordion = gr.Accordion(
+ "Allow, deny, and full page redaction list settings",
+ open=False,
+ visible=False,
+ )
+ with walkthrough_list_accordion:
+ walkthrough_deny_list_state.render()
+ walkthrough_allow_list_state.render()
+ walkthrough_fully_redacted_list_state.render()
+ walkthrough_file_input.render()
+ walkthrough_in_redact_entities.render()
+ walkthrough_in_redact_comprehend_entities.render()
+ walkthrough_in_redact_llm_entities.render()
+ walkthrough_custom_llm_instructions_textbox.render()
+ walkthrough_text_extract_method_radio.render()
+ walkthrough_local_ocr_method_radio.render()
+ walkthrough_handwrite_signature_checkbox.render()
+ walkthrough_pii_identification_method_drop.render()
+ walkthrough_pii_identification_method_drop_tabular.render()
+ walkthrough_anon_strategy.render()
+ walkthrough_do_initial_clean.render()
+ # Placeholder components so step_4_next_tabular_redact_btn.success() inputs exist
+ walkthrough_excel_sheets = gr.Dropdown(
+ choices=["Choose Excel sheets to anonymise"],
+ multiselect=True,
+ label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).",
+ visible=False,
+ allow_custom_value=True,
+ )
+ walkthrough_colnames = gr.Dropdown(
+ choices=["Choose columns to anonymise"],
+ multiselect=True,
+ allow_custom_value=True,
+ label="Select columns that you want to anonymise (showing columns present across all files).",
+ visible=False,
+ )
+ walkthrough_max_fuzzy_spelling_mistakes_num = gr.Number(
+ label="Maximum spelling mistakes for matching deny list terms (slows down PII detection).",
+ value=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
+ minimum=0,
+ maximum=9,
+ precision=0,
+ visible=False,
+ )
+ # Placeholder cost components so event handlers have valid outputs when SHOW_COSTS and not SHOW_QUICKSTART
+ if SHOW_COSTS:
+ walkthrough_textract_output_found_checkbox = gr.Checkbox(
+ value=False,
+ label="Existing Textract output file found",
+ interactive=False,
+ visible=False,
+ )
+ walkthrough_relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(
+ value=False,
+ label="Existing local OCR output file found",
+ interactive=False,
+ visible=False,
+ )
+ walkthrough_total_pdf_page_count = gr.Number(
+ label="Total page count",
+ value=0,
+ visible=False,
+ interactive=False,
+ )
+ walkthrough_estimated_aws_costs_number = gr.Number(
+ label="Approximate AWS services cost (£)",
+ value=0.00,
+ precision=2,
+ visible=False,
+ interactive=False,
+ )
+ walkthrough_estimated_time_taken_number = gr.Number(
+ label="Approximate time for task (minutes)",
+ value=0,
+ visible=False,
+ precision=2,
+ interactive=False,
+ )
+ # Placeholder buttons so step_4_*_btn.click() handlers below are defined
+ step_4_next_document_redact_btn = gr.Button(
+ "Redact document", variant="primary", visible=False
+ )
+ step_4_next_tabular_redact_btn = gr.Button(
+ "Redact data files", variant="primary", visible=False
+ )
+
+ with gr.Tabs() as tabs:
+ ###
+ # QUICKSTART TAB
+ ###
+ if SHOW_QUICKSTART:
+ with gr.Tab("Quickstart", id=0):
+ # State to track if we're dealing with data files
+ walkthrough_is_data_file = gr.State(value=False)
+ # State to avoid re-running column dropdown update when Gradio re-fires in_data_files.change
+ walkthrough_last_data_file_keys = gr.State(value=None)
+
+ with gr.Walkthrough(selected=1) as walkthrough:
+ with gr.Step("Load document/data", id=1):
+
+ walkthrough_file_input.render()
+ with gr.Row():
+ step_1_back_btn = gr.Button("Back", variant="secondary")
+
+ step_1_next_btn = gr.Button("Next", variant="primary")
+ with gr.Step("Choose text extraction (OCR) method", id=2):
+ # Components for data files (conditionally visible)
+ walkthrough_excel_sheets = gr.Dropdown(
+ choices=["Choose Excel sheets to anonymise"],
+ multiselect=True,
+ label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).",
+ visible=False,
+ allow_custom_value=True,
+ )
+ walkthrough_colnames = gr.Dropdown(
+ choices=["Choose columns to anonymise"],
+ multiselect=True,
+ allow_custom_value=True,
+ label="Select columns that you want to anonymise (showing columns present across all files).",
+ visible=False,
+ )
+ # Text extraction method radio (conditionally visible)
+ walkthrough_text_extract_method_radio.render()
+ # Local OCR method radio (shown only if Local OCR model is selected)
+ walkthrough_local_ocr_accordion = gr.Accordion(
+ "Local OCR method",
+ open=True,
+ visible=initial_local_ocr_visible,
+ )
+ walkthrough_aws_textract_accordion = gr.Accordion(
+ "AWS Textract settings",
+ open=True,
+ visible=initial_aws_textract_visible,
+ )
+ with walkthrough_local_ocr_accordion:
+ walkthrough_local_ocr_method_radio.render()
+ with walkthrough_aws_textract_accordion:
+ walkthrough_handwrite_signature_checkbox.render()
+
+ with gr.Row():
+ step_2_back_btn = gr.Button("Back", variant="secondary")
+
+ step_2_next_btn = gr.Button("Next", variant="primary")
+ with gr.Step("Choose PII detection method", id=3):
+
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=3):
+ walkthrough_redaction_method_dropdown = gr.Radio(
+ label="Choose redaction method",
+ choices=[
+ "Extract text only",
+ "Redact all PII",
+ "Redact selected terms",
+ ],
+ value="Redact all PII",
+ interactive=True,
+ )
+ with gr.Column(scale=1):
+ # Checkbox for automatically redacting duplicate pages
+ walkthrough_redact_duplicate_pages_checkbox = gr.Checkbox(
+ info="Find and redact whole pages that contain duplicate text. See the 'Identify duplicate pages' tab for all settings and duplicate sentence/passage redaction.",
+ label="Redact duplicate pages",
+ value=False,
+ visible=True,
+ elem_id="redact_duplicate_pages_checkbox_walkthrough",
+ )
+
+ # Alternatively, if it's a data file analysis, show the checkbox for initial text clean
+ walkthrough_do_initial_clean.render()
+
+ walkthrough_pii_identification_method_drop.render()
+
+ walkthrough_in_redact_entities.render()
+
+ walkthrough_in_redact_comprehend_entities.render()
+
+ walkthrough_llm_entities_accordion = gr.Accordion(
+ "LLM PII identification model",
+ open=True,
+ visible=initial_is_llm_method,
+ )
+ with walkthrough_llm_entities_accordion:
+ walkthrough_in_redact_llm_entities.render()
+ walkthrough_custom_llm_instructions_textbox.render()
+
+ # Components for "Redact selected terms" option (conditionally visible)
+ # Note: Accordion removed to avoid block ID mismatches
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=3):
+ walkthrough_list_accordion = gr.Accordion(
+ "Allow, deny, and full page redaction list settings",
+ open=True,
+ visible=True,
+ )
+ with walkthrough_list_accordion:
+ with gr.Row(equal_height=True):
+ walkthrough_deny_list_state.render()
+ walkthrough_allow_list_state.render()
+ walkthrough_fully_redacted_list_state.render()
+
+ with gr.Column(scale=1):
+
+ walkthrough_max_fuzzy_spelling_mistakes_num = gr.Number(
+ label="Maximum spelling mistakes for matching deny list terms (slows down PII detection).",
+ value=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
+ minimum=0,
+ maximum=9,
+ precision=0,
+ visible=True,
+ )
+
+ # Tabular data redaction options (conditionally visible for data files)
+
+ walkthrough_pii_identification_method_drop_tabular.render()
+
+ walkthrough_anon_strategy.render()
+
+ with gr.Row():
+ step_3_back_btn = gr.Button("Back", variant="secondary")
+
+ step_3_next_btn = gr.Button("Next", variant="primary")
+ with gr.Step("Redact", id=4):
+ # Page selection (always visible)
+ with gr.Accordion(
+ "Redact only selected pages (default is all pages)",
+ open=False,
+ ):
+ with gr.Row():
+ walkthrough_page_min = gr.Number(
+ value=DEFAULT_PAGE_MIN,
+ precision=0,
+ minimum=0,
+ maximum=9999,
+ label="Lowest page to redact (set to 0 to redact from the first page)",
+ )
+ walkthrough_page_max = gr.Number(
+ value=DEFAULT_PAGE_MAX,
+ precision=0,
+ minimum=0,
+ maximum=9999,
+ label="Highest page to redact (set to 0 to redact to the last page)",
+ )
+ with gr.Accordion(
+ "Costs and time taken estimates",
+ open=True,
+ visible=SHOW_COSTS,
+ ):
+ with gr.Row():
+ # Cost-related components (conditionally visible)
+ walkthrough_textract_output_found_checkbox = (
+ gr.Checkbox(
+ value=False,
+ label="Existing Textract output file found",
+ interactive=False,
+ visible=SHOW_COSTS,
+ )
+ )
+ walkthrough_relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(
+ value=False,
+ label="Existing local OCR output file found",
+ interactive=False,
+ visible=SHOW_COSTS,
+ )
+ walkthrough_total_pdf_page_count = gr.Number(
+ label="Total page count",
+ value=0,
+ visible=SHOW_COSTS,
+ interactive=False,
+ )
+ walkthrough_estimated_aws_costs_number = gr.Number(
+ label="Approximate AWS services cost (£)",
+ value=0.00,
+ precision=2,
+ visible=SHOW_COSTS,
+ interactive=False,
+ )
+ walkthrough_estimated_time_taken_number = gr.Number(
+ label="Approximate time for task (minutes)",
+ value=0,
+ visible=SHOW_COSTS,
+ precision=2,
+ interactive=False,
+ )
+
+ show_cost_codes = GET_COST_CODES or ENFORCE_COST_CODES
+ with gr.Accordion(
+ "Cost code selection", open=True, visible=show_cost_codes
+ ):
+ with gr.Row():
+ # Cost code components (conditionally visible)
+
+ with gr.Column():
+ with gr.Accordion(
+ "Existing cost codes table",
+ open=False,
+ visible=show_cost_codes,
+ ):
+ walkthrough_cost_code_dataframe = gr.Dataframe(
+ value=pd.DataFrame(
+ columns=["Cost code", "Description"]
+ ),
+ row_count=(0, "dynamic"),
+ label="Existing cost codes",
+ type="pandas",
+ interactive=True,
+ show_search="filter",
+ visible=show_cost_codes,
+ wrap=True,
+ max_height=200,
+ )
+ walkthrough_reset_cost_code_dataframe_button = (
+ gr.Button(
+                                                value="Reset cost code table filter",
+ visible=show_cost_codes,
+ )
+ )
+ with gr.Column():
+ walkthrough_cost_code_choice_drop = gr.Dropdown(
+ value=DEFAULT_COST_CODE,
+ label="Choose cost code for analysis",
+ choices=[DEFAULT_COST_CODE],
+ allow_custom_value=False,
+ visible=show_cost_codes,
+ )
+ walkthrough_set_default_cost_code_button = (
+ gr.Button(
+ value="Set default cost code",
+ visible=show_cost_codes,
+ )
+ )
+
+ with gr.Row():
+ step_4_back_btn = gr.Button("Back", variant="secondary")
+
+ step_4_next_document_redact_btn = gr.Button(
+ "Redact document", variant="primary", visible=True
+ )
+ step_4_next_tabular_redact_btn = gr.Button(
+ "Redact data files", variant="primary", visible=False
+ )
+
+ ###
+ # QUICKSTART WALKTHROUGH EVENT HANDLERS
+ ###
+ step_1_back_btn.click(
+ lambda: gr.Walkthrough(selected=0), outputs=walkthrough
+ )
+ # Step 1: Route files to appropriate component when Next is clicked
+ step_1_next_btn.click(
+ fn=route_walkthrough_files,
+ inputs=[walkthrough_file_input],
+ outputs=[
+ in_doc_files,
+ in_data_files,
+ walkthrough_is_data_file,
+ walkthrough,
+ walkthrough_text_extract_method_radio,
+ walkthrough_local_ocr_accordion,
+ walkthrough_aws_textract_accordion,
+ ],
+ )
+
+ ### Step 2
+
+ # Note: in_excel_sheets is defined in the "Word or Excel/CSV files" tab (id=5)
+ # Both tabs are in the same gr.Tabs() context, so components are accessible at runtime
+ step_2_back_btn.click(
+ lambda: gr.Walkthrough(selected=1), outputs=walkthrough
+ )
+
+ step_2_next_btn.click(
+ fn=handle_step_2_next,
+ inputs=[
+ in_data_files,
+ walkthrough_is_data_file,
+ walkthrough_colnames,
+ walkthrough_excel_sheets,
+ walkthrough_text_extract_method_radio,
+ ],
+ outputs=[
+ walkthrough_colnames,
+ walkthrough_excel_sheets,
+ in_colnames,
+ in_excel_sheets,
+ walkthrough_text_extract_method_radio,
+ walkthrough,
+ ], # type: ignore
+ )
+
+ # Update local OCR method radio and AWS Textract settings visibility when text extraction method is selected
+ # queue=False: handler is trivial (visibility only); skip queue so no loading spinner / wait behind other jobs
+ walkthrough_text_extract_method_radio.change(
+ fn=handle_text_extract_method_selection,
+ inputs=[walkthrough_text_extract_method_radio],
+ outputs=[
+ walkthrough_local_ocr_accordion,
+ walkthrough_aws_textract_accordion,
+ ],
+ queue=False,
+ postprocess=False,
+ )
+
+ # When data files are uploaded in walkthrough, automatically populate dropdowns
+ # queue=False and state guard avoid infinite loading when Gradio re-fires change on step 2
+ in_data_files.change(
+ fn=update_step_2_on_data_file_upload,
+ inputs=[
+ in_data_files,
+ walkthrough_is_data_file,
+ walkthrough_last_data_file_keys,
+ ],
+ outputs=[
+ walkthrough_colnames,
+ walkthrough_excel_sheets,
+ walkthrough_last_data_file_keys,
+ ],
+ queue=False,
+ )
+
+ # Update Step 3 components visibility when redaction method is selected
+ walkthrough_redaction_method_dropdown.change(
+ fn=handle_redaction_method_selection,
+ inputs=[
+ walkthrough_redaction_method_dropdown,
+ walkthrough_pii_identification_method_drop,
+ ],
+ outputs=[
+ walkthrough_pii_identification_method_drop,
+ walkthrough_in_redact_entities,
+ walkthrough_in_redact_comprehend_entities,
+ walkthrough_llm_entities_accordion,
+ walkthrough_in_redact_llm_entities,
+ walkthrough_list_accordion,
+ walkthrough_max_fuzzy_spelling_mistakes_num,
+ ],
+ )
+
+ # Update entity dropdowns when PII method is selected (document path)
+ walkthrough_pii_identification_method_drop.change(
+ fn=handle_pii_method_selection,
+ inputs=[walkthrough_pii_identification_method_drop],
+ outputs=[
+ walkthrough_in_redact_entities,
+ walkthrough_in_redact_comprehend_entities,
+ walkthrough_llm_entities_accordion,
+ ],
+ queue=False,
+ postprocess=False,
+ )
+
+ # Update entity dropdowns when tabular PII method is selected.
+ # Use handle_pii_method_selection_tabular so nested LLM components get no-op
+ # updates and don't hang on loading (accordion visibility alone shows the block).
+ walkthrough_pii_identification_method_drop_tabular.change(
+ fn=handle_pii_method_selection_tabular,
+ inputs=[walkthrough_pii_identification_method_drop_tabular],
+ outputs=[
+ walkthrough_in_redact_entities,
+ walkthrough_in_redact_comprehend_entities,
+ walkthrough_llm_entities_accordion,
+ ],
+ queue=False,
+ postprocess=False,
+ )
+
+ # Update Step 3 component visibility based on file type (hide document-only when CSV/Excel)
+ walkthrough_is_data_file.change(
+ fn=update_step_3_tabular_visibility,
+ inputs=[walkthrough_is_data_file],
+ outputs=[
+ walkthrough_local_ocr_method_radio,
+ walkthrough_pii_identification_method_drop,
+ walkthrough_fully_redacted_list_state,
+ walkthrough_redact_duplicate_pages_checkbox,
+ walkthrough_pii_identification_method_drop_tabular,
+ walkthrough_anon_strategy,
+ walkthrough_do_initial_clean,
+ ],
+ )
+
+ ### Step 3
+ step_3_back_btn.click(
+ lambda: gr.Walkthrough(selected=2), outputs=walkthrough
+ )
+
+ step_3_next_btn.click(
+ fn=handle_step_3_next,
+ inputs=[
+ walkthrough_text_extract_method_radio,
+ walkthrough_local_ocr_method_radio,
+ walkthrough_handwrite_signature_checkbox,
+ walkthrough_pii_identification_method_drop,
+ walkthrough_in_redact_entities,
+ walkthrough_in_redact_comprehend_entities,
+ walkthrough_in_redact_llm_entities,
+ walkthrough_custom_llm_instructions_textbox,
+ walkthrough_deny_list_state,
+ walkthrough_allow_list_state,
+ walkthrough_fully_redacted_list_state,
+ walkthrough_pii_identification_method_drop_tabular,
+ walkthrough_anon_strategy,
+ walkthrough_do_initial_clean,
+ walkthrough_redact_duplicate_pages_checkbox,
+ walkthrough_max_fuzzy_spelling_mistakes_num,
+ ],
+ outputs=[
+ text_extract_method_radio,
+ local_ocr_method_radio,
+ handwrite_signature_checkbox,
+ pii_identification_method_drop,
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ in_redact_llm_entities,
+ custom_llm_instructions_textbox,
+ in_deny_list_state,
+ in_allow_list_state,
+ in_fully_redacted_list_state,
+ pii_identification_method_drop_tabular,
+ anon_strategy,
+ do_initial_clean,
+ redact_duplicate_pages_checkbox,
+ walkthrough,
+ max_fuzzy_spelling_mistakes_num,
+ ],
+ )
+
+ # Reset cost code dataframe filter in walkthrough
+ if GET_COST_CODES or ENFORCE_COST_CODES:
+ from tools.helper_functions import reset_base_dataframe
+
+ walkthrough_reset_cost_code_dataframe_button.click(
+ reset_base_dataframe,
+ inputs=[cost_code_dataframe_base],
+ outputs=[walkthrough_cost_code_dataframe],
+ )
+
+ def _walkthrough_save_default_cost_code(sh, choice, df, output_folder):
+ msg = save_default_cost_code_for_session(
+ sh, choice, df, output_folder
+ )
+ gr.Info(msg)
+
+ walkthrough_set_default_cost_code_button.click(
+ _walkthrough_save_default_cost_code,
+ inputs=[
+ session_hash_textbox,
+ walkthrough_cost_code_choice_drop,
+ walkthrough_cost_code_dataframe,
+ input_folder_textbox,
+ ],
+ outputs=[],
+ )
+
+ # Update Step 4 component visibility based on file type
+ walkthrough_is_data_file.change(
+ fn=update_step_4_visibility,
+ inputs=[walkthrough_is_data_file],
+ outputs=[
+ step_4_next_document_redact_btn,
+ step_4_next_tabular_redact_btn,
+ ],
+ )
+
+ ### Step 4
+
+ step_4_back_btn.click(
+ lambda: gr.Walkthrough(selected=3), outputs=walkthrough
+ )
+
+ ## Step 4 next button actions are further down the file (step_4_next_document_redact_btn.click and step_4_next_tabular_redact_btn.click)
+
+ ###
+ # REDACTION PDF/IMAGES TABLE
+ ###
+ with gr.Tab("Redact PDFs/images", id=1):
+
+            # Open the main redaction settings accordion only when the Quickstart tab is hidden
+            show_main_redaction_accordion = not SHOW_QUICKSTART
+
+ with gr.Accordion("Redaction settings", open=show_main_redaction_accordion):
+ in_doc_files.render()
+ textract_text = ""
+
+ if (
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS
+ and DEFAULT_TEXT_EXTRACTION_MODEL == TEXTRACT_TEXT_EXTRACT_OPTION
+ ):
+ textract_text = ". AWS Textract has a cost per page - $1.50 without signature detection (default), $3.50 per 1,000 pages with signature detection. Enable this in the tab below (AWS Textract signature detection)."
+ else:
+ textract_text = ""
+
+ with gr.Accordion(
+ label=f"Change text extraction settings{textract_text}".strip(),
+ open=EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
+ ):
+
+ with gr.Accordion(
+ "Change text extraction OCR method",
+ open=True,
+ visible=SHOW_OCR_GUI_OPTIONS,
+ ):
+ text_extract_method_radio.render()
+ # Store accordion references for dynamic visibility control
+ # Initialise visibility based on default text extraction method
+ local_ocr_accordion = gr.Accordion(
+ label="Change local OCR model",
+ open=EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
+ visible=(
+ DEFAULT_TEXT_EXTRACTION_MODEL
+ == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ ),
+ )
+ with local_ocr_accordion:
+ local_ocr_method_radio.render()
+
+ inference_server_vlm_accordion = gr.Accordion(
+ "Inference Server VLM Model (for inference-server OCR only)",
+ open=False,
+ visible=(
+ SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS
+ and DEFAULT_TEXT_EXTRACTION_MODEL
+ == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ ),
+ )
+ with inference_server_vlm_accordion:
+ inference_server_vlm_model_textbox.render()
+
+ aws_textract_signature_accordion = gr.Accordion(
+ "Enable AWS Textract signature detection (default is off)",
+ open=False,
+ visible=(
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS
+ and DEFAULT_TEXT_EXTRACTION_MODEL
+ == TEXTRACT_TEXT_EXTRACT_OPTION
+ ),
+ )
+ with aws_textract_signature_accordion:
+ handwrite_signature_checkbox.render()
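+ # Visibility of the three accordions above (local OCR, inference server VLM, Textract
+ # signatures) is toggled later by handle_main_text_extract_method_selection when the
+ # text extraction method changes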
+
+ if (
+ SHOW_AWS_PII_DETECTION_OPTIONS
+ and DEFAULT_PII_DETECTION_MODEL == AWS_PII_OPTION
+ ):
+ comprehend_text = (
+ ". AWS Comprehend has a small cost per character processed."
+ )
+ else:
+ comprehend_text = ""
+
+ with gr.Accordion(
+ f"Change PII identification method{comprehend_text}".strip(),
+ open=True,
+ visible=SHOW_PII_IDENTIFICATION_OPTIONS,
+ ):
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=3):
+ redaction_method_radio.render()
+ with gr.Column(scale=1):
+ # Checkbox for automatically redacting duplicate pages
+ redact_duplicate_pages_checkbox.render()
+ with gr.Row(equal_height=True):
+ pii_identification_method_drop.render()
+
+ entity_types_to_redact_accordion = gr.Accordion(
+ "Select entity types to redact", open=True
+ )
+ with entity_types_to_redact_accordion:
+ # Store accordion references for dynamic visibility control
+ # Determine initial visibility based on default PII method
+ default_pii_method = DEFAULT_PII_DETECTION_MODEL
+ is_no_redaction_init = (
+ default_pii_method == NO_REDACTION_PII_OPTION
+ )
+ show_local_entities_init = not is_no_redaction_init and (
+ default_pii_method == LOCAL_PII_OPTION
+ )
+ show_comprehend_entities_init = (
+ not is_no_redaction_init
+ and (default_pii_method == AWS_PII_OPTION)
+ )
+ is_llm_method_init = not is_no_redaction_init and (
+ default_pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ or default_pii_method == INFERENCE_SERVER_PII_OPTION
+ or default_pii_method == AWS_LLM_PII_OPTION
+ )
+
+ in_redact_entities.render()
+ in_redact_comprehend_entities.render()
+ in_redact_llm_entities.render()
+
+ custom_llm_entities_accordion = gr.Accordion(
+ "Custom instructions for LLM-based entity detection",
+ open=True,
+ visible=is_llm_method_init,
+ )
+ with custom_llm_entities_accordion:
+ custom_llm_instructions_textbox.render()
+
+ with gr.Row(equal_height=True):
+ terms_accordion = gr.Accordion(
+ "Terms to always include or exclude in redactions, and whole page redaction. To add many terms at once, you can load in a file on the Redaction Settings tab.",
+ open=True,
+ )
+ with terms_accordion:
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=3):
+ with gr.Row(equal_height=True):
+ in_allow_list_state.render()
+ in_deny_list_state.render()
+ in_fully_redacted_list_state.render()
+ with gr.Column(scale=1):
+ max_fuzzy_spelling_mistakes_num.render()
+
+ if SHOW_COSTS:
+ with gr.Accordion(
+ "Estimated costs and time taken. Note that costs shown only include direct usage of AWS services and do not include other running costs (e.g. storage, run-time costs). Costs are an upper bound - if there are many PDF pages with selectable text in your document, then they may be skipped in practice if you are not extracting signatures.",
+ open=True,
+ visible=True,
+ ):
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=1):
+ textract_output_found_checkbox = gr.Checkbox(
+ value=False,
+ label="Existing Textract output file found",
+ interactive=False,
+ visible=True,
+ )
+ relevant_ocr_output_with_words_found_checkbox = (
+ gr.Checkbox(
+ value=False,
+ label="Existing local OCR output file found",
+ interactive=False,
+ visible=True,
+ )
+ )
+ with gr.Column(scale=4):
+ with gr.Row(equal_height=True):
+ total_pdf_page_count.render()
+ estimated_aws_costs_number = gr.Number(
+ label="Approximate AWS services cost (£)",
+ value=0.00,
+ precision=2,
+ visible=True,
+ interactive=False,
+ )
+ estimated_time_taken_number = gr.Number(
+ label="Approximate time for task (minutes)",
+ value=0,
+ visible=True,
+ precision=2,
+ interactive=False,
+ )
+ else:
+ total_pdf_page_count.render() # Need to render in both cases, as included in examples
+
+ if GET_COST_CODES or ENFORCE_COST_CODES:
+ with gr.Accordion(
+ "Assign task to cost code", open=True, visible=True
+ ):
+ gr.Markdown(
+ "Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost."
+ )
+ with gr.Row():
+ with gr.Column():
+ with gr.Accordion(
+ "View and filter cost code table",
+ open=False,
+ visible=True,
+ ):
+ cost_code_dataframe.render()
+ reset_cost_code_dataframe_button.render()
+ with gr.Column():
+ cost_code_choice_drop.render()
+ set_default_cost_code_button.render()
+ else:
+ cost_code_dataframe.render()
+ cost_code_choice_drop.render()
+ reset_cost_code_dataframe_button.render()
+ set_default_cost_code_button.render()
+
+ if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS:
+ with gr.Accordion(
+ "Submit whole document to AWS Textract API (quickest text extraction for large documents)",
+ open=False,
+ visible=True,
+ ):
+ with gr.Row(equal_height=True):
+ gr.Markdown(
+ """Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready."""
+ )
+ with gr.Row(equal_height=True):
+ send_document_to_textract_api_btn = gr.Button(
+ "Analyse document with AWS Textract API call",
+ variant="primary",
+ visible=True,
+ )
+ with gr.Row(equal_height=False):
+ with gr.Column(scale=2):
+ textract_job_detail_df = gr.Dataframe(
+ pd.DataFrame(
+ columns=[
+ "job_id",
+ "file_name",
+ "job_type",
+ "signature_extraction",
+ "job_date_time",
+ ]
+ ),
+ label="Previous job details",
+ visible=True,
+ type="pandas",
+ wrap=True,
+ )
+ with gr.Column(scale=1):
+ job_id_textbox = gr.Textbox(
+ label="Job ID to check status",
+ value="",
+ visible=True,
+ lines=2,
+ )
+ check_state_of_textract_api_call_btn = gr.Button(
+ "Check status of Textract job and download",
+ variant="secondary",
+ visible=True,
+ )
+ with gr.Row():
+ with gr.Column():
+ textract_job_output_file = gr.File(
+ label="Textract job output files",
+ height=100,
+ visible=True,
+ )
+ with gr.Column():
+ job_current_status = gr.Textbox(
+ value="",
+ label="Analysis job current status",
+ visible=True,
+ )
+ convert_textract_outputs_to_ocr_results = gr.Button(
+ "Convert Textract job outputs to OCR results",
+ variant="secondary",
+ visible=True,
+ )
+
+ with gr.Accordion(label="Extract text and redact document", open=True):
+
+ document_redact_btn = gr.Button(
+ "Extract text and redact document",
+ variant="secondary",
+ scale=4,
+ elem_id="document-redact-btn",
+ )
+
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=1):
+ redaction_output_summary_textbox = gr.Textbox(
+ label="Output summary", scale=1, lines=4
+ )
+ with gr.Column(scale=2):
+ output_file = gr.File(
+ label="Output files", scale=2
+ ) # , height=FILE_INPUT_HEIGHT)
+
+ go_to_review_redactions_tab_btn = gr.Button(
+ "Review and modify redactions", variant="primary", scale=1
+ )
+
+ # Feedback elements are invisible until revealed by redaction action
+ pdf_feedback_title = gr.Markdown(
+ value="## Please give feedback", visible=False
+ )
+ pdf_feedback_radio = gr.Radio(
+ label="Quality of results",
+ choices=["The results were good", "The results were not good"],
+ visible=False,
+ )
+ pdf_further_details_text = gr.Textbox(
+ label="Please give more detailed feedback about the results:",
+ visible=False,
+ )
+ pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
+
+ ###
+ # REVIEW REDACTIONS TAB
+ ###
+ with gr.Tab("Review redactions", id=2):
+
+ with gr.Accordion(
+ label="Upload PDFs/images and OCR results for review", open=True
+ ):
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=2):
+ input_pdf_for_review = gr.File(
+ label="1. Upload original or previously redacted '..._for_review.pdf' document to review redactions.",
+ file_count="multiple",
+ height=FILE_INPUT_HEIGHT,
+ )
+ upload_pdf_for_review_btn = gr.Button(
+ "1. Load in original PDF or review PDF with redactions",
+ variant="secondary",
+ visible=False,
+ )
+ with gr.Column(scale=1):
+ input_review_files = gr.File(
+ label="2. An '...ocr_results_with_words' file can be uploaded here for searching text and making new redactions.",
+ file_count="multiple",
+ height=FILE_INPUT_HEIGHT,
+ )
+ upload_review_files_btn = gr.Button(
+ "2. Upload review or OCR csv files",
+ variant="secondary",
+ visible=False,
+ )
+ with gr.Row():
+ annotate_zoom_in = gr.Button("Zoom in", visible=False)
+ annotate_zoom_out = gr.Button("Zoom out", visible=False)
+ with gr.Row():
+ clear_all_redactions_on_page_btn = gr.Button(
+ "Clear all redactions on page", visible=False
+ )
+
+ with gr.Accordion(label="View review file data", open=False):
+ review_file_df = gr.Dataframe(
+ value=pd.DataFrame(),
+ headers=[
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "text",
+ "id",
+ ],
+ row_count=(0, "dynamic"),
+ label="Review file data",
+ visible=True,
+ type="pandas",
+ wrap=True,
+ show_search="search",
+ max_height=400,
+ )
+
+ with gr.Row():
+ with gr.Column(scale=2):
+ with gr.Row(equal_height=True):
+ annotation_last_page_button = gr.Button(
+ "Previous page", scale=4
+ )
+ annotate_current_page = gr.Number(
+ value=1,
+ label="Current page",
+ precision=0,
+ scale=2,
+ min_width=50,
+ minimum=1,
+ )
+ annotate_max_pages = gr.Number(
+ value=1,
+ label="Total pages",
+ precision=0,
+ interactive=False,
+ scale=2,
+ min_width=50,
+ minimum=1,
+ )
+ annotation_next_page_button = gr.Button("Next page", scale=4)
+
+ zoom_str = str(annotator_zoom_number) + "%"
+
+ annotator = image_annotator(
+ label="Modify redaction boxes",
+ label_list=["Redaction"],
+ label_colors=[(0, 0, 0)],
+ show_label=False,
+ height=zoom_str,
+ width=zoom_str,
+ boxes_alpha=0.1,
+ box_min_size=1,
+ box_selected_thickness=2,
+ handle_size=4,
+ sources=None,
+ show_clear_button=False,
+ show_share_button=False,
+ show_remove_button=False,
+ handles_cursor=True,
+ interactive=True,
+ enable_keyboard_shortcuts=True,
+ use_default_label=False,
+ image_type="numpy",
+ )
+
+ with gr.Row(equal_height=True):
+ annotation_last_page_button_bottom = gr.Button(
+ "Previous page", scale=4
+ )
+ annotate_current_page_bottom = gr.Number(
+ value=1,
+ label="Current page",
+ precision=0,
+ interactive=True,
+ scale=2,
+ min_width=50,
+ minimum=1,
+ )
+ annotate_max_pages_bottom = gr.Number(
+ value=1,
+ label="Total pages",
+ precision=0,
+ interactive=False,
+ scale=2,
+ min_width=50,
+ minimum=1,
+ )
+ annotation_next_page_button_bottom = gr.Button(
+ "Next page", scale=4
+ )
+
+ with gr.Column(scale=1):
+ annotation_button_apply = gr.Button(
+ "Apply revised redactions to PDF", variant="primary"
+ )
+ update_current_page_redactions_btn = gr.Button(
+ value="Save changes on current page to file",
+ variant="secondary",
+ )
+
+ with gr.Tab("Modify redactions", id=3):
+ with gr.Accordion("Search suggested redactions", open=True):
+ with gr.Row(equal_height=True):
+ recogniser_entity_dropdown = gr.Dropdown(
+ label="Redaction category",
+ value="ALL",
+ allow_custom_value=True,
+ )
+ page_entity_dropdown = gr.Dropdown(
+ label="Page", value="ALL", allow_custom_value=True
+ )
+ text_entity_dropdown = gr.Dropdown(
+ label="Text", value="ALL", allow_custom_value=True
+ )
+ reset_dropdowns_btn = gr.Button(value="Reset filters")
+ recogniser_entity_dataframe = gr.Dataframe(
+ pd.DataFrame(
+ data={
+ "page": list(),
+ "label": list(),
+ "text": list(),
+ "id": list(),
+ }
+ ),
+ row_count=(0, "dynamic"),
+ type="pandas",
+ label="Click table row to select and go to page",
+ headers=["page", "label", "text", "id"],
+ wrap=True,
+ max_height=400,
+ show_search="filter",
+ )
+
+ with gr.Row(equal_height=True):
+ exclude_selected_btn = gr.Button(
+ value="Exclude all redactions in table"
+ )
+
+ with gr.Accordion("Selected redaction row", open=True):
+ selected_entity_dataframe_row = gr.Dataframe(
+ pd.DataFrame(
+ data={
+ "page": list(),
+ "label": list(),
+ "text": list(),
+ "id": list(),
+ }
+ ),
+ row_count=(0, "dynamic"),
+ type="pandas",
+ visible=True,
+ headers=["page", "label", "text", "id"],
+ wrap=True,
+ )
+ exclude_selected_row_btn = gr.Button(
+ value="Exclude specific redaction row"
+ )
+ exclude_text_with_same_as_selected_row_btn = gr.Button(
+ value="Exclude all redactions with same text as selected row"
+ )
+
+ undo_last_removal_btn = gr.Button(
+ value="Undo last element removal", variant="primary"
+ )
+
+ with gr.Tab("Search text and redact", id=7):
+ with gr.Accordion("Search text", open=True):
+ with gr.Row(equal_height=True):
+ page_entity_dropdown_redaction = gr.Dropdown(
+ label="Page",
+ value="1",
+ allow_custom_value=True,
+ scale=4,
+ )
+ reset_dropdowns_btn_new = gr.Button(
+ value="Reset page filter", scale=1
+ )
+
+ with gr.Row(equal_height=True):
+ multi_word_search_text = gr.Textbox(
+ label="Multi-word text search (regex enabled below)",
+ value="",
+ scale=4,
+ )
+ multi_word_search_text_btn = gr.Button(
+ value="Search", scale=1
+ )
+
+ with gr.Accordion("Search options", open=False):
+ similarity_search_score_minimum = gr.Number(
+ value=1.0,
+ minimum=0.4,
+ maximum=1.0,
+ label="Minimum similarity score for match (max=1)",
+ visible=False,
+ ) # Not used anymore for this exact search
+
+ with gr.Row():
+ with gr.Column():
+ new_redaction_text_label = gr.Textbox(
+ label="Label for new redactions",
+ value="Redaction",
+ )
+ colour_label = gr.ColorPicker(
+ label="Colour for labels",
+ value=CUSTOM_BOX_COLOUR,
+ )
+ with gr.Column():
+ use_regex_search = gr.Checkbox(
+ label="Enable regex pattern matching",
+ value=False,
+ info="When enabled, the search text will be treated as a regular expression pattern instead of literal text",
+ )
+
+ all_page_line_level_ocr_results_with_words_df = (
+ gr.Dataframe(
+ pd.DataFrame(
+ data={
+ "page": list(),
+ "line": list(),
+ "word_text": list(),
+ "index": list(),
+ }
+ ),
+ row_count=(0, "dynamic"),
+ type="pandas",
+ label="Click table row to select and go to page",
+ headers=[
+ "page",
+ "line",
+ "word_text",
+ "index",
+ ],
+ wrap=False,
+ max_height=400,
+ show_search="filter",
+ column_widths=["15%", "15%", "55%", "15%"],
+ )
+ )
+
+ redact_selected_btn = gr.Button(
+ value="Redact all text in table"
+ )
+ reset_ocr_with_words_df_btn = gr.Button(
+ value="Reset table to original state"
+ )
+
+ with gr.Accordion("Selected row", open=True):
+ selected_entity_dataframe_row_redact = gr.Dataframe(
+ pd.DataFrame(
+ data={
+ "page": list(),
+ "line": list(),
+ "word_text": list(),
+ "index": list(),
+ }
+ ),
+ row_count=(0, "dynamic"),
+ type="pandas",
+ headers=[
+ "page",
+ "line",
+ "word_text",
+ "index",
+ ],
+ wrap=False,
+ column_widths=["15%", "15%", "55%", "15%"],
+ )
+ redact_selected_row_btn = gr.Button(
+ value="Redact specific text row"
+ )
+ redact_text_with_same_as_selected_row_btn = gr.Button(
+ value="Redact all words with same text as selected row"
+ )
+
+ undo_last_redact_btn = gr.Button(
+ value="Undo latest redaction", variant="primary"
+ )
+ with gr.Tab("View text", id=10):
+ with gr.Accordion("View extracted text", open=True):
+ all_page_line_level_ocr_results_df = gr.Dataframe(
+ value=pd.DataFrame(columns=["page", "line", "text"]),
+ headers=["page", "line", "text"],
+ row_count=(0, "dynamic"),
+ label="All OCR results",
+ visible=True,
+ type="pandas",
+ wrap=True,
+ show_label=False,
+ buttons=["copy", "fullscreen"],
+ column_widths=["15%", "15%", "70%"],
+ max_height=800,
+ show_search="search",
+ )
+ reset_all_ocr_results_btn = gr.Button(
+ value="Reset OCR output table filter"
+ )
+ selected_ocr_dataframe_row = gr.Dataframe(
+ pd.DataFrame(
+ data={
+ "page": list(),
+ "line": list(),
+ "text": list(),
+ }
+ ),
+ column_count=3,
+ type="pandas",
+ visible=False,
+ headers=["page", "line", "text"],
+ wrap=True,
+ )
+
+ with gr.Accordion(
+ "Convert review files loaded above to Adobe format, or convert from Adobe format to review file",
+ open=False,
+ ):
+ convert_review_file_to_adobe_btn = gr.Button(
+ "Convert review file to Adobe comment format", variant="primary"
+ )
+ adobe_review_files_out = gr.File(
+ label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.",
+ file_count="multiple",
+ file_types=[".csv", ".xfdf", ".pdf"],
+ )
+ convert_adobe_to_review_file_btn = gr.Button(
+ "Convert Adobe .xfdf comment file to review_file.csv",
+ variant="secondary",
+ )
+
+ ###
+ # IDENTIFY DUPLICATE PAGES TAB
+ ###
+ with gr.Tab(label="Identify duplicate pages", id=4):
+ gr.Markdown(
+ "Search for duplicate pages/subdocuments in your ocr_output files. By default, this function will search for duplicate text across multiple pages, and then join consecutive matching pages together into matched 'subdocuments'. The results can be reviewed below, false positives removed, and then the verified results applied to a document you have loaded in on the 'Review redactions' tab."
+ )
+
+ # Examples for duplicate page detection
+ if SHOW_EXAMPLES:
+ gr.Markdown(
+ "### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
+ )
+
+ # Check if duplicate example file exists
+ duplicate_example_file = "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
+
+ if os.path.exists(duplicate_example_file):
+
+ duplicate_examples = gr.Examples(
+ examples=[
+ [
+ [duplicate_example_file],
+ 0.95,
+ 10,
+ True,
+ ],
+ [
+ [duplicate_example_file],
+ 0.95,
+ 3,
+ False,
+ ],
+ ],
+ inputs=[
+ in_duplicate_pages,
+ duplicate_threshold_input,
+ min_word_count_input,
+ combine_page_text_for_duplicates_bool,
+ ],
+ example_labels=[
+ "Find duplicate pages of text in document OCR outputs",
+ "Find duplicate text lines in document OCR outputs",
+ ],
+ fn=show_duplicate_info_box_on_click,
+ run_on_click=True,
+ cache_examples=False,
+ )
+
+ with gr.Accordion("Step 1: Configure and run analysis", open=True):
+ in_duplicate_pages.render()
+
+ with gr.Accordion("Duplicate matching parameters", open=False):
+ with gr.Row():
+ duplicate_threshold_input.render()
+
+ min_word_count_input.render()
+
+ combine_page_text_for_duplicates_bool.render()
+
+ gr.Markdown("#### Matching Strategy")
+ greedy_match_input = gr.Checkbox(
+ label="Combine consecutive matches into a single match (subdocument match)",
+ value=USE_GREEDY_DUPLICATE_DETECTION,
+ info="If checked, combines the longest possible sequence of consecutive matching pages into a single match.",
+ )
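+ # Initially visible only when greedy subdocument matching is disabled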
+ min_consecutive_pages_input = gr.Slider(
+ minimum=1,
+ maximum=20,
+ value=DEFAULT_MIN_CONSECUTIVE_PAGES,
+ step=1,
+ label="Minimum consecutive matches to be considered a match",
+ info="A text match will need to have this minimum number of consecutive matches to be considered a match. E.g. if set to 3 for page matching, the text for three consecutive pages will need to be the same in two places in the document to be considered a match.",
+ visible=not USE_GREEDY_DUPLICATE_DETECTION,
+ )
+
+ find_duplicate_pages_btn = gr.Button(
+ value="Identify duplicate pages/subdocuments",
+ variant="primary",
+ elem_id="duplicate-detection-btn",
+ )
+
+ with gr.Accordion("Step 2: Review and refine results", open=True):
+ gr.Markdown(
+ "### Analysis summary\nClick on a row to select it for preview or exclusion."
+ )
+
+ with gr.Row():
+ results_df_preview = gr.Dataframe(
+ label="Similarity Results",
+ headers=[
+ "Page1_File",
+ "Page1_Start_Page",
+ "Page1_End_Page",
+ "Page2_File",
+ "Page2_Start_Page",
+ "Page2_End_Page",
+ "Match_Length",
+ "Avg_Similarity",
+ "Page1_Text",
+ "Page2_Text",
+ ],
+ wrap=True,
+ show_search="search",
+ )
+ with gr.Row():
+ exclude_match_btn = gr.Button(
+ value="❌ Exclude Selected Match", variant="stop"
+ )
+ gr.Markdown(
+ "Click a row in the table, then click this button to remove it from the results and update the downloadable files."
+ )
+
+ gr.Markdown("### Full Text Preview of Selected Match")
+ with gr.Row():
+ page1_text_preview = gr.Dataframe(
+ label="Match Source (Document 1)",
+ wrap=True,
+ headers=["page", "text"],
+ show_search="search",
+ )
+ page2_text_preview = gr.Dataframe(
+ label="Match Duplicate (Document 2)",
+ wrap=True,
+ headers=["page", "text"],
+ show_search="search",
+ )
+
+ gr.Markdown("### Downloadable Files")
+ duplicate_files_out = gr.File(
+ label="Download analysis summary and redaction lists (.csv)",
+ file_count="multiple",
+ height=FILE_INPUT_HEIGHT,
+ )
+
+ with gr.Row():
+ apply_match_btn = gr.Button(
+ value="Apply relevant duplicate page output to document currently under review",
+ variant="secondary",
+ elem_id="apply-duplicate-pages-btn",
+ )
+ go_to_review_redactions_tab_btn_2 = gr.Button(
+ "Review and modify redactions", variant="primary", scale=1
+ )
+
+ ###
+ # WORD / TABULAR DATA TAB
+ ###
+ with gr.Tab(label="Word or Excel/CSV files", id=5):
+
+ gr.Markdown(
+ """Choose a Word or tabular data file (xlsx or csv) to redact. Note that when redacting complex Word files with e.g. images, some content/formatting will be removed, and it may not attempt to redact headers. You may prefer to convert the doc file to PDF in Word, and then run it through the first tab of this app (Print to PDF in print settings). Alternatively, an xlsx file output is provided when redacting docx files directly to allow for copying and pasting outputs back into the original document if preferred."""
+ )
+
+ # Examples for Word/Excel/csv redaction and tabular duplicate detection
+ if SHOW_EXAMPLES:
+ gr.Markdown(
+ "### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
+ )
+
+ # Check which tabular example files exist
+ tabular_example_files = [
+ "example_data/combined_case_notes.csv",
+ "example_data/Bold minimalist professional cover letter.docx",
+ "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv",
+ ]
+
+ available_tabular_examples = list()
+ tabular_example_labels = list()
+
+ # Check each tabular example file and add to examples if it exists
+ if os.path.exists(tabular_example_files[0]):
+ available_tabular_examples.append(
+ [
+ [tabular_example_files[0]],
+ ["Case Note", "Client"],
+ "Local",
+ "replace with 'REDACTED'",
+ [tabular_example_files[0]],
+ ["Case Note"],
+ 3,
+ ]
+ )
+ tabular_example_labels.append(
+ "CSV file redaction with specific columns - remove text"
+ )
+
+ if os.path.exists(tabular_example_files[1]):
+ available_tabular_examples.append(
+ [
+ [tabular_example_files[1]],
+ [],
+ "Local",
+ "replace with 'REDACTED'",
+ [],
+ [],
+ 3,
+ ]
+ )
+ tabular_example_labels.append(
+ "Word document redaction - replace with REDACTED"
+ )
+
+ if os.path.exists(tabular_example_files[2]):
+ available_tabular_examples.append(
+ [
+ [tabular_example_files[2]],
+ ["text"],
+ "Local",
+ "replace with 'REDACTED'",
+ [tabular_example_files[2]],
+ ["text"],
+ 3,
+ ]
+ )
+ tabular_example_labels.append(
+ "Tabular duplicate detection in CSV files"
+ )
+
+ # When RUN_ALL_EXAMPLES_THROUGH_AWS is enabled, switch the PII method to AWS Comprehend for the tabular examples
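+ # (index 2 of each example is the PII identification method, the third entry in the
+ # gr.Examples inputs list below)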
+ if RUN_ALL_EXAMPLES_THROUGH_AWS:
+ for ex in available_tabular_examples:
+ ex[2] = AWS_PII_OPTION
+
+ # Only create examples if we have available files
+ if available_tabular_examples:
+
+ tabular_examples = gr.Examples(
+ examples=available_tabular_examples,
+ inputs=[
+ in_data_files,
+ in_colnames,
+ pii_identification_method_drop_tabular,
+ anon_strategy,
+ in_tabular_duplicate_files,
+ tabular_text_columns,
+ tabular_min_word_count,
+ ],
+ outputs=[
+ walkthrough_file_input,
+ walkthrough_pii_identification_method_drop_tabular,
+ walkthrough_anon_strategy,
+ ],
+ example_labels=tabular_example_labels,
+ fn=show_tabular_info_box_on_click,
+ run_on_click=True,
+ cache_examples=False,
+ )
+
+ with gr.Accordion(
+ "Redact Word or Excel/CSV files options",
+ open=show_main_redaction_accordion,
+ ):
+ with gr.Accordion("Upload docx, xlsx, or csv files", open=True):
+ in_data_files.render()
+ with gr.Accordion("Redact open text", open=False):
+ in_text = gr.Textbox(
+ label="Enter open text",
+ lines=10,
+ max_length=MAX_OPEN_TEXT_CHARACTERS,
+ )
+
+ in_excel_sheets.render()
+
+ in_colnames.render()
+
+ pii_identification_method_drop_tabular.render()
+
+ with gr.Accordion(
+ "Anonymisation output format - by default will replace PII with a blank space",
+ open=False,
+ ):
+ with gr.Row():
+ anon_strategy.render()
+
+ do_initial_clean.render()
+
+ with gr.Accordion(label="Redact Word/data files", open=True):
+ tabular_data_redact_btn = gr.Button(
+ "Redact text/data files",
+ variant="primary",
+ elem_id="tabular-redact-btn",
+ )
+ with gr.Row():
+ text_output_summary = gr.Textbox(label="Output result", lines=4)
+ text_output_file = gr.File(label="Output files")
+ text_tabular_files_done = gr.Number(
+ value=0,
+ label="Number of tabular files redacted",
+ interactive=False,
+ visible=False,
+ )
+
+ ###
+ # TABULAR DUPLICATE DETECTION
+ ###
+ # List of duplicate (Page2) page numbers from run_duplicate_analysis; used by apply_whole_page_redactions_from_list in the upload flow
+ duplicate_pages_list_state = gr.Dropdown(
+ value=[],
+ multiselect=True,
+ allow_custom_value=True,
+ label="Duplicate pages list",
+ visible=False,
+ )
+
+ with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
+ gr.Markdown(
+ """Find duplicate cells or rows in CSV, Excel, or Parquet files. This tool analyses text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files."""
+ )
+
+ with gr.Accordion(
+ "Step 1: Upload files and configure analysis", open=True
+ ):
+ in_tabular_duplicate_files.render()
+
+ with gr.Row(equal_height=True):
+ tabular_duplicate_threshold = gr.Number(
+ value=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
+ label="Similarity threshold",
+ info="Score (0-1) to consider cells a match. 1 = perfect match.",
+ )
+
+ tabular_min_word_count.render()
+
+ do_initial_clean_dup = gr.Checkbox(
+ label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)",
+ value=DO_INITIAL_TABULAR_DATA_CLEAN,
+ )
+ remove_duplicate_rows = gr.Checkbox(
+ label="Remove duplicate rows from deduplicated files",
+ value=REMOVE_DUPLICATE_ROWS,
+ )
+
+ with gr.Row():
+ in_excel_tabular_sheets = gr.Dropdown(
+ choices=list(),
+ multiselect=True,
+ label="Select Excel sheet names that you want to deduplicate (showing sheets present across all Excel files).",
+ visible=True,
+ allow_custom_value=True,
+ )
+
+ tabular_text_columns.render()
+
+ find_tabular_duplicates_btn = gr.Button(
+ value="Find duplicate cells/rows", variant="primary"
+ )
+
+ with gr.Accordion("Step 2: Review results", open=True):
+ gr.Markdown(
+ "### Duplicate Analysis Results\nClick on a row to see more details about the duplicate match."
+ )
+
+ tabular_results_df = gr.Dataframe(
+ label="Duplicate Cell Matches",
+ headers=[
+ "File1",
+ "Row1",
+ "File2",
+ "Row2",
+ "Similarity_Score",
+ "Text1",
+ "Text2",
+ ],
+ wrap=True,
+ show_search="search",
+ )
+
+ with gr.Row(equal_height=True):
+ tabular_selected_row_index = gr.Number(
+ value=None, visible=False
+ )
+ tabular_text1_preview = gr.Textbox(
+ label="Text from File 1", lines=3, interactive=False
+ )
+ tabular_text2_preview = gr.Textbox(
+ label="Text from File 2", lines=3, interactive=False
+ )
+
+ with gr.Accordion("Step 3: Remove duplicates", open=True):
+ gr.Markdown(
+ "### Remove duplicate rows\nSelect a file and click to remove duplicate rows based on the analysis above."
+ )
+
+ with gr.Row():
+ tabular_file_to_clean = gr.Dropdown(
+ choices=list(),
+ label="Select file to clean",
+ info="Choose which file to remove duplicates from",
+ visible=False,
+ )
+ clean_duplicates_btn = gr.Button(
+ value="Remove duplicate rows from selected file",
+ variant="secondary",
+ visible=False,
+ )
+
+ tabular_cleaned_file = gr.File(
+ label="Download cleaned file (duplicates removed)",
+ visible=True,
+ interactive=False,
+ )
+
+ # Feedback elements are invisible until revealed by redaction action
+ data_feedback_title = gr.Markdown(
+ value="## Please give feedback", visible=False
+ )
+ data_feedback_radio = gr.Radio(
+ label="Please give some feedback about the results of the redaction.",
+ choices=["The results were good", "The results were not good"],
+ visible=False,
+ show_label=True,
+ )
+ data_further_details_text = gr.Textbox(
+ label="Please give more detailed feedback about the results:",
+ visible=False,
+ )
+ data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
+
+ ###
+ # DOCUMENT SUMMARISATION TAB
+ ###
+ # Build summarization inference method options based on the same flags used for PII detection
+ # Only show options that are available: AWS_LLM_PII_OPTION, LOCAL_TRANSFORMERS_LLM_PII_OPTION, INFERENCE_SERVER_PII_OPTION
+ summarisation_inference_method_options = []
+ if SHOW_AWS_PII_DETECTION_OPTIONS:
+ summarisation_inference_method_options.append(AWS_LLM_PII_OPTION)
+ if SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS:
+ summarisation_inference_method_options.append(
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ )
+ if SHOW_INFERENCE_SERVER_PII_OPTIONS:
+ summarisation_inference_method_options.append(INFERENCE_SERVER_PII_OPTION)
+
+ # Determine default value
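+ # (preference order: AWS LLM option first, then local transformers, then inference server)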
+ default_summarisation_inference_method = None
+ if summarisation_inference_method_options:
+ if SHOW_AWS_PII_DETECTION_OPTIONS:
+ default_summarisation_inference_method = AWS_LLM_PII_OPTION
+ elif SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS:
+ default_summarisation_inference_method = (
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ )
+ elif SHOW_INFERENCE_SERVER_PII_OPTIONS:
+ default_summarisation_inference_method = INFERENCE_SERVER_PII_OPTION
+ else:
+ default_summarisation_inference_method = (
+ summarisation_inference_method_options[0]
+ )
+
+ # Only show the tab if at least one inference method is available
+ visible_summarisation_tab = SHOW_SUMMARISATION and (
+ SHOW_AWS_PII_DETECTION_OPTIONS
+ or SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS
+ or SHOW_INFERENCE_SERVER_PII_OPTIONS
+ )
+ with gr.Tab(
+ label="Document summarisation", id=8, visible=visible_summarisation_tab
+ ):
+
+ gr.Markdown(
+ """This tab allows you to summarise documents using Large Language Model (LLM)-based summarisation. Upload a PDF or OCR CSV file (from a previous redaction run) to summarise. The summarisation process:
+ 1. Groups pages to fit within the maximum LLM context length, or by a maximum number of pages per group defined below if smaller
+ 2. Summarises each page group
+ 3. Creates an overall summary of the entire document based on the page group summaries
+
+ Large language models can hallucinate or make mistakes - the summaries produced here are intended for informational purposes only and not for further distribution or use.""",
+ line_breaks=True,
+ )
+
+ if SHOW_COSTS:
+ gr.Markdown(
+ "Note that the summarisation process using AWS Bedrock has a cost (approximately $0.50 per 1,000 pages summarised), and this will be charged to the same cost code as the redaction process (see Redact PDFs/images tab to select a cost code)."
+ )
+
+ in_summarisation_ocr_files = gr.File(
+ label="Upload PDF or OCR CSV files to summarise",
+ file_count="multiple",
+ height=FILE_INPUT_HEIGHT,
+ file_types=[".csv", ".pdf"],
+ )
+
+ with gr.Accordion("Summarisation Settings", open=True):
+ with gr.Row():
+ summarisation_inference_method = gr.Radio(
+ label="Choose LLM inference method for summarisation",
+ choices=summarisation_inference_method_options,
+ value=default_summarisation_inference_method,
+ interactive=True,
+ )
+ summarisation_temperature = gr.Slider(
+ label="Temperature",
+ minimum=0.0,
+ maximum=2.0,
+ value=0.6,
+ step=0.1,
+ interactive=True,
+ visible=False,
+ )
+ summarisation_max_pages_per_group = gr.Number(
+ label="Max pages per page-group summary",
+ info="No single page group will exceed this many pages (in addition to context-length token limits).",
+ value=30,
+ minimum=1,
+ maximum=9999,
+ precision=0,
+ interactive=True,
+ visible=True,
+ )
+
+ with gr.Row():
+ summarisation_api_key = gr.Textbox(
+ label="API Key (if required)",
+ type="password",
+ visible=False,
+ )
+ summarisation_context = gr.Textbox(
+ label="Additional context (optional)",
+ placeholder="e.g., 'This is a consultation response document'",
+ lines=2,
+ visible=False,
+ )
+
+ with gr.Row():
+ summarisation_format = gr.Radio(
+ label="Summary format",
+ choices=[
+ concise_summary_format_prompt,
+ detailed_summary_format_prompt,
+ ],
+ value=detailed_summary_format_prompt,
+ interactive=True,
+ )
+ summarisation_additional_instructions = gr.Textbox(
+ label="Additional summary instructions (optional)",
+ placeholder="e.g., 'Focus on key decisions and recommendations'",
+ lines=2,
+ )
+
+ # Note: AWS credentials are shared with the main redaction settings
+ # Use existing components from Settings tab (aws_access_key_textbox, aws_secret_key_textbox)
+ # For other settings not exposed in Settings tab, create hidden components with config defaults
+ summarisation_aws_region_hidden = gr.Textbox(
+ value=AWS_REGION,
+ visible=False,
+ )
+ summarisation_hf_api_key_hidden = gr.Textbox(
+ value="", # Not exposed in Settings tab, use empty string
+ visible=False,
+ )
+ summarisation_azure_endpoint_hidden = gr.Textbox(
+ value=AZURE_OPENAI_INFERENCE_ENDPOINT,
+ visible=False,
+ )
+ summarisation_api_url_hidden = gr.Textbox(
+ value=INFERENCE_SERVER_API_URL,
+ visible=False,
+ )
+
+ summarise_btn = gr.Button(
+ "Generate summary",
+ variant="primary",
+ elem_id="summarise-document-btn",
+ )
+
+ with gr.Row(equal_height=True):
+ summarisation_status = gr.Textbox(
+ label="Status",
+ lines=3,
+ interactive=False,
+ )
+ summarisation_output_files = gr.File(
+ label="Download Summary Files",
+ file_count="multiple",
+ interactive=False,
+ )
+
+ summarisation_display = gr.Markdown(
+ label="Summary",
+ value="",
+ line_breaks=True,
+ buttons=["copy"],
+ visible=True,
+ )
+
+ ###
+ # SETTINGS TAB
+ ###
+ with gr.Tab(label="Settings", id=9):
+ with gr.Accordion(
+ "Custom allow, deny, and full page redaction lists", open=True
+ ):
+ with gr.Row():
+ with gr.Column():
+ in_allow_list = gr.File(
+ label="Import allow list file - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will not be redacted.",
+ file_count="multiple",
+ height=FILE_INPUT_HEIGHT,
+ )
+ in_allow_list_text = gr.Textbox(
+ label="Custom allow list load status"
+ )
+ with gr.Column():
+ in_deny_list.render() # Defined at beginning of file
+ in_deny_list_text = gr.Textbox(
+ label="Custom deny list load status"
+ )
+ with gr.Column():
+ in_fully_redacted_list.render() # Defined at beginning of file
+ in_fully_redacted_list_text = gr.Textbox(
+ label="Fully redacted page list load status"
+ )
+
+ with gr.Row():
+ with gr.Column(scale=2):
+ markdown_placeholder = gr.Markdown("")
+ with gr.Column(scale=1):
+ apply_fully_redacted_list_btn = gr.Button(
+ value="Apply whole page redaction list to document currently under review",
+ variant="secondary",
+ )
+
+ with gr.Accordion(
+ "Select entity types to redact", open=True, visible=False
+ ):
+
+ with gr.Row():
+
+ match_fuzzy_whole_phrase_bool = gr.Checkbox(
+ label="Should fuzzy search match on entire phrases in deny list (as opposed to each word individually)?",
+ value=True,
+ visible=False,
+ )
+
+ with gr.Accordion("Redact only selected pages", open=False):
+ with gr.Row():
+ page_min.render()
+ page_max.render()
+
+ with gr.Accordion("Advanced OCR settings", open=False):
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=5):
+ with gr.Row():
+ efficient_ocr_checkbox = gr.Checkbox(
+ label="Use efficient OCR",
+ value=EFFICIENT_OCR,
+ )
+ efficient_ocr_min_words_number = gr.Number(
+ label="Minimum words on page to run text-only extraction with efficient OCR",
+ value=EFFICIENT_OCR_MIN_WORDS,
+ precision=0,
+ minimum=0,
+ step=1,
+ )
+ efficient_ocr_min_image_coverage_number = gr.Number(
+ label="Min. page-area fraction for an embedded image to force OCR (0 = word count only)",
+ value=EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION,
+ precision=3,
+ minimum=0.0,
+ maximum=1.0,
+ step=0.005,
+ )
+ with gr.Column(scale=1):
+ overwrite_existing_ocr_checkbox = gr.Checkbox(
+ label="Always overwrite existing OCR results for new redaction tasks",
+ value=OVERWRITE_EXISTING_OCR_RESULTS,
+ )
+ with gr.Column(scale=1):
+ save_page_ocr_visualisations_checkbox = gr.Checkbox(
+ label="Save page OCR visualisations (debug bounding boxes)",
+ value=SAVE_PAGE_OCR_VISUALISATIONS,
+ )
+ with gr.Column(scale=1):
+ high_quality_textract_ocr_checkbox = gr.Checkbox(
+ label="High-quality Textract OCR (re-run low-confidence lines with Bedrock VLM for higher quality)",
+ value=HYBRID_TEXTRACT_BEDROCK_VLM,
+ visible=SHOW_AWS_TEXT_EXTRACTION_OPTIONS
+ and SHOW_HYBRID_TEXTRACT_BEDROCK_CHECKBOX,
+ )
+
+ with gr.Accordion(
+ "Language selection", open=False, visible=SHOW_LANGUAGE_SELECTION
+ ):
+ gr.Markdown(
+ """Note that AWS Textract is compatible with English, Spanish, Italian, Portuguese, French, and German, and handwriting detection is only available in English. AWS Comprehend for detecting PII is only compatible with English and Spanish.
+ The local models (Tesseract and SpaCy) are compatible with the other languages in the list below. However, the language packs for these models need to be installed on your system. When you first run a document through the app, the language packs will be downloaded automatically, but please expect a delay as the models are large."""
+ )
+ with gr.Row():
+ chosen_language_full_name_drop = gr.Dropdown(
+ value=DEFAULT_LANGUAGE_FULL_NAME,
+ choices=MAPPED_LANGUAGE_CHOICES,
+ label="Chosen language",
+ multiselect=False,
+ visible=True,
+ )
+ chosen_language_drop = gr.Dropdown(
+ value=DEFAULT_LANGUAGE,
+ choices=LANGUAGE_CHOICES,
+ label="Chosen language short code",
+ multiselect=False,
+ visible=True,
+ interactive=False,
+ )
+
+ with gr.Accordion(
+ "Use API keys for AWS services", open=False, visible=SHOW_AWS_API_KEYS
+ ):
+ with gr.Row():
+ aws_access_key_textbox = gr.Textbox(
+ value="",
+ label="AWS access key for account with permissions for AWS Textract and Comprehend",
+ visible=True,
+ type="password",
+ )
+ aws_secret_key_textbox = gr.Textbox(
+ value="",
+ label="AWS secret key for account with permissions for AWS Textract and Comprehend",
+ visible=True,
+ type="password",
+ )
+
+ with gr.Accordion("Log file outputs", open=False):
+ log_files_output = gr.File(label="Log file output", interactive=False)
+
+ with gr.Accordion(
+ "S3 output settings", open=False, visible=SAVE_OUTPUTS_TO_S3
+ ):
+ save_outputs_to_s3_checkbox = gr.Checkbox(
+ label="Save redaction outputs to S3",
+ value=SAVE_OUTPUTS_TO_S3,
+ visible=SAVE_OUTPUTS_TO_S3,
+ )
+ s3_output_folder_display = gr.Textbox(
+ label="S3 outputs folder",
+ value="",
+ interactive=False,
+ visible=SAVE_OUTPUTS_TO_S3,
+ )
+
+ with gr.Accordion("Combine multiple review PDFs or CSV files", open=False):
+ gr.Markdown(
+ "Upload multiple '_redactions_for_review' PDFs from the same base document. "
+ "All files must share the same base file name and page count. "
+ "Comments from all files will be merged into one PDF: base_name_redactions_for_review_combined.pdf"
+ )
+ combine_review_pdfs_in_out = gr.File(
+ label="Combine multiple _redactions_for_review PDFs",
+ file_count="multiple",
+ file_types=[".pdf"],
+ )
+ combine_review_pdfs_btn = gr.Button(
+ "Combine review PDFs into one", variant="primary"
+ )
+
+ multiple_review_files_in_out = gr.File(
+ label="Combine multiple review_file.csv files together here.",
+ file_count="multiple",
+ file_types=[".csv"],
+ )
+ merge_multiple_review_files_btn = gr.Button(
+ "Merge multiple review files into one", variant="primary"
+ )
+
+ if SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER:
+ with gr.Accordion(
+ "View all and download all output files from this session",
+ open=False,
+ ):
+ all_output_files_btn.render()
+ all_output_files.render()
+ all_outputs_file_download.render()
+ else:
+ all_output_files_btn.render()
+ all_output_files.render()
+ all_outputs_file_download.render()
+
+ ###
+ # UI INTERACTION
+ ###
+
+ ###
+ # PDF/IMAGE REDACTION
+ ###
+ # Wrappers to duplicate main cost/time values into walkthrough components (used by upload and cost handlers when SHOW_COSTS)
+ def _get_document_file_names_with_walkthrough(files):
+ r = get_document_file_names(files)
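+ # r[4] is the total PDF page count (see _doc_upload_outputs below); returning it twice
+ # lets the same upload event also populate the walkthrough page count field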
+ return (*r, r[4])
+
+ def _prepare_image_or_pdf_with_walkthrough_sync(
+ file_paths,
+ text_extract_method,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ out_message,
+ first_loop_state,
+ number_of_pages,
+ all_annotations_object,
+ prepare_for_review,
+ in_fully_redacted_list,
+ output_folder,
+ input_folder,
+ efficient_ocr,
+ prepare_images_bool_false,
+ page_sizes,
+ pymupdf_doc,
+ page_min,
+ page_max,
+ ):
+ r = prepare_image_or_pdf_with_efficient_ocr(
+ file_paths,
+ text_extract_method,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ out_message,
+ first_loop_state,
+ number_of_pages,
+ all_annotations_object,
+ prepare_for_review,
+ in_fully_redacted_list,
+ output_folder,
+ input_folder,
+ efficient_ocr,
+ prepare_images_bool_false,
+ page_sizes,
+ pymupdf_doc,
+ page_min,
+ page_max,
+ )
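+ # r[10] and r[13] are the 'Textract output found' and 'local OCR output found' flags
+ # (see _prepare_outputs below); they are appended again to update the walkthrough checkboxes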
+ return (*r, r[10], r[13])
+
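+ # The *_sync wrappers below return the same flag twice so one check updates both the main
+ # checkbox and its walkthrough mirror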
+ def _check_for_existing_textract_file_sync(doc_name, output_folder, handwrite):
+ x = check_for_existing_textract_file(doc_name, output_folder, handwrite)
+ return (x, x)
+
+ def _check_for_relevant_ocr_output_with_words_sync(
+ doc_name, text_extract, output_folder
+ ):
+ x = check_for_relevant_ocr_output_with_words(
+ doc_name, text_extract, output_folder
+ )
+ return (x, x)
+
+ # Recalculate estimated costs based on changes to inputs
+ if SHOW_COSTS:
+
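+ # These wrappers return each estimate twice so a single .change event updates both the
+ # main and walkthrough cost/time fields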
+ def _calculate_aws_costs_sync(
+ number_of_pages,
+ text_extract_method_radio,
+ handwrite_signature_checkbox,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ ):
+ cost = calculate_aws_costs(
+ number_of_pages,
+ text_extract_method_radio,
+ handwrite_signature_checkbox,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ )
+ return (cost, cost)
+
+ def _calculate_time_taken_sync(
+ number_of_pages,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ relevant_ocr_output_with_words_found_checkbox,
+ handwrite_signature_checkbox,
+ ):
+ t = calculate_time_taken(
+ number_of_pages,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ relevant_ocr_output_with_words_found_checkbox,
+ handwrite_signature_checkbox,
+ )
+ return (t, t)
+
+ # Calculate costs
+ total_pdf_page_count.change(
+ _calculate_aws_costs_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ handwrite_signature_checkbox,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ ],
+ outputs=[
+ estimated_aws_costs_number,
+ walkthrough_estimated_aws_costs_number,
+ ],
+ )
+ text_extract_method_radio.change(
+ fn=_check_for_relevant_ocr_output_with_words_sync,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ text_extract_method_radio,
+ output_folder_textbox,
+ ],
+ outputs=[
+ relevant_ocr_output_with_words_found_checkbox,
+ walkthrough_relevant_ocr_output_with_words_found_checkbox,
+ ],
+ ).success(
+ _calculate_aws_costs_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ handwrite_signature_checkbox,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ ],
+ outputs=[
+ estimated_aws_costs_number,
+ walkthrough_estimated_aws_costs_number,
+ ],
+ )
+ pii_identification_method_drop.change(
+ _calculate_aws_costs_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ handwrite_signature_checkbox,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ ],
+ outputs=[
+ estimated_aws_costs_number,
+ walkthrough_estimated_aws_costs_number,
+ ],
+ )
+ handwrite_signature_checkbox.change(
+ fn=_check_for_existing_textract_file_sync,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ output_folder_textbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[
+ textract_output_found_checkbox,
+ walkthrough_textract_output_found_checkbox,
+ ],
+ ).then(
+ _calculate_aws_costs_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ handwrite_signature_checkbox,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ ],
+ outputs=[
+ estimated_aws_costs_number,
+ walkthrough_estimated_aws_costs_number,
+ ],
+ )
+ textract_output_found_checkbox.change(
+ _calculate_aws_costs_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ handwrite_signature_checkbox,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ ],
+ outputs=[
+ estimated_aws_costs_number,
+ walkthrough_estimated_aws_costs_number,
+ ],
+ )
+ only_extract_text_radio.change(
+ _calculate_aws_costs_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ handwrite_signature_checkbox,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ ],
+ outputs=[
+ estimated_aws_costs_number,
+ walkthrough_estimated_aws_costs_number,
+ ],
+ )
+
+ # Calculate time taken
+ total_pdf_page_count.change(
+ _calculate_time_taken_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ relevant_ocr_output_with_words_found_checkbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[
+ estimated_time_taken_number,
+ walkthrough_estimated_time_taken_number,
+ ],
+ )
+ text_extract_method_radio.change(
+ _calculate_time_taken_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ relevant_ocr_output_with_words_found_checkbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[
+ estimated_time_taken_number,
+ walkthrough_estimated_time_taken_number,
+ ],
+ )
+ pii_identification_method_drop.change(
+ _calculate_time_taken_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ relevant_ocr_output_with_words_found_checkbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[
+ estimated_time_taken_number,
+ walkthrough_estimated_time_taken_number,
+ ],
+ )
+ handwrite_signature_checkbox.change(
+ fn=_check_for_existing_textract_file_sync,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ output_folder_textbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[
+ textract_output_found_checkbox,
+ walkthrough_textract_output_found_checkbox,
+ ],
+ ).then(
+ _calculate_time_taken_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ relevant_ocr_output_with_words_found_checkbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[
+ estimated_time_taken_number,
+ walkthrough_estimated_time_taken_number,
+ ],
+ )
+ textract_output_found_checkbox.change(
+ _calculate_time_taken_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ relevant_ocr_output_with_words_found_checkbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[
+ estimated_time_taken_number,
+ walkthrough_estimated_time_taken_number,
+ ],
+ )
+ only_extract_text_radio.change(
+ _calculate_time_taken_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ relevant_ocr_output_with_words_found_checkbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[
+ estimated_time_taken_number,
+ walkthrough_estimated_time_taken_number,
+ ],
+ )
+ relevant_ocr_output_with_words_found_checkbox.change(
+ _calculate_time_taken_sync,
+ inputs=[
+ total_pdf_page_count,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ relevant_ocr_output_with_words_found_checkbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[
+ estimated_time_taken_number,
+ walkthrough_estimated_time_taken_number,
+ ],
+ )
+
+ text_extract_method_radio.change(
+ fn=auto_set_local_ocr_for_bedrock_vlm,
+ inputs=[text_extract_method_radio],
+ outputs=[local_ocr_method_radio],
+ )
+
+ # Update visibility of OCR-related accordions based on text extraction method selection
+ text_extract_method_radio.change(
+ fn=handle_main_text_extract_method_selection,
+ inputs=[text_extract_method_radio],
+ outputs=[
+ local_ocr_accordion,
+ inference_server_vlm_accordion,
+ aws_textract_signature_accordion,
+ ],
+ )
+
+ redaction_method_radio.change(
+ fn=handle_main_redaction_method_selection,
+ inputs=[redaction_method_radio, pii_identification_method_drop],
+ outputs=[
+ pii_identification_method_drop,
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ in_redact_llm_entities,
+ custom_llm_entities_accordion,
+ walkthrough_list_accordion,
+ max_fuzzy_spelling_mistakes_num,
+ entity_types_to_redact_accordion,
+ terms_accordion,
+ only_extract_text_radio,
+ ],
+ )
+
+ # Update visibility of PII-related accordions based on PII method selection
+ pii_identification_method_drop.change(
+ fn=handle_main_pii_method_selection,
+ inputs=[pii_identification_method_drop],
+ outputs=[
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ in_redact_llm_entities,
+ custom_llm_entities_accordion,
+ ],
+ )
+
+ # Allow user to select items from cost code dataframe for cost code
+ if SHOW_COSTS and (GET_COST_CODES or ENFORCE_COST_CODES):
+ cost_code_dataframe.select(
+ df_select_callback_cost,
+ inputs=[cost_code_dataframe],
+ outputs=[cost_code_choice_drop],
+ )
+ reset_cost_code_dataframe_button.click(
+ reset_base_dataframe,
+ inputs=[cost_code_dataframe_base],
+ outputs=[cost_code_dataframe],
+ )
+
+ cost_code_choice_drop.select(
+ update_cost_code_dataframe_from_dropdown_select,
+ inputs=[cost_code_choice_drop, cost_code_dataframe_base],
+ outputs=[cost_code_dataframe],
+ )
+
+ def _save_default_cost_code_and_notify(
+ session_hash, cost_code_choice, cost_code_df, output_folder
+ ):
+ msg = save_default_cost_code_for_session(
+ session_hash, cost_code_choice, cost_code_df, output_folder
+ )
+ gr.Info(msg)
+
+ set_default_cost_code_button.click(
+ _save_default_cost_code_and_notify,
+ inputs=[
+ session_hash_textbox,
+ cost_code_choice_drop,
+ cost_code_dataframe,
+ input_folder_textbox,
+ ],
+ outputs=[],
+ )
+
+ # Uploading a file writes to state variables
+ _doc_upload_fn = (
+ _get_document_file_names_with_walkthrough
+ if SHOW_COSTS
+ else get_document_file_names
+ )
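+ # When SHOW_COSTS is enabled the *_with_walkthrough wrappers return extra values, so the
+ # matching walkthrough components are appended to the corresponding output lists below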
+ _doc_upload_outputs = [
+ doc_file_name_no_extension_textbox,
+ doc_file_name_with_extension_textbox,
+ doc_full_file_name_textbox,
+ doc_file_name_textbox_list,
+ total_pdf_page_count,
+ ]
+ if SHOW_COSTS:
+ _doc_upload_outputs = _doc_upload_outputs + [walkthrough_total_pdf_page_count]
+ _prepare_fn = (
+ _prepare_image_or_pdf_with_walkthrough_sync
+ if SHOW_COSTS
+ else prepare_image_or_pdf_with_efficient_ocr
+ )
+ _prepare_outputs = [
+ redaction_output_summary_textbox,
+ prepared_pdf_state,
+ images_pdf_state,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ pdf_doc_state,
+ all_image_annotations_state,
+ review_file_df,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found_checkbox,
+ all_img_details_state,
+ all_page_line_level_ocr_results_df_base,
+ relevant_ocr_output_with_words_found_checkbox,
+ all_page_line_level_ocr_results_with_words_df_base,
+ ]
+ if SHOW_COSTS:
+ _prepare_outputs = _prepare_outputs + [
+ walkthrough_textract_output_found_checkbox,
+ walkthrough_relevant_ocr_output_with_words_found_checkbox,
+ ]
+ _textract_check_fn = (
+ _check_for_existing_textract_file_sync
+ if SHOW_COSTS
+ else check_for_existing_textract_file
+ )
+ _textract_check_outputs = (
+ [textract_output_found_checkbox, walkthrough_textract_output_found_checkbox]
+ if SHOW_COSTS
+ else [textract_output_found_checkbox]
+ )
+ _ocr_check_fn = (
+ _check_for_relevant_ocr_output_with_words_sync
+ if SHOW_COSTS
+ else check_for_relevant_ocr_output_with_words
+ )
+ _ocr_check_outputs = (
+ [
+ relevant_ocr_output_with_words_found_checkbox,
+ walkthrough_relevant_ocr_output_with_words_found_checkbox,
+ ]
+ if SHOW_COSTS
+ else [relevant_ocr_output_with_words_found_checkbox]
+ )
+ in_doc_files.upload(
+ fn=_doc_upload_fn,
+ inputs=[in_doc_files],
+ outputs=_doc_upload_outputs,
+ ).success(
+ fn=_prepare_fn,
+ inputs=[
+ in_doc_files,
+ text_extract_method_radio,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ first_loop_state,
+ annotate_max_pages,
+ all_image_annotations_state,
+ prepare_for_review_bool_false,
+ in_fully_redacted_list_state,
+ output_folder_textbox,
+ input_folder_textbox,
+ efficient_ocr_checkbox,
+ prepare_images_bool_false,
+ page_sizes,
+ pdf_doc_state,
+ page_min,
+ page_max,
+ ],
+ outputs=_prepare_outputs,
+ show_progress_on=[redaction_output_summary_textbox],
+ ).success(
+ fn=_textract_check_fn,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ output_folder_textbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=_textract_check_outputs,
+ ).success(
+ fn=_ocr_check_fn,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ text_extract_method_radio,
+ output_folder_textbox,
+ ],
+ outputs=_ocr_check_outputs,
+ )
+
+ # Same process as above for walkthrough file input
+ walkthrough_file_input.upload(
+ fn=_doc_upload_fn,
+ inputs=[walkthrough_file_input],
+ outputs=_doc_upload_outputs,
+ ).success(
+ fn=_prepare_fn,
+ inputs=[
+ walkthrough_file_input,
+ text_extract_method_radio,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ first_loop_state,
+ annotate_max_pages,
+ all_image_annotations_state,
+ prepare_for_review_bool_false,
+ in_fully_redacted_list_state,
+ output_folder_textbox,
+ input_folder_textbox,
+ efficient_ocr_checkbox,
+ prepare_images_bool_false,
+ page_sizes,
+ pdf_doc_state,
+ page_min,
+ page_max,
+ ],
+ outputs=_prepare_outputs,
+ show_progress_on=[redaction_output_summary_textbox],
+ ).success(
+ fn=_textract_check_fn,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ output_folder_textbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=_textract_check_outputs,
+ ).success(
+ fn=_ocr_check_fn,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ text_extract_method_radio,
+ output_folder_textbox,
+ ],
+ outputs=_ocr_check_outputs,
+ )
+
+ ###
+ # Run redaction function from walkthrough button or main redaction tab button
+ ###
+
+ # Log processing usage: time taken for redaction queries, plus counts of queries made to Textract/Comprehend
+ usage_callback = CSVLogger_custom(dataset_file_name=USAGE_LOG_FILE_NAME)
+
+ if DISPLAY_FILE_NAMES_IN_LOGS:
+ usage_callback.setup(
+ [
+ session_hash_textbox,
+ doc_file_name_no_extension_textbox,
+ data_file_name_with_extension_textbox,
+ total_pdf_page_count,
+ actual_time_taken_number,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ],
+ USAGE_LOGS_FOLDER,
+ )
+ else:
+ usage_callback.setup(
+ [
+ session_hash_textbox,
+ blank_doc_file_name_no_extension_textbox_for_logs,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ total_pdf_page_count,
+ actual_time_taken_number,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ],
+ USAGE_LOGS_FOLDER,
+ )
+
+ ## From walkthrough tab button
+ step_4_next_document_redact_btn.click(
+ change_tab_to_tabular_or_document_redactions,
+ inputs=walkthrough_is_data_file,
+ outputs=tabs,
+ ).then(
+ fn=reset_state_vars,
+ outputs=[
+ all_image_annotations_state,
+ all_page_line_level_ocr_results_df_base,
+ all_decision_process_table_state,
+ comprehend_query_number,
+ textract_metadata_textbox,
+ annotator,
+ output_file_list_state,
+ log_files_output_list_state,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ pdf_doc_state,
+ duplication_file_path_outputs_list_state,
+ redaction_output_summary_textbox,
+ is_a_textract_api_call,
+ textract_query_number,
+ all_page_line_level_ocr_results_with_words,
+ input_review_files,
+ latest_file_completed_num,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ ],
+ ).success(
+ fn=enforce_cost_codes,
+ inputs=[
+ enforce_cost_code_bool,
+ cost_code_choice_drop,
+ cost_code_dataframe_base,
+ ],
+ ).success(
+ fn=choose_and_run_redactor,
+ inputs=[
+ in_doc_files,
+ prepared_pdf_state,
+ images_pdf_state,
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ in_redact_llm_entities,
+ text_extract_method_radio,
+ in_allow_list_state,
+ in_deny_list_state,
+ in_fully_redacted_list_state,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ output_file_list_state,
+ log_files_output_list_state,
+ first_loop_state,
+ page_min,
+ page_max,
+ actual_time_taken_number,
+ handwrite_signature_checkbox,
+ textract_metadata_textbox,
+ all_image_annotations_state,
+ all_page_line_level_ocr_results_df_base,
+ all_decision_process_table_state,
+ pdf_doc_state,
+ current_loop_page_number,
+ page_break_return,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ annotate_max_pages,
+ review_file_df,
+ output_folder_textbox,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ duplication_file_path_outputs_list_state,
+ latest_review_file_path,
+ input_folder_textbox,
+ textract_query_number,
+ latest_ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ all_page_line_level_ocr_results_with_words_df_base,
+ local_ocr_method_radio,
+ chosen_language_drop,
+ input_review_files,
+ custom_llm_instructions_textbox,
+ inference_server_vlm_model_textbox,
+ efficient_ocr_checkbox,
+ efficient_ocr_min_words_number,
+ efficient_ocr_min_image_coverage_number,
+ high_quality_textract_ocr_checkbox,
+ overwrite_existing_ocr_checkbox,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ save_page_ocr_visualisations_checkbox,
+ ],
+ outputs=[
+ redaction_output_summary_textbox,
+ output_file,
+ output_file_list_state,
+ latest_file_completed_num,
+ log_files_output,
+ log_files_output_list_state,
+ actual_time_taken_number,
+ textract_metadata_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ current_loop_page_number,
+ page_break_return,
+ all_page_line_level_ocr_results_df_base,
+ all_decision_process_table_state,
+ comprehend_query_number,
+ input_pdf_for_review,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ prepared_pdf_state,
+ images_pdf_state,
+ review_file_df,
+ page_sizes,
+ duplication_file_path_outputs_list_state,
+ in_duplicate_pages,
+ in_summarisation_ocr_files,
+ latest_review_file_path,
+ textract_query_number,
+ latest_ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ all_page_line_level_ocr_results_with_words_df_base,
+ backup_review_state,
+ task_textbox,
+ input_review_files,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ total_pdf_page_count,
+ ],
+ api_name="redact_doc",
+ show_progress_on=[redaction_output_summary_textbox],
+ ).success(
+ fn=lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ inputs=(
+ [
+ session_hash_textbox,
+ doc_file_name_no_extension_textbox,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ if DISPLAY_FILE_NAMES_IN_LOGS
+ else [
+ session_hash_textbox,
+ placeholder_doc_file_name_no_extension_textbox_for_logs,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ ),
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ api_name="usage_logs",
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ ).success(
+ fn=export_outputs_to_s3,
+ inputs=[
+ output_file_list_state,
+ s3_output_folder_state,
+ save_outputs_to_s3_checkbox,
+ in_doc_files,
+ ],
+ outputs=None,
+ ).success(
+ fn=update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ page_min,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ fn=check_for_existing_textract_file,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ output_folder_textbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[textract_output_found_checkbox],
+ ).success(
+ fn=check_for_relevant_ocr_output_with_words,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ text_extract_method_radio,
+ output_folder_textbox,
+ ],
+ outputs=[relevant_ocr_output_with_words_found_checkbox],
+ ).success(
+ fn=reveal_feedback_buttons,
+ outputs=[
+ pdf_feedback_radio,
+ pdf_further_details_text,
+ pdf_submit_feedback_btn,
+ pdf_feedback_title,
+ ],
+ ).success(
+ fn=check_duplicate_pages_checkbox,
+ inputs=[redact_duplicate_pages_checkbox],
+ outputs=None,
+ ).failure( # Failure case enables branching into the duplicate analysis steps when the duplicate pages checkbox is enabled
+ fn=lambda: None
+ ).then(
+ fn=reset_aws_call_vars,
+ outputs=[
+ comprehend_query_number,
+ textract_query_number,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ llm_model_name_textbox,
+ vlm_model_name_textbox,
+ ],
+ ).success(
+ fn=run_duplicate_analysis,
+ inputs=[
+ all_page_line_level_ocr_results_df_base,
+ duplicate_threshold_input,
+ min_word_count_input,
+ min_consecutive_pages_input,
+ greedy_match_input,
+ all_page_line_level_ocr_results_df_base,
+ input_review_files,
+ combine_page_text_for_duplicates_bool,
+ doc_file_name_with_extension_textbox,
+ output_folder_textbox,
+ ],
+ outputs=[
+ results_df_preview,
+ duplicate_files_out,
+ full_duplicate_data_by_file,
+ actual_time_taken_number,
+ task_textbox,
+ all_page_line_level_ocr_results_df_base,
+ input_review_files,
+ duplicate_pages_list_state,
+ ],
+ show_progress_on=[results_df_preview, redaction_output_summary_textbox],
+ ).success(
+ fn=export_outputs_to_s3,
+ # duplicate_files_out returns a single file path; export helper will normalise it
+ inputs=[
+ duplicate_files_out,
+ s3_output_folder_state,
+ save_outputs_to_s3_checkbox,
+ in_duplicate_pages,
+ ],
+ outputs=None,
+ ).success(
+ fn=lambda: "deduplicate",
+ outputs=[task_textbox],
+ ).success(
+ fn=lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ inputs=(
+ [
+ session_hash_textbox,
+ doc_file_name_no_extension_textbox,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ if DISPLAY_FILE_NAMES_IN_LOGS
+ else [
+ session_hash_textbox,
+ placeholder_doc_file_name_no_extension_textbox_for_logs,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ ),
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ ).success(
+ fn=create_annotation_objects_from_duplicates,
+ inputs=[
+ results_df_preview,
+ all_page_line_level_ocr_results_df_base,
+ page_sizes,
+ combine_page_text_for_duplicates_bool,
+ ],
+ outputs=[new_duplicate_search_annotation_object],
+ show_progress_on=[
+ new_duplicate_search_annotation_object,
+ redaction_output_summary_textbox,
+ ],
+ ).success(
+ fn=apply_whole_page_redactions_from_list,
+ inputs=[
+ duplicate_pages_list_state,
+ doc_file_name_with_extension_textbox,
+ review_file_df,
+ duplicate_files_out,
+ pdf_doc_state,
+ page_sizes,
+ all_image_annotations_state,
+ combine_page_text_for_duplicates_bool,
+ new_duplicate_search_annotation_object,
+ latest_review_file_path,
+ ],
+ outputs=[review_file_df, all_image_annotations_state],
+ ).success(
+ update_annotator_page_from_review_df,
+ inputs=[
+ review_file_df,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ annotator,
+ selected_entity_dataframe_row,
+ input_folder_textbox,
+ doc_full_file_name_textbox,
+ ],
+ outputs=[
+ annotator,
+ all_image_annotations_state,
+ annotate_current_page,
+ page_sizes,
+ review_file_df,
+ annotate_previous_page,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ )
+
+ # Run redaction function from document redaction tab button
+ document_redact_btn.click(
+ fn=reset_state_vars,
+ outputs=[
+ all_image_annotations_state,
+ all_page_line_level_ocr_results_df_base,
+ all_decision_process_table_state,
+ comprehend_query_number,
+ textract_metadata_textbox,
+ annotator,
+ output_file_list_state,
+ log_files_output_list_state,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ pdf_doc_state,
+ duplication_file_path_outputs_list_state,
+ redaction_output_summary_textbox,
+ is_a_textract_api_call,
+ textract_query_number,
+ all_page_line_level_ocr_results_with_words,
+ input_review_files,
+ latest_file_completed_num,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ ],
+ ).success(
+ fn=enforce_cost_codes,
+ inputs=[
+ enforce_cost_code_bool,
+ cost_code_choice_drop,
+ cost_code_dataframe_base,
+ ],
+ ).success(
+ fn=choose_and_run_redactor,
+ inputs=[
+ in_doc_files,
+ prepared_pdf_state,
+ images_pdf_state,
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ in_redact_llm_entities,
+ text_extract_method_radio,
+ in_allow_list_state,
+ in_deny_list_state,
+ in_fully_redacted_list_state,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ output_file_list_state,
+ log_files_output_list_state,
+ first_loop_state,
+ page_min,
+ page_max,
+ actual_time_taken_number,
+ handwrite_signature_checkbox,
+ textract_metadata_textbox,
+ all_image_annotations_state,
+ all_page_line_level_ocr_results_df_base,
+ all_decision_process_table_state,
+ pdf_doc_state,
+ current_loop_page_number,
+ page_break_return,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ annotate_max_pages,
+ review_file_df,
+ output_folder_textbox,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ duplication_file_path_outputs_list_state,
+ latest_review_file_path,
+ input_folder_textbox,
+ textract_query_number,
+ latest_ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ all_page_line_level_ocr_results_with_words_df_base,
+ local_ocr_method_radio,
+ chosen_language_drop,
+ input_review_files,
+ custom_llm_instructions_textbox,
+ inference_server_vlm_model_textbox,
+ efficient_ocr_checkbox,
+ efficient_ocr_min_words_number,
+ efficient_ocr_min_image_coverage_number,
+ high_quality_textract_ocr_checkbox,
+ overwrite_existing_ocr_checkbox,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ save_page_ocr_visualisations_checkbox,
+ ],
+ outputs=[
+ redaction_output_summary_textbox,
+ output_file,
+ output_file_list_state,
+ latest_file_completed_num,
+ log_files_output,
+ log_files_output_list_state,
+ actual_time_taken_number,
+ textract_metadata_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ current_loop_page_number,
+ page_break_return,
+ all_page_line_level_ocr_results_df_base,
+ all_decision_process_table_state,
+ comprehend_query_number,
+ input_pdf_for_review,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ prepared_pdf_state,
+ images_pdf_state,
+ review_file_df,
+ page_sizes,
+ duplication_file_path_outputs_list_state,
+ in_duplicate_pages,
+ in_summarisation_ocr_files,
+ latest_review_file_path,
+ textract_query_number,
+ latest_ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ all_page_line_level_ocr_results_with_words_df_base,
+ backup_review_state,
+ task_textbox,
+ input_review_files,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ total_pdf_page_count,
+ ],
+ api_name="redact_doc",
+ show_progress_on=[redaction_output_summary_textbox],
+ ).success(
+ fn=lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ inputs=(
+ [
+ session_hash_textbox,
+ doc_file_name_no_extension_textbox,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ if DISPLAY_FILE_NAMES_IN_LOGS
+ else [
+ session_hash_textbox,
+ placeholder_doc_file_name_no_extension_textbox_for_logs,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ ),
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ api_name="usage_logs",
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ ).success(
+ fn=export_outputs_to_s3,
+ inputs=[
+ output_file_list_state,
+ s3_output_folder_state,
+ save_outputs_to_s3_checkbox,
+ in_doc_files,
+ ],
+ outputs=None,
+ ).success(
+ fn=update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ page_min,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ fn=check_for_existing_textract_file,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ output_folder_textbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[textract_output_found_checkbox],
+ ).success(
+ fn=check_for_relevant_ocr_output_with_words,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ text_extract_method_radio,
+ output_folder_textbox,
+ ],
+ outputs=[relevant_ocr_output_with_words_found_checkbox],
+ ).success(
+ fn=reveal_feedback_buttons,
+ outputs=[
+ pdf_feedback_radio,
+ pdf_further_details_text,
+ pdf_submit_feedback_btn,
+ pdf_feedback_title,
+ ],
+ ).success(
+ fn=check_duplicate_pages_checkbox,
+ inputs=[redact_duplicate_pages_checkbox],
+ outputs=None,
+ ).failure( # Failure case enables branching into the duplicate analysis steps when the duplicate pages checkbox is enabled
+ fn=lambda: None
+ ).then(
+ fn=reset_aws_call_vars,
+ outputs=[
+ comprehend_query_number,
+ textract_query_number,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ llm_model_name_textbox,
+ vlm_model_name_textbox,
+ ],
+ ).success(
+ fn=run_duplicate_analysis,
+ inputs=[
+ all_page_line_level_ocr_results_df_base,
+ duplicate_threshold_input,
+ min_word_count_input,
+ min_consecutive_pages_input,
+ greedy_match_input,
+ all_page_line_level_ocr_results_df_base,
+ input_review_files,
+ combine_page_text_for_duplicates_bool,
+ doc_file_name_with_extension_textbox,
+ output_folder_textbox,
+ ],
+ outputs=[
+ results_df_preview,
+ duplicate_files_out,
+ full_duplicate_data_by_file,
+ actual_time_taken_number,
+ task_textbox,
+ all_page_line_level_ocr_results_df_base,
+ input_review_files,
+ duplicate_pages_list_state,
+ ],
+ show_progress_on=[results_df_preview, redaction_output_summary_textbox],
+ ).success(
+ fn=export_outputs_to_s3,
+ # duplicate_files_out returns a single file path; export helper will normalise it
+ inputs=[
+ duplicate_files_out,
+ s3_output_folder_state,
+ save_outputs_to_s3_checkbox,
+ in_duplicate_pages,
+ ],
+ outputs=None,
+ ).success(
+ fn=lambda: "deduplicate",
+ outputs=[task_textbox],
+ ).success(
+ fn=lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ inputs=(
+ [
+ session_hash_textbox,
+ doc_file_name_no_extension_textbox,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ if DISPLAY_FILE_NAMES_IN_LOGS
+ else [
+ session_hash_textbox,
+ placeholder_doc_file_name_no_extension_textbox_for_logs,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ ),
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ ).success(
+ fn=create_annotation_objects_from_duplicates,
+ inputs=[
+ results_df_preview,
+ all_page_line_level_ocr_results_df_base,
+ page_sizes,
+ combine_page_text_for_duplicates_bool,
+ ],
+ outputs=[new_duplicate_search_annotation_object],
+ show_progress_on=[
+ new_duplicate_search_annotation_object,
+ redaction_output_summary_textbox,
+ ],
+ ).success(
+ fn=apply_whole_page_redactions_from_list,
+ inputs=[
+ duplicate_pages_list_state,
+ doc_file_name_with_extension_textbox,
+ review_file_df,
+ duplicate_files_out,
+ pdf_doc_state,
+ page_sizes,
+ all_image_annotations_state,
+ combine_page_text_for_duplicates_bool,
+ new_duplicate_search_annotation_object,
+ latest_review_file_path,
+ ],
+ outputs=[review_file_df, all_image_annotations_state],
+ ).success(
+ update_annotator_page_from_review_df,
+ inputs=[
+ review_file_df,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ annotator,
+ selected_entity_dataframe_row,
+ input_folder_textbox,
+ doc_full_file_name_textbox,
+ ],
+ outputs=[
+ annotator,
+ all_image_annotations_state,
+ annotate_current_page,
+ page_sizes,
+ review_file_df,
+ annotate_previous_page,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ )
+
+ # If the line-level OCR results change (loaded in by the user or produced by a new redaction task), replace the OCR results displayed in the table
+ all_page_line_level_ocr_results_df_base.change(
+ reset_ocr_base_dataframe,
+ inputs=[all_page_line_level_ocr_results_df_base],
+ outputs=[all_page_line_level_ocr_results_df],
+ )
+ all_page_line_level_ocr_results_with_words_df_base.change(
+ reset_ocr_with_words_base_dataframe,
+ inputs=[
+ all_page_line_level_ocr_results_with_words_df_base,
+ page_entity_dropdown_redaction,
+ ],
+ outputs=[
+ all_page_line_level_ocr_results_with_words_df,
+ backup_all_page_line_level_ocr_results_with_words_df_base,
+ ],
+ )
+
+ # Send whole document to Textract for text extraction
+ send_document_to_textract_api_btn.click(
+ analyse_document_with_textract_api,
+ inputs=[
+ prepared_pdf_state,
+ s3_whole_document_textract_input_subfolder,
+ s3_whole_document_textract_output_subfolder,
+ textract_job_detail_df,
+ s3_whole_document_textract_default_bucket,
+ output_folder_textbox,
+ handwrite_signature_checkbox,
+ successful_textract_api_call_number,
+ total_pdf_page_count,
+ ],
+ outputs=[
+ job_output_textbox,
+ job_id_textbox,
+ job_type_dropdown,
+ successful_textract_api_call_number,
+ is_a_textract_api_call,
+ textract_query_number,
+ task_textbox,
+ ],
+ show_progress_on=[job_current_status],
+ ).success(check_for_provided_job_id, inputs=[job_id_textbox]).success(
+ poll_whole_document_textract_analysis_progress_and_download,
+ inputs=[
+ job_id_textbox,
+ job_type_dropdown,
+ s3_whole_document_textract_output_subfolder,
+ doc_file_name_no_extension_textbox,
+ textract_job_detail_df,
+ s3_whole_document_textract_default_bucket,
+ output_folder_textbox,
+ s3_whole_document_textract_logs_subfolder,
+ local_whole_document_textract_logs_subfolder,
+ ],
+ outputs=[
+ textract_job_output_file,
+ job_current_status,
+ textract_job_detail_df,
+ doc_file_name_no_extension_textbox,
+ ],
+ show_progress_on=[job_current_status],
+ ).success(
+ fn=check_for_existing_textract_file,
+ inputs=[doc_file_name_no_extension_textbox, output_folder_textbox],
+ outputs=[textract_output_found_checkbox],
+ show_progress_on=[job_current_status],
+ )
+
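+ # Check the status of an existing whole-document Textract API job and download the results when ready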
+ check_state_of_textract_api_call_btn.click(
+ check_for_provided_job_id,
+ inputs=[job_id_textbox],
+ show_progress_on=[job_current_status],
+ ).success(
+ poll_whole_document_textract_analysis_progress_and_download,
+ inputs=[
+ job_id_textbox,
+ job_type_dropdown,
+ s3_whole_document_textract_output_subfolder,
+ doc_file_name_no_extension_textbox,
+ textract_job_detail_df,
+ s3_whole_document_textract_default_bucket,
+ output_folder_textbox,
+ s3_whole_document_textract_logs_subfolder,
+ local_whole_document_textract_logs_subfolder,
+ ],
+ outputs=[
+ textract_job_output_file,
+ job_current_status,
+ textract_job_detail_df,
+ doc_file_name_no_extension_textbox,
+ ],
+ show_progress_on=[job_current_status],
+ ).success(
+ fn=check_for_existing_textract_file,
+ inputs=[doc_file_name_no_extension_textbox, output_folder_textbox],
+ outputs=[textract_output_found_checkbox],
+ show_progress_on=[job_current_status],
+ )
+
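+ # Selecting a row in the Textract job table fills in the job ID and job type fields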
+ textract_job_detail_df.select(
+ df_select_callback_textract_api,
+ inputs=[textract_output_found_checkbox],
+ outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row],
+ )
+
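+ # Load existing whole-document Textract outputs, prepare the document, and run a text-extraction-only pass before refreshing the annotator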
+ convert_textract_outputs_to_ocr_results.click(
+ replace_existing_pdf_input_for_whole_document_outputs,
+ inputs=[
+ s3_whole_document_textract_input_subfolder,
+ doc_file_name_no_extension_textbox,
+ output_folder_textbox,
+ s3_whole_document_textract_default_bucket,
+ in_doc_files,
+ input_folder_textbox,
+ ],
+ outputs=[
+ in_doc_files,
+ doc_file_name_no_extension_textbox,
+ doc_file_name_with_extension_textbox,
+ doc_full_file_name_textbox,
+ doc_file_name_textbox_list,
+ total_pdf_page_count,
+ ],
+ show_progress_on=[redaction_output_summary_textbox],
+ ).success(
+ fn=prepare_image_or_pdf_with_efficient_ocr,
+ inputs=[
+ in_doc_files,
+ text_extract_method_radio,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ first_loop_state,
+ annotate_max_pages,
+ all_image_annotations_state,
+ prepare_for_review_bool_false,
+ in_fully_redacted_list_state,
+ output_folder_textbox,
+ input_folder_textbox,
+ efficient_ocr_checkbox,
+ prepare_images_bool_false,
+ page_sizes,
+ pdf_doc_state,
+ page_min,
+ page_max,
+ ],
+ outputs=[
+ redaction_output_summary_textbox,
+ prepared_pdf_state,
+ images_pdf_state,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ pdf_doc_state,
+ all_image_annotations_state,
+ review_file_df,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found_checkbox,
+ all_img_details_state,
+ all_page_line_level_ocr_results_df_base,
+ relevant_ocr_output_with_words_found_checkbox,
+ all_page_line_level_ocr_results_with_words_df_base,
+ ],
+ show_progress_on=[redaction_output_summary_textbox],
+ ).success(
+ fn=check_for_existing_textract_file,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ output_folder_textbox,
+ handwrite_signature_checkbox,
+ ],
+ outputs=[textract_output_found_checkbox],
+ ).success(
+ fn=check_for_relevant_ocr_output_with_words,
+ inputs=[
+ doc_file_name_no_extension_textbox,
+ text_extract_method_radio,
+ output_folder_textbox,
+ ],
+ outputs=[relevant_ocr_output_with_words_found_checkbox],
+ ).success(
+ fn=check_textract_outputs_exist, inputs=[textract_output_found_checkbox]
+ ).success(
+ fn=reset_state_vars,
+ outputs=[
+ all_image_annotations_state,
+ all_page_line_level_ocr_results_df_base,
+ all_decision_process_table_state,
+ comprehend_query_number,
+ textract_metadata_textbox,
+ annotator,
+ output_file_list_state,
+ log_files_output_list_state,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ pdf_doc_state,
+ duplication_file_path_outputs_list_state,
+ redaction_output_summary_textbox,
+ is_a_textract_api_call,
+ textract_query_number,
+ all_page_line_level_ocr_results_with_words,
+ input_review_files,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ ],
+ ).success(
+ fn=choose_and_run_redactor,
+ inputs=[
+ in_doc_files,
+ prepared_pdf_state,
+ images_pdf_state,
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ in_redact_llm_entities,
+ textract_only_method_drop,
+ in_allow_list_state,
+ in_deny_list_state,
+ in_fully_redacted_list_state,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ output_file_list_state,
+ log_files_output_list_state,
+ first_loop_state,
+ page_min,
+ page_max,
+ actual_time_taken_number,
+ handwrite_signature_checkbox,
+ textract_metadata_textbox,
+ all_image_annotations_state,
+ all_page_line_level_ocr_results_df_base,
+ all_decision_process_table_state,
+ pdf_doc_state,
+ current_loop_page_number,
+ page_break_return,
+ no_redaction_method_drop,
+ comprehend_query_number,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ annotate_max_pages,
+ review_file_df,
+ output_folder_textbox,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ extract_text_only_tab_redaction_override,
+ duplication_file_path_outputs_list_state,
+ latest_review_file_path,
+ input_folder_textbox,
+ textract_query_number,
+ latest_ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ all_page_line_level_ocr_results_with_words_df_base,
+ local_ocr_method_radio,
+ chosen_language_drop,
+ input_review_files,
+ custom_llm_instructions_textbox,
+ inference_server_vlm_model_textbox,
+ efficient_ocr_checkbox,
+ efficient_ocr_min_words_number,
+ efficient_ocr_min_image_coverage_number,
+ high_quality_textract_ocr_checkbox,
+ overwrite_existing_ocr_checkbox,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ save_page_ocr_visualisations_checkbox,
+ ],
+ outputs=[
+ redaction_output_summary_textbox,
+ output_file,
+ output_file_list_state,
+ latest_file_completed_num,
+ log_files_output,
+ log_files_output_list_state,
+ actual_time_taken_number,
+ textract_metadata_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ current_loop_page_number,
+ page_break_return,
+ all_page_line_level_ocr_results_df_base,
+ all_decision_process_table_state,
+ comprehend_query_number,
+ input_pdf_for_review,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ prepared_pdf_state,
+ images_pdf_state,
+ review_file_df,
+ page_sizes,
+ duplication_file_path_outputs_list_state,
+ in_duplicate_pages,
+ in_summarisation_ocr_files,
+ latest_review_file_path,
+ textract_query_number,
+ latest_ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ all_page_line_level_ocr_results_with_words_df_base,
+ backup_review_state,
+ task_textbox,
+ input_review_files,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ total_pdf_page_count,
+ ],
+ show_progress_on=[redaction_output_summary_textbox],
+ ).success(
+ fn=export_outputs_to_s3,
+ inputs=[
+ output_file_list_state,
+ s3_output_folder_state,
+ save_outputs_to_s3_checkbox,
+ in_doc_files,
+ ],
+ outputs=None,
+ ).success(
+ fn=update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ page_min,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ )
+
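+ # Switch to the review redactions tab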
+ go_to_review_redactions_tab_btn.click(
+ fn=change_tab_to_review_redactions,
+ inputs=None,
+ outputs=tabs,
+ )
+
+ ###
+ # REVIEW PDF REDACTIONS
+ ###
+
+ # Upload previous PDF for modifying redactions
+ input_pdf_for_review.upload(
+ fn=reset_review_vars,
+ inputs=None,
+ outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base],
+ ).success(
+ fn=get_document_file_names,
+ inputs=[input_pdf_for_review],
+ outputs=[
+ doc_file_name_no_extension_textbox,
+ doc_file_name_with_extension_textbox,
+ doc_full_file_name_textbox,
+ doc_file_name_textbox_list,
+ total_pdf_page_count,
+ ],
+ ).success(
+ fn=prepare_image_or_pdf_with_efficient_ocr,
+ inputs=[
+ input_pdf_for_review,
+ text_extract_method_radio,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ second_loop_state,
+ annotate_max_pages,
+ all_image_annotations_state,
+ prepare_for_review_bool,
+ in_fully_redacted_list_state,
+ output_folder_textbox,
+ input_folder_textbox,
+ efficient_ocr_checkbox,
+ prepare_images_bool_false,
+ page_sizes,
+ pdf_doc_state,
+ page_min,
+ page_max,
+ ],
+ outputs=[
+ redaction_output_summary_textbox,
+ prepared_pdf_state,
+ images_pdf_state,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ pdf_doc_state,
+ all_image_annotations_state,
+ review_file_df,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found_checkbox,
+ all_img_details_state,
+ all_page_line_level_ocr_results_df_base,
+ relevant_ocr_output_with_words_found_checkbox,
+ all_page_line_level_ocr_results_with_words_df_base,
+ ],
+ api_name="prepare_doc",
+ show_progress_on=[redaction_output_summary_textbox, input_pdf_for_review],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
+ # Upload previous review CSV files for modifying redactions
+ input_review_files.upload(
+ fn=prepare_image_or_pdf_with_efficient_ocr,
+ inputs=[
+ input_review_files,
+ text_extract_method_radio,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ second_loop_state,
+ annotate_max_pages,
+ all_image_annotations_state,
+ prepare_for_review_bool,
+ in_fully_redacted_list_state,
+ output_folder_textbox,
+ input_folder_textbox,
+ efficient_ocr_checkbox,
+ prepare_images_bool_false,
+ page_sizes,
+ pdf_doc_state,
+ page_min,
+ page_max,
+ ],
+ outputs=[
+ redaction_output_summary_textbox,
+ prepared_pdf_state,
+ images_pdf_state,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ pdf_doc_state,
+ all_image_annotations_state,
+ review_file_df,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found_checkbox,
+ all_img_details_state,
+ all_page_line_level_ocr_results_df_base,
+ relevant_ocr_output_with_words_found_checkbox,
+ all_page_line_level_ocr_results_with_words_df_base,
+ ],
+ show_progress_on=[redaction_output_summary_textbox],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
+ # Manual updates to review df
+ review_file_df.input(
+ update_annotator_page_from_review_df,
+ inputs=[
+ review_file_df,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ annotator,
+ selected_entity_dataframe_row,
+ input_folder_textbox,
+ doc_full_file_name_textbox,
+ ],
+ outputs=[
+ annotator,
+ all_image_annotations_state,
+ annotate_current_page,
+ page_sizes,
+ review_file_df,
+ annotate_previous_page,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ )
+
+ # Page number controls
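+ # Submitting a page number in the top page box saves the current annotations and re-renders the annotator for the chosen page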
+ annotate_current_page.submit(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_previous_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
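+ # Previous-page button (top of annotator): save current annotations, step back a page, and re-render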
+ annotation_last_page_button.click(
+ fn=decrease_page,
+ inputs=[annotate_current_page, all_image_annotations_state],
+ outputs=[annotate_current_page, annotate_current_page_bottom],
+ show_progress_on=[all_image_annotations_state],
+ ).success(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_previous_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
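+ # Next-page button (top of annotator): save current annotations, step forward a page, and re-render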
+ annotation_next_page_button.click(
+ fn=increase_page,
+ inputs=[annotate_current_page, all_image_annotations_state],
+ outputs=[annotate_current_page, annotate_current_page_bottom],
+ show_progress_on=[all_image_annotations_state],
+ ).success(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_previous_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
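+ # Previous-page button (bottom of annotator): same chain as the top button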
+ annotation_last_page_button_bottom.click(
+ fn=decrease_page,
+ inputs=[annotate_current_page, all_image_annotations_state],
+ outputs=[annotate_current_page, annotate_current_page_bottom],
+ show_progress_on=[all_image_annotations_state],
+ ).success(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_previous_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
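+ # Next-page button (bottom of annotator): same chain as the top button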
+ annotation_next_page_button_bottom.click(
+ fn=increase_page,
+ inputs=[annotate_current_page, all_image_annotations_state],
+ outputs=[annotate_current_page, annotate_current_page_bottom],
+ show_progress_on=[all_image_annotations_state],
+ ).success(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_previous_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
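+ # Submitting a page number in the bottom page box syncs the top page box, then follows the same save-and-render chain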
+ annotate_current_page_bottom.submit(
+ update_other_annotator_number_from_current,
+ inputs=[annotate_current_page_bottom],
+ outputs=[annotate_current_page],
+ ).success(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_previous_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
+ # Apply page redactions
+ annotation_button_apply.click(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ scroll_to_output=True,
+ show_progress_on=[input_pdf_for_review],
+ )
+
+ # Save current page manual redactions
+ update_current_page_redactions_btn.click(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
+ ###
+ # Review and exclude suggested redactions
+ ###
+
+ # Review table controls
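+ # The three dropdowns cross-filter the entity table: selecting a value in one refreshes the table and the other dropdowns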
+ recogniser_entity_dropdown.select(
+ update_entities_df_recogniser_entities,
+ inputs=[
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ page_entity_dropdown,
+ text_entity_dropdown,
+ ],
+ outputs=[
+ recogniser_entity_dataframe,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ ],
+ )
+ page_entity_dropdown.select(
+ update_entities_df_page,
+ inputs=[
+ page_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ ],
+ outputs=[
+ recogniser_entity_dataframe,
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ ],
+ )
+ text_entity_dropdown.select(
+ update_entities_df_text,
+ inputs=[
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ ],
+ outputs=[
+ recogniser_entity_dataframe,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ ],
+ )
+
+ # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
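+ # Chain: save the current page's boxes, merge that page into review_file_df, recolour the selected row,
+ # move the annotator to the selected entity's page, sync the bottom page counter, then re-save the review outputs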
+ recogniser_entity_dataframe.select(
+ df_select_callback_dataframe_row,
+ inputs=[recogniser_entity_dataframe],
+ outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text],
+ ).success(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ get_and_merge_current_page_annotations,
+ inputs=[
+ page_sizes,
+ annotate_current_page,
+ all_image_annotations_state,
+ review_file_df,
+ ],
+ outputs=[review_file_df],
+ ).success(
+ update_selected_review_df_row_colour,
+ inputs=[
+ selected_entity_dataframe_row,
+ review_file_df,
+ selected_entity_id,
+ selected_entity_colour,
+ ],
+ outputs=[review_file_df, selected_entity_id, selected_entity_colour],
+ ).success(
+ update_annotator_page_from_review_df,
+ inputs=[
+ review_file_df,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ annotator,
+ selected_entity_dataframe_row,
+ input_folder_textbox,
+ doc_full_file_name_textbox,
+ ],
+ outputs=[
+ annotator,
+ all_image_annotations_state,
+ annotate_current_page,
+ page_sizes,
+ review_file_df,
+ annotate_previous_page,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ increase_bottom_page_count_based_on_top,
+ inputs=[annotate_current_page],
+ outputs=[annotate_current_page_bottom],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
+ reset_dropdowns_btn.click(
+ reset_dropdowns,
+ inputs=[recogniser_entity_dataframe_base],
+ outputs=[
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ )
+
+ ### Exclude current selection from annotator and outputs
+ # Exclude only selected row
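+ # exclude_selected_items_from_redaction also snapshots the review file, annotation state and entity table so the removal can be undone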
+ exclude_selected_row_btn.click(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ get_and_merge_current_page_annotations,
+ inputs=[
+ page_sizes,
+ annotate_current_page,
+ all_image_annotations_state,
+ review_file_df,
+ ],
+ outputs=[review_file_df],
+ ).success(
+ exclude_selected_items_from_redaction,
+ inputs=[
+ review_file_df,
+ selected_entity_dataframe_row,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ ],
+ outputs=[
+ review_file_df,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ backup_review_state,
+ backup_image_annotations_state,
+ backup_recogniser_entity_dataframe_base,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ ).success(
+ update_all_entity_df_dropdowns,
+ inputs=[
+ recogniser_entity_dataframe_base,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ text_entity_dropdown,
+ ],
+ outputs=[
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ ],
+ )
+
+ # Exclude all items with same text as selected row
+ exclude_text_with_same_as_selected_row_btn.click(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ get_and_merge_current_page_annotations,
+ inputs=[
+ page_sizes,
+ annotate_current_page,
+ all_image_annotations_state,
+ review_file_df,
+ ],
+ outputs=[review_file_df],
+ ).success(
+ get_all_rows_with_same_text,
+ inputs=[
+ recogniser_entity_dataframe_base,
+ selected_entity_dataframe_row_text,
+ ],
+ outputs=[recogniser_entity_dataframe_same_text],
+ ).success(
+ exclude_selected_items_from_redaction,
+ inputs=[
+ review_file_df,
+ recogniser_entity_dataframe_same_text,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ ],
+ outputs=[
+ review_file_df,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ backup_review_state,
+ backup_image_annotations_state,
+ backup_recogniser_entity_dataframe_base,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ ).success(
+ update_all_entity_df_dropdowns,
+ inputs=[
+ recogniser_entity_dataframe_base,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ text_entity_dropdown,
+ ],
+ outputs=[
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ ],
+ )
+
+ # Exclude everything visible in table
+ exclude_selected_btn.click(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ get_and_merge_current_page_annotations,
+ inputs=[
+ page_sizes,
+ annotate_current_page,
+ all_image_annotations_state,
+ review_file_df,
+ ],
+ outputs=[review_file_df],
+ ).success(
+ exclude_selected_items_from_redaction,
+ inputs=[
+ review_file_df,
+ recogniser_entity_dataframe,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ ],
+ outputs=[
+ review_file_df,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ backup_review_state,
+ backup_image_annotations_state,
+ backup_recogniser_entity_dataframe_base,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ ).success(
+ update_all_entity_df_dropdowns,
+ inputs=[
+ recogniser_entity_dataframe_base,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ text_entity_dropdown,
+ ],
+ outputs=[
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ ],
+ )
+
+ # Undo last redaction exclusion action
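+ # Restores review_file_df, the annotation state and the base entity table from the backups captured by the last exclusion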
+ undo_last_removal_btn.click(
+ undo_last_removal,
+ inputs=[
+ backup_review_state,
+ backup_image_annotations_state,
+ backup_recogniser_entity_dataframe_base,
+ ],
+ outputs=[
+ review_file_df,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
+ ###
+ # Add new redactions with table selection
+ ###
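+ # The word-level OCR table can be filtered by page or by text / regex search before new redaction boxes are created from it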
+ page_entity_dropdown_redaction.select(
+ update_redact_choice_df_from_page_dropdown,
+ inputs=[
+ page_entity_dropdown_redaction,
+ all_page_line_level_ocr_results_with_words_df_base,
+ ],
+ outputs=[all_page_line_level_ocr_results_with_words_df],
+ )
+
+ multi_word_search_text.submit(
+ fn=run_search_with_regex_option,
+ inputs=[
+ multi_word_search_text,
+ all_page_line_level_ocr_results_with_words_df_base,
+ similarity_search_score_minimum,
+ use_regex_search,
+ ],
+ outputs=[
+ all_page_line_level_ocr_results_with_words_df,
+ duplicate_files_out,
+ full_duplicate_data_by_file,
+ ],
+ )
+
+ multi_word_search_text_btn.click(
+ fn=run_search_with_regex_option,
+ inputs=[
+ multi_word_search_text,
+ all_page_line_level_ocr_results_with_words_df_base,
+ similarity_search_score_minimum,
+ use_regex_search,
+ ],
+ outputs=[
+ all_page_line_level_ocr_results_with_words_df,
+ duplicate_files_out,
+ full_duplicate_data_by_file,
+ ],
+ api_name="word_level_ocr_text_search",
+ )
+
+ # Clicking on a cell in the redact items table will take you to that page
+ all_page_line_level_ocr_results_with_words_df.select(
+ df_select_callback_dataframe_row_ocr_with_words,
+ inputs=[all_page_line_level_ocr_results_with_words_df],
+ outputs=[
+ selected_entity_dataframe_row_redact,
+ selected_entity_dataframe_row_text_redact,
+ ],
+ ).success(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ get_and_merge_current_page_annotations,
+ inputs=[
+ page_sizes,
+ annotate_current_page,
+ all_image_annotations_state,
+ review_file_df,
+ ],
+ outputs=[review_file_df],
+ ).success(
+ update_annotator_page_from_review_df,
+ inputs=[
+ review_file_df,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ annotator,
+ selected_entity_dataframe_row_redact,
+ input_folder_textbox,
+ doc_full_file_name_textbox,
+ ],
+ outputs=[
+ annotator,
+ all_image_annotations_state,
+ annotate_current_page,
+ page_sizes,
+ review_file_df,
+ annotate_previous_page,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ increase_bottom_page_count_based_on_top,
+ inputs=[annotate_current_page],
+ outputs=[annotate_current_page_bottom],
+ )
+
+ # Reset dropdowns
+ reset_dropdowns_btn_new.click(
+ reset_dropdowns,
+ inputs=[all_page_line_level_ocr_results_with_words_df_base],
+ outputs=[
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ page_entity_dropdown_redaction,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ )
+
+ # Redact everything visible in table
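+ # create_annotation_objects_from_filtered_ocr_results_with_words turns the visible OCR rows into new annotation
+ # boxes with the chosen label and colour, and snapshots backups so the additions can be undone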
+ redact_selected_btn.click(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ create_annotation_objects_from_filtered_ocr_results_with_words,
+ inputs=[
+ all_page_line_level_ocr_results_with_words_df,
+ all_page_line_level_ocr_results_with_words_df_base,
+ page_sizes,
+ review_file_df,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ new_redaction_text_label,
+ colour_label,
+ annotate_current_page,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ backup_image_annotations_state,
+ review_file_df,
+ backup_review_state,
+ recogniser_entity_dataframe,
+ backup_recogniser_entity_dataframe_base,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ ).success(
+ update_all_entity_df_dropdowns,
+ inputs=[
+ all_page_line_level_ocr_results_with_words_df_base,
+ recogniser_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ ],
+ outputs=[
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ page_entity_dropdown_redaction,
+ ],
+ )
+
+ # Reset redaction table following filtering
+ reset_ocr_with_words_df_btn.click(
+ reset_ocr_with_words_base_dataframe,
+ inputs=[
+ all_page_line_level_ocr_results_with_words_df_base,
+ page_entity_dropdown_redaction,
+ ],
+ outputs=[
+ all_page_line_level_ocr_results_with_words_df,
+ backup_all_page_line_level_ocr_results_with_words_df_base,
+ ],
+ )
+
+ # Redact current selection
+ redact_selected_row_btn.click(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ create_annotation_objects_from_filtered_ocr_results_with_words,
+ inputs=[
+ selected_entity_dataframe_row_redact,
+ all_page_line_level_ocr_results_with_words_df_base,
+ page_sizes,
+ review_file_df,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ new_redaction_text_label,
+ colour_label,
+ annotate_current_page,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ backup_image_annotations_state,
+ review_file_df,
+ backup_review_state,
+ recogniser_entity_dataframe,
+ backup_recogniser_entity_dataframe_base,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ ).success(
+ update_all_entity_df_dropdowns,
+ inputs=[
+ all_page_line_level_ocr_results_with_words_df_base,
+ recogniser_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ ],
+ outputs=[
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ page_entity_dropdown_redaction,
+ ],
+ )
+
+ # Redact all items with same text as selected row
+ redact_text_with_same_as_selected_row_btn.click(
+ update_all_page_annotation_object_based_on_previous_page,
+ inputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page,
+ all_image_annotations_state,
+ page_sizes,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ annotate_previous_page,
+ annotate_current_page_bottom,
+ ],
+ ).success(
+ get_all_rows_with_same_text_redact,
+ inputs=[
+ all_page_line_level_ocr_results_with_words_df_base,
+ selected_entity_dataframe_row_text_redact,
+ ],
+ outputs=[to_redact_dataframe_same_text],
+ ).success(
+ create_annotation_objects_from_filtered_ocr_results_with_words,
+ inputs=[
+ to_redact_dataframe_same_text,
+ all_page_line_level_ocr_results_with_words_df_base,
+ page_sizes,
+ review_file_df,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ new_redaction_text_label,
+ colour_label,
+ annotate_current_page,
+ ],
+ outputs=[
+ all_image_annotations_state,
+ backup_image_annotations_state,
+ review_file_df,
+ backup_review_state,
+ recogniser_entity_dataframe,
+ backup_recogniser_entity_dataframe_base,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ ).success(
+ update_all_entity_df_dropdowns,
+ inputs=[
+ all_page_line_level_ocr_results_with_words_df_base,
+ recogniser_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ ],
+ outputs=[
+ recogniser_entity_dropdown,
+ text_entity_dropdown,
+ page_entity_dropdown_redaction,
+ ],
+ )
+
+ # Undo last redaction action
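+ # Reuses undo_last_removal: redaction additions store their backups in the same state variables as exclusions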
+ undo_last_redact_btn.click(
+ undo_last_removal,
+ inputs=[
+ backup_review_state,
+ backup_image_annotations_state,
+ backup_recogniser_entity_dataframe_base,
+ ],
+ outputs=[
+ review_file_df,
+ all_image_annotations_state,
+ recogniser_entity_dataframe_base,
+ ],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ apply_redactions_to_review_df_and_files,
+ inputs=[
+ annotator,
+ doc_full_file_name_textbox,
+ pdf_doc_state,
+ all_image_annotations_state,
+ annotate_current_page,
+ review_file_df,
+ output_folder_textbox,
+ do_not_save_pdf_state,
+ page_sizes,
+ input_folder_textbox,
+ ],
+ outputs=[
+ pdf_doc_state,
+ all_image_annotations_state,
+ input_pdf_for_review,
+ log_files_output,
+ review_file_df,
+ ],
+ show_progress_on=[input_pdf_for_review],
+ )
+
+ ###
+ # Review OCR text
+ ###
+ all_page_line_level_ocr_results_df.select(
+ df_select_callback_ocr,
+ inputs=[all_page_line_level_ocr_results_df],
+ outputs=[annotate_current_page, selected_ocr_dataframe_row],
+ ).success(
+ update_annotator_page_from_review_df,
+ inputs=[
+ review_file_df,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ annotator,
+ selected_ocr_dataframe_row,
+ input_folder_textbox,
+ doc_full_file_name_textbox,
+ ],
+ outputs=[
+ annotator,
+ all_image_annotations_state,
+ annotate_current_page,
+ page_sizes,
+ review_file_df,
+ annotate_previous_page,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ increase_bottom_page_count_based_on_top,
+ inputs=[annotate_current_page],
+ outputs=[annotate_current_page_bottom],
+ )
+
+ # Reset the OCR results filter
+ reset_all_ocr_results_btn.click(
+ reset_ocr_base_dataframe,
+ inputs=[all_page_line_level_ocr_results_df_base],
+ outputs=[all_page_line_level_ocr_results_df],
+ )
+
+ # Convert review file to xfdf Adobe format
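+ # Chain: resolve the document file names, re-prepare the PDF (page images are skipped via prepare_images_bool_false),
+ # convert the review dataframe to .xfdf, then optionally export the result to S3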
+ convert_review_file_to_adobe_btn.click(
+ fn=get_document_file_names,
+ inputs=[input_pdf_for_review],
+ outputs=[
+ doc_file_name_no_extension_textbox,
+ doc_file_name_with_extension_textbox,
+ doc_full_file_name_textbox,
+ doc_file_name_textbox_list,
+ total_pdf_page_count,
+ ],
+ ).success(
+ fn=prepare_image_or_pdf_with_efficient_ocr,
+ inputs=[
+ input_pdf_for_review,
+ text_extract_method_radio,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ second_loop_state,
+ annotate_max_pages,
+ all_image_annotations_state,
+ prepare_for_review_bool,
+ in_fully_redacted_list_state,
+ output_folder_textbox,
+ input_folder_textbox,
+ efficient_ocr_checkbox,
+ prepare_images_bool_false,
+ page_sizes,
+ pdf_doc_state,
+ page_min,
+ page_max,
+ ],
+ outputs=[
+ redaction_output_summary_textbox,
+ prepared_pdf_state,
+ images_pdf_state,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ pdf_doc_state,
+ all_image_annotations_state,
+ review_file_df,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found_checkbox,
+ all_img_details_state,
+ all_line_level_ocr_results_df_placeholder,
+ relevant_ocr_output_with_words_found_checkbox,
+ all_page_line_level_ocr_results_with_words_df_base,
+ ],
+ show_progress_on=[adobe_review_files_out],
+ ).success(
+ convert_df_to_xfdf,
+ inputs=[
+ input_pdf_for_review,
+ pdf_doc_state,
+ images_pdf_state,
+ output_folder_textbox,
+ document_cropboxes,
+ page_sizes,
+ ],
+ outputs=[adobe_review_files_out],
+ ).success(
+ fn=export_outputs_to_s3,
+ inputs=[
+ adobe_review_files_out,
+ s3_output_folder_state,
+ save_outputs_to_s3_checkbox,
+ input_pdf_for_review,
+ ],
+ outputs=None,
+ )
+
+ # Convert xfdf Adobe file back to review_file.csv
+ convert_adobe_to_review_file_btn.click(
+ fn=get_document_file_names,
+ inputs=[adobe_review_files_out],
+ outputs=[
+ doc_file_name_no_extension_textbox,
+ doc_file_name_with_extension_textbox,
+ doc_full_file_name_textbox,
+ doc_file_name_textbox_list,
+ total_pdf_page_count,
+ ],
+ ).success(
+ fn=prepare_image_or_pdf_with_efficient_ocr,
+ inputs=[
+ adobe_review_files_out,
+ text_extract_method_radio,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ second_loop_state,
+ annotate_max_pages,
+ all_image_annotations_state,
+ prepare_for_review_bool,
+ in_fully_redacted_list_state,
+ output_folder_textbox,
+ input_folder_textbox,
+ efficient_ocr_checkbox,
+ prepare_images_bool_false,
+ page_sizes,
+ pdf_doc_state,
+ page_min,
+ page_max,
+ ],
+ outputs=[
+ redaction_output_summary_textbox,
+ prepared_pdf_state,
+ images_pdf_state,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ pdf_doc_state,
+ all_image_annotations_state,
+ review_file_df,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found_checkbox,
+ all_img_details_state,
+ all_line_level_ocr_results_df_placeholder,
+ relevant_ocr_output_with_words_found_checkbox,
+ all_page_line_level_ocr_results_with_words_df_base,
+ ],
+ show_progress_on=[adobe_review_files_out],
+ ).success(
+ fn=convert_xfdf_to_dataframe,
+ inputs=[
+ adobe_review_files_out,
+ pdf_doc_state,
+ images_pdf_state,
+ output_folder_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[input_pdf_for_review],
+ scroll_to_output=True,
+ )
+
+ ###
+ # WORD/TABULAR DATA REDACTION
+ ###
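+ # Uploading a tabular file populates the column and Excel-sheet dropdowns and records the file names for logging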
+ in_data_files.upload(
+ fn=put_columns_in_df,
+ inputs=[in_data_files],
+ outputs=[in_colnames, in_excel_sheets],
+ ).success(
+ fn=get_input_file_names,
+ inputs=[in_data_files],
+ outputs=[
+ data_file_name_no_extension_textbox,
+ data_file_name_with_extension_textbox,
+ data_full_file_name_textbox,
+ data_file_name_textbox_list,
+ total_pdf_page_count,
+ ],
+ )
+
+ # Redact tabular data
+
+ ## From the walkthrough tab button – uses the walkthrough_ components so the choices made in steps 1–3 are applied
+ step_4_next_tabular_redact_btn.click(
+ change_tab_to_tabular_or_document_redactions,
+ inputs=walkthrough_is_data_file,
+ outputs=tabs,
+ ).success(
+ fn=reset_data_vars,
+ outputs=[
+ actual_time_taken_number,
+ log_files_output_list_state,
+ comprehend_query_number,
+ ],
+ ).success(
+ fn=anonymise_files_with_open_text,
+ inputs=[
+ in_data_files,
+ in_text,
+ walkthrough_anon_strategy,
+ walkthrough_colnames,
+ walkthrough_in_redact_entities,
+ walkthrough_allow_list_state,
+ text_tabular_files_done,
+ text_output_summary,
+ text_output_file_list_state,
+ log_files_output_list_state,
+ walkthrough_excel_sheets,
+ first_loop_state,
+ output_folder_textbox,
+ walkthrough_deny_list_state,
+ walkthrough_max_fuzzy_spelling_mistakes_num,
+ walkthrough_pii_identification_method_drop_tabular,
+ walkthrough_in_redact_comprehend_entities,
+ comprehend_query_number,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ actual_time_taken_number,
+ walkthrough_do_initial_clean,
+ chosen_language_drop,
+ walkthrough_custom_llm_instructions_textbox,
+ walkthrough_in_redact_llm_entities,
+ ],
+ outputs=[
+ text_output_summary,
+ text_output_file,
+ text_output_file_list_state,
+ text_tabular_files_done,
+ log_files_output,
+ log_files_output_list_state,
+ actual_time_taken_number,
+ comprehend_query_number,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ llm_model_name_textbox,
+ ],
+ api_name="redact_data",
+ show_progress_on=[text_output_summary],
+ ).success(
+ fn=lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
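+ # When DISPLAY_FILE_NAMES_IN_LOGS is False, a blank placeholder is logged in place of the data file name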
+ inputs=(
+ [
+ session_hash_textbox,
+ blank_doc_file_name_no_extension_textbox_for_logs,
+ data_file_name_with_extension_textbox,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop_tabular,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ if DISPLAY_FILE_NAMES_IN_LOGS
+ else [
+ session_hash_textbox,
+ blank_doc_file_name_no_extension_textbox_for_logs,
+ placeholder_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop_tabular,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ ),
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ ).success(
+ fn=export_outputs_to_s3,
+ inputs=[
+ text_output_file_list_state,
+ s3_output_folder_state,
+ save_outputs_to_s3_checkbox,
+ in_data_files,
+ ],
+ outputs=None,
+ )
+
+ ## From tabular data redaction tab button
+ tabular_data_redact_btn.click(
+ reset_data_vars,
+ outputs=[
+ actual_time_taken_number,
+ log_files_output_list_state,
+ comprehend_query_number,
+ ],
+ ).success(
+ fn=anonymise_files_with_open_text,
+ inputs=[
+ in_data_files,
+ in_text,
+ anon_strategy,
+ in_colnames,
+ in_redact_entities,
+ in_allow_list_state,
+ text_tabular_files_done,
+ text_output_summary,
+ text_output_file_list_state,
+ log_files_output_list_state,
+ in_excel_sheets,
+ first_loop_state,
+ output_folder_textbox,
+ in_deny_list_state,
+ max_fuzzy_spelling_mistakes_num,
+ pii_identification_method_drop_tabular,
+ in_redact_comprehend_entities,
+ comprehend_query_number,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ actual_time_taken_number,
+ do_initial_clean,
+ chosen_language_drop,
+ custom_llm_instructions_textbox,
+ in_redact_llm_entities,
+ ],
+ outputs=[
+ text_output_summary,
+ text_output_file,
+ text_output_file_list_state,
+ text_tabular_files_done,
+ log_files_output,
+ log_files_output_list_state,
+ actual_time_taken_number,
+ comprehend_query_number,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ llm_model_name_textbox,
+ ],
+ api_name="redact_data",
+ show_progress_on=[text_output_summary],
+ ).success(
+ fn=lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ inputs=(
+ [
+ session_hash_textbox,
+ blank_doc_file_name_no_extension_textbox_for_logs,
+ data_file_name_with_extension_textbox,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop_tabular,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ if DISPLAY_FILE_NAMES_IN_LOGS
+ else [
+ session_hash_textbox,
+ blank_doc_file_name_no_extension_textbox_for_logs,
+ placeholder_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop_tabular,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ ),
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ ).success(
+ fn=export_outputs_to_s3,
+ inputs=[
+ text_output_file_list_state,
+ s3_output_folder_state,
+ save_outputs_to_s3_checkbox,
+ in_data_files,
+ ],
+ outputs=None,
+ ).success(
+ fn=reveal_feedback_buttons,
+ outputs=[
+ data_feedback_radio,
+ data_further_details_text,
+ data_submit_feedback_btn,
+ data_feedback_title,
+ ],
+ )
+
+ ###
+ # IDENTIFY DUPLICATE PAGES
+ ###
+
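+ # The minimum-consecutive-pages input is only shown when greedy matching is disabled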
+ greedy_match_input.change(
+ fn=lambda greedy: gr.update(visible=not greedy),
+ inputs=[greedy_match_input],
+ outputs=[min_consecutive_pages_input],
+ )
+
+ find_duplicate_pages_btn.click(
+ fn=run_duplicate_analysis,
+ inputs=[
+ in_duplicate_pages,
+ duplicate_threshold_input,
+ min_word_count_input,
+ min_consecutive_pages_input,
+ greedy_match_input,
+ all_page_line_level_ocr_results_df_base,
+ input_review_files,
+ combine_page_text_for_duplicates_bool,
+ doc_file_name_with_extension_textbox,
+ output_folder_textbox,
+ ],
+ outputs=[
+ results_df_preview,
+ duplicate_files_out,
+ full_duplicate_data_by_file,
+ actual_time_taken_number,
+ task_textbox,
+ all_page_line_level_ocr_results_df_base,
+ input_review_files,
+ duplicate_pages_list_state,
+ ],
+ show_progress_on=[results_df_preview, redaction_output_summary_textbox],
+ ).success(
+ fn=export_outputs_to_s3,
+ # duplicate_files_out returns a single file path; export helper will normalise it
+ inputs=[
+ duplicate_files_out,
+ s3_output_folder_state,
+ save_outputs_to_s3_checkbox,
+ in_duplicate_pages,
+ ],
+ outputs=None,
+ ).success(
+ fn=lambda: "deduplicate",
+ outputs=[task_textbox],
+ ).success(
+ fn=lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ inputs=(
+ [
+ session_hash_textbox,
+ doc_file_name_no_extension_textbox,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ if DISPLAY_FILE_NAMES_IN_LOGS
+ else [
+ session_hash_textbox,
+ placeholder_doc_file_name_no_extension_textbox_for_logs,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ ),
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ )
+
+ results_df_preview.select(
+ fn=handle_selection_and_preview,
+ inputs=[results_df_preview, full_duplicate_data_by_file],
+ outputs=[
+ selected_duplicate_data_row_index,
+ page1_text_preview,
+ page2_text_preview,
+ ],
+ )
+
+ # When the user clicks the "Exclude" button
+ exclude_match_btn.click(
+ fn=exclude_match,
+ inputs=[
+ results_df_preview,
+ selected_duplicate_data_row_index,
+ output_folder_textbox,
+ doc_file_name_with_extension_textbox,
+ ],
+ outputs=[
+ results_df_preview,
+ duplicate_files_out,
+ page1_text_preview,
+ page2_text_preview,
+ duplicate_pages_list_state,
+ ],
+ )
+
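+ # Applying the duplicate matches creates whole-page redaction annotations for the matched pages and refreshes the annotator view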
+ apply_match_btn.click(
+ fn=create_annotation_objects_from_duplicates,
+ inputs=[
+ results_df_preview,
+ all_page_line_level_ocr_results_df_base,
+ page_sizes,
+ combine_page_text_for_duplicates_bool,
+ ],
+ outputs=[new_duplicate_search_annotation_object],
+ show_progress_on=[
+ new_duplicate_search_annotation_object,
+ redaction_output_summary_textbox,
+ ],
+ ).success(
+ fn=apply_whole_page_redactions_from_list,
+ inputs=[
+ duplicate_pages_list_state,
+ doc_file_name_with_extension_textbox,
+ review_file_df,
+ duplicate_files_out,
+ pdf_doc_state,
+ page_sizes,
+ all_image_annotations_state,
+ combine_page_text_for_duplicates_bool,
+ new_duplicate_search_annotation_object,
+ latest_review_file_path,
+ ],
+ outputs=[review_file_df, all_image_annotations_state],
+ ).success(
+ update_annotator_page_from_review_df,
+ inputs=[
+ review_file_df,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ annotator,
+ selected_entity_dataframe_row,
+ input_folder_textbox,
+ doc_full_file_name_textbox,
+ ],
+ outputs=[
+ annotator,
+ all_image_annotations_state,
+ annotate_current_page,
+ page_sizes,
+ review_file_df,
+ annotate_previous_page,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ )
+
+ go_to_review_redactions_tab_btn_2.click(
+ fn=change_tab_to_review_redactions,
+ inputs=None,
+ outputs=tabs,
+ )
+
+ ###
+ # TABULAR DUPLICATE DETECTION
+ ###
+
+ # Tabular duplicate detection event handlers
+ in_tabular_duplicate_files.upload(
+ fn=put_columns_in_df,
+ inputs=[in_tabular_duplicate_files],
+ outputs=[tabular_text_columns, in_excel_tabular_sheets],
+ )
+
+ find_tabular_duplicates_btn.click(
+ fn=run_tabular_duplicate_detection,
+ inputs=[
+ in_tabular_duplicate_files,
+ tabular_duplicate_threshold,
+ tabular_min_word_count,
+ tabular_text_columns,
+ output_folder_textbox,
+ do_initial_clean_dup,
+ in_excel_tabular_sheets,
+ remove_duplicate_rows,
+ ],
+ outputs=[
+ tabular_results_df,
+ tabular_cleaned_file,
+ tabular_file_to_clean,
+ actual_time_taken_number,
+ task_textbox,
+ ],
+ api_name="tabular_clean_duplicates",
+ show_progress_on=[tabular_results_df],
+ ).success(
+ fn=lambda: "deduplicate",
+ outputs=[task_textbox],
+ ).success(
+ fn=lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ inputs=(
+ [
+ session_hash_textbox,
+ blank_doc_file_name_no_extension_textbox_for_logs,
+ data_file_name_with_extension_textbox,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop_tabular,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ if DISPLAY_FILE_NAMES_IN_LOGS
+ else [
+ session_hash_textbox,
+ blank_doc_file_name_no_extension_textbox_for_logs,
+ placeholder_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop_tabular,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ ),
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ )
+
+ tabular_results_df.select(
+ fn=handle_tabular_row_selection,
+ inputs=[tabular_results_df],
+ outputs=[
+ tabular_selected_row_index,
+ tabular_text1_preview,
+ tabular_text2_preview,
+ ],
+ )
+
+ clean_duplicates_btn.click(
+ fn=clean_tabular_duplicates,
+ inputs=[
+ tabular_file_to_clean,
+ tabular_results_df,
+ output_folder_textbox,
+ in_excel_tabular_sheets,
+ ],
+ outputs=[tabular_cleaned_file],
+ )
+
+ ###
+ # SUMMARISATION TAB
+ ###
+ # Function for summarising from a PDF or an OCR output file
+
+ def maybe_extract_then_summarise(
+ all_page_line_level_ocr_results_df_base,
+ output_folder,
+ summarisation_inference_method,
+ summarisation_api_key,
+ summarisation_temperature,
+ doc_full_file_name_textbox,
+ summarisation_context,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ summarisation_hf_api_key_hidden,
+ summarisation_azure_endpoint_hidden,
+ summarisation_format,
+ summarisation_additional_instructions,
+ summarisation_max_pages_per_group,
+ in_summarisation_ocr_files,
+ # prepare + redactor state (same order as used in document_redact_btn / prepare chain)
+ text_extract_method_radio,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ first_loop_state,
+ annotate_max_pages,
+ all_image_annotations_state,
+ prepare_for_review_bool_false,
+ in_fully_redacted_list_state,
+ input_folder_textbox,
+ prepare_images_bool_false,
+ page_sizes,
+ pdf_doc_state,
+ page_min,
+ page_max,
+ prepared_pdf_state,
+ images_pdf_state,
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ in_redact_llm_entities,
+ in_allow_list_state,
+ in_deny_list_state,
+ output_file_list_state,
+ log_files_output_list_state,
+ actual_time_taken_number,
+ handwrite_signature_checkbox,
+ textract_metadata_textbox,
+ all_decision_process_table_state,
+ current_loop_page_number,
+ page_break_return,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ review_file_df,
+ document_cropboxes,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ duplication_file_path_outputs_list_state,
+ latest_review_file_path,
+ textract_query_number,
+ latest_ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ local_ocr_method_radio,
+ chosen_language_drop,
+ input_review_files,
+ custom_llm_instructions_textbox,
+ inference_server_vlm_model_textbox,
+ efficient_ocr_checkbox,
+ efficient_ocr_min_words_number,
+ efficient_ocr_min_image_coverage_number,
+ high_quality_textract_ocr_checkbox,
+ overwrite_existing_ocr_checkbox,
+ save_page_ocr_visualisations_checkbox,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ ):
+ """
+ If the summarisation upload contains a PDF, run text extraction (prepare + redactor
+ with text_extraction_only=True) then summarise; otherwise call summarise_document_wrapper
+ with existing behaviour (CSV or state dataframe).
+ Always returns the 7 summarisation outputs followed by 5 redaction outputs (output_file,
+ output_file_list_state, log_files_output, log_files_output_list_state,
+ redaction_output_summary_textbox) so they update the same components as the document
+ redaction tab; when no PDF was processed, the redaction outputs are no-op gr.update() values.
+ """
+ paths = _summarisation_upload_to_paths(in_summarisation_ocr_files)
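+ # No PDF in the upload: summarise the existing OCR dataframe / CSV and leave the redaction output components unchanged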
+ if not _upload_contains_pdf(in_summarisation_ocr_files):
+ out = summarise_document_wrapper(
+ all_page_line_level_ocr_results_df_base,
+ output_folder,
+ summarisation_inference_method,
+ summarisation_api_key,
+ summarisation_temperature,
+ doc_full_file_name_textbox,
+ summarisation_context,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ summarisation_hf_api_key_hidden,
+ summarisation_azure_endpoint_hidden,
+ summarisation_format,
+ summarisation_additional_instructions,
+ summarisation_max_pages_per_group,
+ in_summarisation_ocr_files,
+ )
+ return (
+ *out,
+ gr.update(),
+ gr.update(),
+ gr.update(),
+ gr.update(),
+ gr.update(),
+ )
+
+ pdf_path = next((p for p in paths if is_pdf(p)), None)
+ if not pdf_path:
+ out = summarise_document_wrapper(
+ all_page_line_level_ocr_results_df_base,
+ output_folder,
+ summarisation_inference_method,
+ summarisation_api_key,
+ summarisation_temperature,
+ doc_full_file_name_textbox,
+ summarisation_context,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ summarisation_hf_api_key_hidden,
+ summarisation_azure_endpoint_hidden,
+ summarisation_format,
+ summarisation_additional_instructions,
+ summarisation_max_pages_per_group,
+ in_summarisation_ocr_files,
+ )
+ return (
+ *out,
+ gr.update(),
+ gr.update(),
+ gr.update(),
+ gr.update(),
+ gr.update(),
+ )
+
+ doc_names = get_document_file_names([pdf_path])
+ doc_file_name_no_extension = (
+ doc_names[0]
+ if doc_names[0]
+ else os.path.splitext(os.path.basename(pdf_path))[0]
+ )
+ full_file_name = (
+ doc_names[2] if len(doc_names) > 2 and doc_names[2] else pdf_path
+ )
+ summary_file_name = _file_name_from_pdf_path(full_file_name)
+
+ check_for_existing_textract_file(
+ doc_file_name_no_extension, output_folder, handwrite_signature_checkbox
+ )
+ check_for_relevant_ocr_output_with_words(
+ doc_file_name_no_extension, text_extract_method_radio, output_folder
+ )
+
+ prepare_result = prepare_image_or_pdf(
+ [pdf_path],
+ text_extract_method_radio,
+ (
+ all_page_line_level_ocr_results_df_base
+ if all_page_line_level_ocr_results_df_base is not None
+ else pd.DataFrame()
+ ),
+ (
+ all_page_line_level_ocr_results_with_words_df_base
+ if all_page_line_level_ocr_results_with_words_df_base is not None
+ else pd.DataFrame()
+ ),
+ latest_file_completed_num,
+ redaction_output_summary_textbox or [],
+ first_loop_state,
+ annotate_max_pages or 1,
+ all_image_annotations_state or [],
+ prepare_for_review_bool_false,
+ (
+ in_fully_redacted_list_state
+ if in_fully_redacted_list_state is not None
+ else []
+ ),
+ output_folder,
+ input_folder_textbox,
+ (
+ prepare_images_bool_false
+ if prepare_images_bool_false is not None
+ else True
+ ),
+ page_sizes or [],
+ pdf_doc_state if pdf_doc_state is not None else [],
+ page_min or 0,
+ page_max or 0,
+ )
+
+ prepared_pdf_paths = prepare_result[1]
+ pdf_image_paths = prepare_result[2]
+ review_file_from_prepare = prepare_result[7]
+ document_cropboxes_from_prepare = prepare_result[8]
+ page_sizes_from_prepare = prepare_result[9]
+ textract_found_after_prepare = prepare_result[10]
+ ocr_df_base_from_prepare = prepare_result[12]
+ # prepare_result[13] is not needed here and is intentionally skipped
+ ocr_with_words_df_from_prepare = prepare_result[14]
+ pdf_doc_from_prepare = prepare_result[5]
+
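+ # Run the redactor purely for text extraction (text_extraction_only=True below) so line-level OCR results exist to summarise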
+ redactor_result = choose_and_run_redactor(
+ [pdf_path],
+ prepared_pdf_paths,
+ pdf_image_paths,
+ in_redact_entities or [],
+ in_redact_comprehend_entities or [],
+ in_redact_llm_entities or [],
+ text_extract_method_radio,
+ in_allow_list_state or [],
+ in_deny_list_state or [],
+ (
+ in_fully_redacted_list_state
+ if in_fully_redacted_list_state is not None
+ else []
+ ),
+ latest_file_completed_num or 0,
+ redaction_output_summary_textbox or [],
+ output_file_list_state or [],
+ log_files_output_list_state or [],
+ first_loop_state,
+ page_min or 0,
+ page_max or 0,
+ actual_time_taken_number or 0.0,
+ handwrite_signature_checkbox or [],
+ textract_metadata_textbox or "",
+ all_image_annotations_state or [],
+ ocr_df_base_from_prepare,
+ (
+ all_decision_process_table_state
+ if all_decision_process_table_state is not None
+ else pd.DataFrame()
+ ),
+ pdf_doc_from_prepare,
+ current_loop_page_number or 0,
+ page_break_return or False,
+ pii_identification_method_drop or "Local",
+ comprehend_query_number or 0,
+ (
+ max_fuzzy_spelling_mistakes_num
+ if max_fuzzy_spelling_mistakes_num is not None
+ else 1
+ ),
+ (
+ match_fuzzy_whole_phrase_bool
+ if match_fuzzy_whole_phrase_bool is not None
+ else True
+ ),
+ aws_access_key_textbox or "",
+ aws_secret_key_textbox or "",
+ annotate_max_pages or 1,
+ review_file_from_prepare,
+ output_folder,
+ document_cropboxes_from_prepare,
+ page_sizes_from_prepare,
+ textract_found_after_prepare,
+ True, # text_extraction_only: summarisation route never runs PII detection
+ duplication_file_path_outputs_list_state or [],
+ latest_review_file_path or "",
+ input_folder_textbox or "",
+ textract_query_number or 0,
+ latest_ocr_file_path or "",
+ all_page_line_level_ocr_results or [],
+ all_page_line_level_ocr_results_with_words or [],
+ ocr_with_words_df_from_prepare,
+ local_ocr_method_radio or "",
+ chosen_language_drop or "",
+ input_review_files or [],
+ custom_llm_instructions_textbox or "",
+ inference_server_vlm_model_textbox or "",
+ efficient_ocr_checkbox if efficient_ocr_checkbox is not None else False,
+ efficient_ocr_min_words_number,
+ efficient_ocr_min_image_coverage_number,
+ (
+ high_quality_textract_ocr_checkbox
+ if high_quality_textract_ocr_checkbox is not None
+ else False
+ ),
+ (
+ overwrite_existing_ocr_checkbox
+ if overwrite_existing_ocr_checkbox is not None
+ else False
+ ),
+ llm_model_name_textbox or "",
+ llm_total_input_tokens_number or 0,
+ llm_total_output_tokens_number or 0,
+ vlm_model_name_textbox or "",
+ vlm_total_input_tokens_number or 0,
+ vlm_total_output_tokens_number or 0,
+ save_page_ocr_visualisations=(
+ save_page_ocr_visualisations_checkbox
+ if save_page_ocr_visualisations_checkbox is not None
+ else SAVE_PAGE_OCR_VISUALISATIONS
+ ),
+ )
+
+ ocr_df_for_summary = redactor_result[12]
+ out_file_paths = redactor_result[1]
+ log_files_output_paths = redactor_result[4]
+ redaction_summary = redactor_result[0]
+ if ocr_df_for_summary is None or (
+ isinstance(ocr_df_for_summary, pd.DataFrame) and ocr_df_for_summary.empty
+ ):
+ return (
+ [],
+ "No OCR text extracted from PDF. Cannot summarise.",
+ llm_model_name_textbox or "",
+ llm_total_input_tokens_number or 0,
+ llm_total_output_tokens_number or 0,
+ "",
+ 0.0,
+ out_file_paths,
+ out_file_paths,
+ log_files_output_paths,
+ log_files_output_paths,
+ redaction_summary,
+ )
+
+ summarise_out = summarise_document_wrapper(
+ ocr_df_for_summary,
+ output_folder,
+ summarisation_inference_method,
+ summarisation_api_key,
+ summarisation_temperature,
+ summary_file_name,
+ summarisation_context,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ summarisation_hf_api_key_hidden,
+ summarisation_azure_endpoint_hidden,
+ summarisation_format,
+ summarisation_additional_instructions,
+ summarisation_max_pages_per_group,
+ None,
+ )
+ return (
+ *summarise_out,
+ out_file_paths,
+ out_file_paths,
+ log_files_output_paths,
+ log_files_output_paths,
+ redaction_summary,
+ )
+
+ summarise_btn.click(
+ reset_aws_call_vars,
+ outputs=[
+ comprehend_query_number,
+ textract_query_number,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ llm_model_name_textbox,
+ vlm_model_name_textbox,
+ ],
+ ).success(
+ fn=enforce_cost_codes,
+ inputs=[
+ enforce_cost_code_bool,
+ cost_code_choice_drop,
+ cost_code_dataframe_base,
+ ],
+ ).success(
+ fn=maybe_extract_then_summarise,
+ inputs=[
+ all_page_line_level_ocr_results_df_base,
+ output_folder_textbox,
+ summarisation_inference_method,
+ summarisation_api_key,
+ summarisation_temperature,
+ doc_full_file_name_textbox,
+ summarisation_context,
+ aws_access_key_textbox,
+ aws_secret_key_textbox,
+ summarisation_hf_api_key_hidden,
+ summarisation_azure_endpoint_hidden,
+ summarisation_format,
+ summarisation_additional_instructions,
+ summarisation_max_pages_per_group,
+ in_summarisation_ocr_files,
+ text_extract_method_radio,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ redaction_output_summary_textbox,
+ first_loop_state,
+ annotate_max_pages,
+ all_image_annotations_state,
+ prepare_for_review_bool_false,
+ in_fully_redacted_list_state,
+ input_folder_textbox,
+ prepare_images_bool_false,
+ page_sizes,
+ pdf_doc_state,
+ page_min,
+ page_max,
+ prepared_pdf_state,
+ images_pdf_state,
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ in_redact_llm_entities,
+ in_allow_list_state,
+ in_deny_list_state,
+ output_file_list_state,
+ log_files_output_list_state,
+ actual_time_taken_number,
+ handwrite_signature_checkbox,
+ textract_metadata_textbox,
+ all_decision_process_table_state,
+ current_loop_page_number,
+ page_break_return,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ review_file_df,
+ document_cropboxes,
+ textract_output_found_checkbox,
+ only_extract_text_radio,
+ duplication_file_path_outputs_list_state,
+ latest_review_file_path,
+ textract_query_number,
+ latest_ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ local_ocr_method_radio,
+ chosen_language_drop,
+ input_review_files,
+ custom_llm_instructions_textbox,
+ inference_server_vlm_model_textbox,
+ efficient_ocr_checkbox,
+ efficient_ocr_min_words_number,
+ efficient_ocr_min_image_coverage_number,
+ high_quality_textract_ocr_checkbox,
+ overwrite_existing_ocr_checkbox,
+ save_page_ocr_visualisations_checkbox,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ ],
+ outputs=[
+ summarisation_output_files,
+ summarisation_status,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ summarisation_display,
+ actual_time_taken_number,
+ output_file,
+ output_file_list_state,
+ log_files_output,
+ log_files_output_list_state,
+ redaction_output_summary_textbox,
+ ],
+ show_progress=True,
+ show_progress_on=[summarisation_status],
+ ).success(
+ fn=lambda: "summarisation",
+ outputs=[task_textbox],
+ ).success(
+ fn=lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ inputs=(
+ [
+ session_hash_textbox,
+ doc_file_name_no_extension_textbox,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ if DISPLAY_FILE_NAMES_IN_LOGS
+ else [
+ session_hash_textbox,
+ placeholder_doc_file_name_no_extension_textbox_for_logs,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ]
+ ),
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ api_name="usage_logs_summarisation",
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ )
+
+ ###
+ # SETTINGS PAGE INPUT / OUTPUT
+ ###
+ # If a custom allow/deny/duplicate page list is uploaded
+ in_allow_list.change(
+ fn=custom_regex_load,
+ inputs=[in_allow_list],
+ outputs=[in_allow_list_text, in_allow_list_state],
+ )
+ in_deny_list.change(
+ fn=custom_regex_load,
+ inputs=[in_deny_list, in_deny_list_text_in],
+ outputs=[in_deny_list_text, in_deny_list_state],
+ )
+ in_fully_redacted_list.change(
+ fn=custom_regex_load,
+ inputs=[in_fully_redacted_list, in_fully_redacted_text_in],
+ outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state],
+ )
+
+ # Apply whole-page redactions from an uploaded whole-page redaction csv file or a user-provided list of specific page numbers
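+ # The wrapper below fixes combine_pages=True and passes an empty annotation list and the latest review file path through to apply_whole_page_redactions_from_list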
+ def _apply_whole_page_redactions_fully_redacted(
+ duplicate_page_numbers_df_or_list,
+ doc_file_name_with_extension_textbox,
+ review_file_df,
+ duplicate_files_out,
+ pdf_doc_state,
+ page_sizes,
+ all_image_annotations_state,
+ latest_review_file_path,
+ ):
+ return apply_whole_page_redactions_from_list(
+ duplicate_page_numbers_df_or_list,
+ doc_file_name_with_extension_textbox,
+ review_file_df,
+ duplicate_files_out,
+ pdf_doc_state,
+ page_sizes,
+ all_image_annotations_state,
+ combine_pages=True,
+ new_annotations_with_bounding_boxes=list(),
+ review_file_path=latest_review_file_path or "",
+ )
+
+ apply_fully_redacted_list_btn.click(
+ fn=_apply_whole_page_redactions_fully_redacted,
+ inputs=[
+ in_fully_redacted_list_state,
+ doc_file_name_with_extension_textbox,
+ review_file_df,
+ duplicate_files_out,
+ pdf_doc_state,
+ page_sizes,
+ all_image_annotations_state,
+ latest_review_file_path,
+ ],
+ outputs=[review_file_df, all_image_annotations_state],
+ ).success(
+ update_annotator_page_from_review_df,
+ inputs=[
+ review_file_df,
+ images_pdf_state,
+ page_sizes,
+ all_image_annotations_state,
+ annotator,
+ selected_entity_dataframe_row,
+ input_folder_textbox,
+ doc_full_file_name_textbox,
+ ],
+ outputs=[
+ annotator,
+ all_image_annotations_state,
+ annotate_current_page,
+ page_sizes,
+ review_file_df,
+ annotate_previous_page,
+ ],
+ show_progress_on=[annotator],
+ ).success(
+ update_annotator_object_and_filter_df,
+ inputs=[
+ all_image_annotations_state,
+ annotate_current_page,
+ recogniser_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ text_entity_dropdown,
+ recogniser_entity_dataframe_base,
+ annotator_zoom_number,
+ review_file_df,
+ page_sizes,
+ doc_full_file_name_textbox,
+ input_folder_textbox,
+ ],
+ outputs=[
+ annotator,
+ annotate_current_page,
+ annotate_current_page_bottom,
+ annotate_previous_page,
+ recogniser_entity_dropdown,
+ recogniser_entity_dataframe,
+ recogniser_entity_dataframe_base,
+ text_entity_dropdown,
+ page_entity_dropdown,
+ page_entity_dropdown_redaction,
+ page_sizes,
+ all_image_annotations_state,
+ ],
+ show_progress_on=[annotator],
+ )
+
+ # Merge multiple review csv files together
+ merge_multiple_review_files_btn.click(
+ fn=merge_csv_files,
+ inputs=multiple_review_files_in_out,
+ outputs=multiple_review_files_in_out,
+ api_name="combine_review_csvs",
+ )
+
+ # Combine multiple review PDFs (merge redaction comments into one file)
+ combine_review_pdfs_btn.click(
+ fn=combine_review_pdf_files,
+ inputs=[combine_review_pdfs_in_out, output_folder_textbox],
+ outputs=combine_review_pdfs_in_out,
+ api_name="combine_review_pdfs",
+ )
+
+ # Need to momentarily change the root directory of the file explorer to another non-sensitive folder when the button is clicked to get it to update (workaround)
+ all_output_files_btn.click(
+ fn=lambda: gr.FileExplorer(root_dir=FEEDBACK_LOGS_FOLDER),
+ inputs=None,
+ outputs=all_output_files,
+ ).success(
+ fn=load_all_output_files,
+ inputs=output_folder_textbox,
+ outputs=all_output_files,
+ )
+
+ all_output_files.input(
+ fn=all_outputs_file_download_fn,
+ inputs=all_output_files,
+ outputs=all_outputs_file_download,
+ )
+
+ # Language selection dropdown
+ chosen_language_full_name_drop.select(
+ update_language_dropdown,
+ inputs=[chosen_language_full_name_drop],
+ outputs=[chosen_language_drop],
+ )
+
+ ###
+ # APP LOAD AND LOGGING
+ ###
+
+ # Get connection details on app load
+
+ if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS:
+ blocks.load(
+ get_connection_params,
+ inputs=[
+ output_folder_textbox,
+ input_folder_textbox,
+ session_output_folder_textbox,
+ s3_output_folder_state,
+ s3_whole_document_textract_input_subfolder,
+ s3_whole_document_textract_output_subfolder,
+ s3_whole_document_textract_logs_subfolder,
+ local_whole_document_textract_logs_subfolder,
+ ],
+ outputs=[
+ session_hash_state,
+ output_folder_textbox,
+ session_hash_textbox,
+ input_folder_textbox,
+ s3_whole_document_textract_input_subfolder,
+ s3_whole_document_textract_output_subfolder,
+ s3_whole_document_textract_logs_subfolder,
+ local_whole_document_textract_logs_subfolder,
+ s3_output_folder_state,
+ ],
+ ).success(
+ load_in_textract_job_details,
+ inputs=[
+ load_s3_whole_document_textract_logs_bool,
+ s3_whole_document_textract_logs_subfolder,
+ local_whole_document_textract_logs_subfolder,
+ ],
+ outputs=[textract_job_detail_df],
+ ).success(
+ fn=load_all_output_files,
+ inputs=output_folder_textbox,
+ outputs=all_output_files,
+ )
+
+ else:
+ blocks.load(
+ get_connection_params,
+ inputs=[
+ output_folder_textbox,
+ input_folder_textbox,
+ session_output_folder_textbox,
+ s3_output_folder_state,
+ s3_whole_document_textract_input_subfolder,
+ s3_whole_document_textract_output_subfolder,
+ s3_whole_document_textract_logs_subfolder,
+ local_whole_document_textract_logs_subfolder,
+ ],
+ outputs=[
+ session_hash_state,
+ output_folder_textbox,
+ session_hash_textbox,
+ input_folder_textbox,
+ s3_whole_document_textract_input_subfolder,
+ s3_whole_document_textract_output_subfolder,
+ s3_whole_document_textract_logs_subfolder,
+ local_whole_document_textract_logs_subfolder,
+ s3_output_folder_state,
+ ],
+ ).success(
+ fn=load_all_output_files,
+ inputs=output_folder_textbox,
+ outputs=all_output_files,
+ )
+
+ # If the relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting an S3 path, a local path is needed to provide a download location
+ if GET_DEFAULT_ALLOW_LIST and (ALLOW_LIST_PATH or S3_ALLOW_LIST_PATH):
+ if (
+ not os.path.exists(ALLOW_LIST_PATH)
+ and S3_ALLOW_LIST_PATH
+ and RUN_AWS_FUNCTIONS
+ ):
+ print("Downloading allow list from S3")
+ blocks.load(
+ download_file_from_s3,
+ inputs=[
+ s3_default_bucket,
+ s3_default_allow_list_file,
+ default_allow_list_output_folder_location,
+ ],
+ ).success(
+ load_in_default_allow_list,
+ inputs=[default_allow_list_output_folder_location],
+ outputs=[in_allow_list],
+ )
+ print("Successfully loaded allow list from S3")
+ elif os.path.exists(ALLOW_LIST_PATH):
+ print(
+ "Loading allow list from default allow list output path location:",
+ ALLOW_LIST_PATH,
+ )
+ blocks.load(
+ load_in_default_allow_list,
+ inputs=[default_allow_list_output_folder_location],
+ outputs=[in_allow_list],
+ )
+ else:
+ print("Could not load in default allow list")
+
+ # If relevant environment variable is set, load in the default cost code file from S3 or locally
+ if GET_COST_CODES and (COST_CODES_PATH or S3_COST_CODES_PATH):
+ if (
+ not os.path.exists(COST_CODES_PATH)
+ and S3_COST_CODES_PATH
+ and RUN_AWS_FUNCTIONS
+ ):
+ print("Downloading cost codes from S3")
+ blocks.load(
+ download_file_from_s3,
+ inputs=[
+ s3_default_bucket,
+ s3_default_cost_codes_file,
+ default_cost_codes_output_folder_location,
+ ],
+ ).success(
+ load_in_default_cost_codes,
+ inputs=[
+ default_cost_codes_output_folder_location,
+ default_cost_code_textbox,
+ ],
+ outputs=[
+ cost_code_dataframe,
+ cost_code_dataframe_base,
+ cost_code_choice_drop,
+ ],
+ )
+ print("Successfully loaded cost codes from S3")
+ elif os.path.exists(COST_CODES_PATH):
+ print(
+ "Loading cost codes from default cost codes path location:",
+ COST_CODES_PATH,
+ )
+ blocks.load(
+ load_in_default_cost_codes,
+ inputs=[
+ default_cost_codes_output_folder_location,
+ default_cost_code_textbox,
+ ],
+ outputs=[
+ cost_code_dataframe,
+ cost_code_dataframe_base,
+ cost_code_choice_drop,
+ ],
+ )
+ else:
+ print("Could not load in cost code data")
+
+ # When session hash becomes available, apply saved default cost code (once)
+ if GET_COST_CODES and (COST_CODES_PATH or S3_COST_CODES_PATH):
+ session_hash_textbox.change(
+ apply_session_default_cost_code,
+ inputs=[
+ session_hash_textbox,
+ cost_code_dataframe,
+ input_folder_textbox,
+ default_cost_code_textbox,
+ cost_code_choice_drop,
+ ],
+ outputs=[default_cost_code_textbox, cost_code_choice_drop],
+ )
+
+ ###
+ # LOGGING
+ ###
+
+ ### ACCESS LOGS
+ # Log usernames and access times to a file (to know who is using the app when running on AWS)
+ access_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
+
+ access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
+ session_hash_textbox.change(
+ lambda *args: access_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS,
+ replacement_headers=CSV_ACCESS_LOG_HEADERS,
+ ),
+ [session_hash_textbox, host_name_textbox],
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[access_logs_state, access_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ )
+
+ ### FEEDBACK LOGS
+ pdf_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
+ data_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
+
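+ # The two feedback-logging branches below record the same fields; they differ only in whether real file names or placeholder names are written to the logs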
+ if DISPLAY_FILE_NAMES_IN_LOGS:
+ # User submitted feedback for pdf redactions
+ pdf_callback.setup(
+ [
+ pdf_feedback_radio,
+ pdf_further_details_text,
+ doc_file_name_no_extension_textbox,
+ ],
+ FEEDBACK_LOGS_FOLDER,
+ )
+ pdf_submit_feedback_btn.click(
+ lambda *args: pdf_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS,
+ replacement_headers=CSV_FEEDBACK_LOG_HEADERS,
+ ),
+ [
+ pdf_feedback_radio,
+ pdf_further_details_text,
+ doc_file_name_no_extension_textbox,
+ ],
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[feedback_logs_state, feedback_s3_logs_loc_state],
+ outputs=[pdf_further_details_text],
+ )
+
+ # User submitted feedback for data redactions
+ data_callback.setup(
+ [
+ data_feedback_radio,
+ data_further_details_text,
+ data_file_name_with_extension_textbox,
+ ],
+ FEEDBACK_LOGS_FOLDER,
+ )
+ data_submit_feedback_btn.click(
+ lambda *args: data_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS,
+ replacement_headers=CSV_FEEDBACK_LOG_HEADERS,
+ ),
+ [
+ data_feedback_radio,
+ data_further_details_text,
+ data_file_name_with_extension_textbox,
+ ],
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[feedback_logs_state, feedback_s3_logs_loc_state],
+ outputs=[data_further_details_text],
+ )
+ else:
+ # User submitted feedback for pdf redactions
+ pdf_callback.setup(
+ [
+ pdf_feedback_radio,
+ pdf_further_details_text,
+ doc_file_name_no_extension_textbox,
+ ],
+ FEEDBACK_LOGS_FOLDER,
+ )
+ pdf_submit_feedback_btn.click(
+ lambda *args: pdf_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS,
+ replacement_headers=CSV_FEEDBACK_LOG_HEADERS,
+ ),
+ [
+ pdf_feedback_radio,
+ pdf_further_details_text,
+ placeholder_doc_file_name_no_extension_textbox_for_logs,
+ ],
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[feedback_logs_state, feedback_s3_logs_loc_state],
+ outputs=[pdf_further_details_text],
+ )
+
+ # User submitted feedback for data redactions
+ data_callback.setup(
+ [
+ data_feedback_radio,
+ data_further_details_text,
+ data_file_name_with_extension_textbox,
+ ],
+ FEEDBACK_LOGS_FOLDER,
+ )
+ data_submit_feedback_btn.click(
+ lambda *args: data_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS,
+ replacement_headers=CSV_FEEDBACK_LOG_HEADERS,
+ ),
+ [
+ data_feedback_radio,
+ data_further_details_text,
+ placeholder_data_file_name_no_extension_textbox_for_logs,
+ ],
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[feedback_logs_state, feedback_s3_logs_loc_state],
+ outputs=[data_further_details_text],
+ )
+
+ ### USAGE LOGS for data file analysis and Textract API calls
+
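+ # As with the feedback logs above, the two branches differ only in whether the real document file name or a placeholder is recorded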
+ if DISPLAY_FILE_NAMES_IN_LOGS:
+
+ successful_textract_api_call_number.change(
+ lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ [
+ session_hash_textbox,
+ doc_file_name_no_extension_textbox,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ],
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ )
+
+ else:
+ successful_textract_api_call_number.change(
+ lambda *args: usage_callback.flag(
+ list(args),
+ save_to_csv=SAVE_LOGS_TO_CSV,
+ save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB,
+ dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS,
+ replacement_headers=CSV_USAGE_LOG_HEADERS,
+ ),
+ [
+ session_hash_textbox,
+ placeholder_doc_file_name_no_extension_textbox_for_logs,
+ blank_data_file_name_no_extension_textbox_for_logs,
+ actual_time_taken_number,
+ total_pdf_page_count,
+ textract_query_number,
+ pii_identification_method_drop,
+ comprehend_query_number,
+ cost_code_choice_drop,
+ handwrite_signature_checkbox,
+ host_name_textbox,
+ text_extract_method_radio,
+ is_a_textract_api_call,
+ task_textbox,
+ vlm_model_name_textbox,
+ vlm_total_input_tokens_number,
+ vlm_total_output_tokens_number,
+ llm_model_name_textbox,
+ llm_total_input_tokens_number,
+ llm_total_output_tokens_number,
+ ],
+ outputs=[flag_value_placeholder],
+ preprocess=False,
+ ).success(
+ fn=upload_log_file_to_s3,
+ inputs=[usage_logs_state, usage_s3_logs_loc_state],
+ outputs=[s3_logs_output_textbox],
+ )
+
+ ###
+ # APP RUN SETTINGS
+ ###
+
+ blocks.queue(
+ max_size=int(MAX_QUEUE_SIZE),
+ default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT),
+ )
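+ # MAX_QUEUE_SIZE and DEFAULT_CONCURRENCY_LIMIT are read from configuration as strings, hence the int() casts above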
+
+ if not RUN_DIRECT_MODE:
+ # If running through command line with uvicorn
+ if RUN_FASTAPI:
+ if ALLOWED_ORIGINS:
+ print(f"CORS enabled. Allowing origins: {ALLOWED_ORIGINS}")
+ app.add_middleware(
+ CORSMiddleware,
+ allow_origins=ALLOWED_ORIGINS, # The list of allowed origins
+ allow_credentials=True, # Allow cookies to be included in cross-origin requests
+ allow_methods=["*"], # Allow all methods (GET, POST, etc.)
+ allow_headers=["*"], # Allow all headers
+ )
+
+ if ALLOWED_HOSTS:
+ app.add_middleware(TrustedHostMiddleware, allowed_hosts=ALLOWED_HOSTS)
+
+ @app.get("/health", status_code=status.HTTP_200_OK)
+ def health_check():
+ """Simple health check endpoint."""
+ return {"status": "ok"}
+
+ app = gr.mount_gradio_app(
+ app,
+ blocks,
+ theme=gr.themes.Default(primary_hue="blue"),
+ head=head_html,
+ css=css,
+ show_error=True,
+ auth=authenticate_user if COGNITO_AUTH else None,
+ max_file_size=MAX_FILE_SIZE,
+ path="",
+ favicon_path=Path(FAVICON_PATH),
+ mcp_server=RUN_MCP_SERVER,
+ )
+
+ # Example command to run in uvicorn (in python): uvicorn.run("app:app", host=GRADIO_SERVER_NAME, port=GRADIO_SERVER_PORT)
+ # In command line something like: uvicorn app:app --host=0.0.0.0 --port=7860
+
+ else:
+ if __name__ == "__main__":
+ if COGNITO_AUTH:
+ blocks.launch(
+ theme=gr.themes.Default(primary_hue="blue"),
+ head=head_html,
+ css=css,
+ show_error=True,
+ inbrowser=True,
+ auth=authenticate_user,
+ max_file_size=MAX_FILE_SIZE,
+ server_name=GRADIO_SERVER_NAME,
+ server_port=GRADIO_SERVER_PORT,
+ root_path=ROOT_PATH,
+ favicon_path=Path(FAVICON_PATH),
+ mcp_server=RUN_MCP_SERVER,
+ )
+ else:
+ blocks.launch(
+ theme=gr.themes.Default(primary_hue="blue"),
+ head=head_html,
+ css=css,
+ show_error=True,
+ inbrowser=True,
+ max_file_size=MAX_FILE_SIZE,
+ server_name=GRADIO_SERVER_NAME,
+ server_port=GRADIO_SERVER_PORT,
+ root_path=ROOT_PATH,
+ favicon_path=Path(FAVICON_PATH),
+ mcp_server=RUN_MCP_SERVER,
+ )
+
+ else:
+ if __name__ == "__main__":
+ from cli_redact import main
+
+ # Validate required direct mode configuration
+ if not DIRECT_MODE_INPUT_FILE:
+ print(
+ "Error: DIRECT_MODE_INPUT_FILE environment variable must be set for direct mode."
+ )
+ print(
+ "Please set DIRECT_MODE_INPUT_FILE to the path of your input file."
+ )
+ exit(1)
+
+ # For combine_review_pdfs and summarise, input_file can be a list (comma-separated paths in env)
+ if DIRECT_MODE_TASK in ("combine_review_pdfs", "summarise"):
+ direct_mode_input_file = [
+ p.strip() for p in DIRECT_MODE_INPUT_FILE.split(",") if p.strip()
+ ]
+ else:
+ direct_mode_input_file = DIRECT_MODE_INPUT_FILE
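+ # Illustrative only: DIRECT_MODE_INPUT_FILE="out/a_redactions_for_review.pdf,out/b_redactions_for_review.pdf" would yield a two-item list for the list-based tasks above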
+
+ # Prepare direct mode arguments based on environment variables
+ direct_mode_args = {
+ # Task Selection
+ "task": DIRECT_MODE_TASK,
+ # General Arguments (apply to all file types)
+ "input_file": direct_mode_input_file,
+ "output_dir": DIRECT_MODE_OUTPUT_DIR,
+ "input_dir": INPUT_FOLDER,
+ "language": DIRECT_MODE_LANGUAGE,
+ "allow_list": ALLOW_LIST_PATH,
+ "pii_detector": DIRECT_MODE_PII_DETECTOR,
+ "username": DIRECT_MODE_DEFAULT_USER,
+ "save_to_user_folders": SESSION_OUTPUT_FOLDER,
+ "local_redact_entities": CHOSEN_REDACT_ENTITIES,
+ "aws_redact_entities": CHOSEN_COMPREHEND_ENTITIES,
+ "aws_access_key": AWS_ACCESS_KEY,
+ "aws_secret_key": AWS_SECRET_KEY,
+ "cost_code": DEFAULT_COST_CODE,
+ "aws_region": AWS_REGION,
+ "s3_bucket": DOCUMENT_REDACTION_BUCKET,
+ "do_initial_clean": DO_INITIAL_TABULAR_DATA_CLEAN,
+ "save_logs_to_csv": SAVE_LOGS_TO_CSV,
+ "save_logs_to_dynamodb": SAVE_LOGS_TO_DYNAMODB,
+ "display_file_names_in_logs": DISPLAY_FILE_NAMES_IN_LOGS,
+ "upload_logs_to_s3": RUN_AWS_FUNCTIONS,
+ "s3_logs_prefix": S3_USAGE_LOGS_FOLDER,
+ "feedback_logs_folder": FEEDBACK_LOGS_FOLDER,
+ "access_logs_folder": ACCESS_LOGS_FOLDER,
+ "usage_logs_folder": USAGE_LOGS_FOLDER,
+ "paddle_model_path": PADDLE_MODEL_PATH,
+ "spacy_model_path": SPACY_MODEL_PATH,
+ # PDF/Image Redaction Arguments
+ "ocr_method": DIRECT_MODE_OCR_METHOD,
+ "ocr_first_pass_max_workers": DIRECT_MODE_OCR_FIRST_PASS_MAX_WORKERS,
+ "efficient_ocr": EFFICIENT_OCR,
+ "efficient_ocr_min_words": EFFICIENT_OCR_MIN_WORDS,
+ "efficient_ocr_min_image_coverage_fraction": EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION,
+ "hybrid_textract_bedrock_vlm": HYBRID_TEXTRACT_BEDROCK_VLM,
+ "page_min": DIRECT_MODE_PAGE_MIN,
+ "page_max": DIRECT_MODE_PAGE_MAX,
+ "images_dpi": DIRECT_MODE_IMAGES_DPI,
+ "chosen_local_ocr_model": DIRECT_MODE_CHOSEN_LOCAL_OCR_MODEL,
+ "preprocess_local_ocr_images": DIRECT_MODE_PREPROCESS_LOCAL_OCR_IMAGES,
+ "compress_redacted_pdf": DIRECT_MODE_COMPRESS_REDACTED_PDF,
+ "return_pdf_end_of_redaction": DIRECT_MODE_RETURN_PDF_END_OF_REDACTION,
+ "deny_list_file": DENY_LIST_PATH,
+ "allow_list_file": ALLOW_LIST_PATH,
+ "redact_whole_page_file": WHOLE_PAGE_REDACTION_LIST_PATH,
+ "handwrite_signature_extraction": DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
+ "extract_forms": DIRECT_MODE_EXTRACT_FORMS,
+ "extract_tables": DIRECT_MODE_EXTRACT_TABLES,
+ "extract_layout": DIRECT_MODE_EXTRACT_LAYOUT,
+ "extract_signatures": DIRECT_MODE_EXTRACT_SIGNATURES,
+ "match_fuzzy_whole_phrase_bool": DIRECT_MODE_MATCH_FUZZY_WHOLE_PHRASE_BOOL,
+ # VLM OCR Arguments
+ "vlm_model_choice": CLOUD_VLM_MODEL_CHOICE,
+ "inference_server_vlm_model": DEFAULT_INFERENCE_SERVER_VLM_MODEL,
+ "inference_server_api_url": INFERENCE_SERVER_API_URL,
+ "gemini_api_key": GEMINI_API_KEY,
+ "azure_openai_api_key": AZURE_OPENAI_API_KEY,
+ "azure_openai_endpoint": AZURE_OPENAI_INFERENCE_ENDPOINT,
+ # LLM PII Detection Arguments
+ # Note: The actual model used is determined by pii_identification_method in the downstream code
+ # This is just the default - it will be overridden based on the selected PII method
+ "llm_model_choice": CLOUD_LLM_PII_MODEL_CHOICE,
+ "llm_inference_method": CHOSEN_LLM_PII_INFERENCE_METHOD,
+ "inference_server_pii_model": DEFAULT_INFERENCE_SERVER_PII_MODEL,
+ "llm_temperature": LLM_TEMPERATURE,
+ "llm_max_tokens": LLM_MAX_NEW_TOKENS,
+ "llm_redact_entities": CHOSEN_LLM_ENTITIES,
+ "custom_llm_instructions": "", # Can be set via environment variable if needed
+ # Document Summarisation Arguments (used when task is summarise)
+ "summarisation_inference_method": AWS_LLM_PII_OPTION,
+ "summarisation_temperature": 0.6,
+ "summarisation_max_pages_per_group": 30,
+ "summary_page_group_max_workers": DIRECT_MODE_SUMMARY_PAGE_GROUP_MAX_WORKERS,
+ "summarisation_api_key": "",
+ "summarisation_context": "",
+ "summarisation_format": "detailed",
+ "summarisation_additional_instructions": "",
+ # Word/Tabular Anonymisation Arguments
+ "anon_strategy": DIRECT_MODE_ANON_STRATEGY,
+ "text_columns": DEFAULT_TEXT_COLUMNS,
+ "excel_sheets": DEFAULT_EXCEL_SHEETS,
+ "fuzzy_mistakes": DIRECT_MODE_FUZZY_MISTAKES,
+ # Duplicate Detection Arguments
+ "duplicate_type": DIRECT_MODE_DUPLICATE_TYPE,
+ "similarity_threshold": DIRECT_MODE_SIMILARITY_THRESHOLD,
+ "min_word_count": DIRECT_MODE_MIN_WORD_COUNT,
+ "min_consecutive_pages": DIRECT_MODE_MIN_CONSECUTIVE_PAGES,
+ "greedy_match": DIRECT_MODE_GREEDY_MATCH,
+ "combine_pages": DIRECT_MODE_COMBINE_PAGES,
+ "remove_duplicate_rows": DIRECT_MODE_REMOVE_DUPLICATE_ROWS,
+ # Textract Batch Operations Arguments
+ "textract_action": DIRECT_MODE_TEXTRACT_ACTION,
+ "job_id": DIRECT_MODE_JOB_ID,
+ "textract_bucket": TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
+ "textract_input_prefix": TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
+ "textract_output_prefix": TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
+ "s3_textract_document_logs_subfolder": TEXTRACT_JOBS_S3_LOC,
+ "local_textract_document_logs_subfolder": TEXTRACT_JOBS_LOCAL_LOC,
+ "poll_interval": 30,
+ "max_poll_attempts": 120,
+ # Additional arguments
+ "search_query": DEFAULT_SEARCH_QUERY,
+ }
+
+ print(f"Running in direct mode with task: {DIRECT_MODE_TASK}")
+ print(f"Input file: {DIRECT_MODE_INPUT_FILE}")
+ print(f"Output directory: {DIRECT_MODE_OUTPUT_DIR}")
+
+ if DIRECT_MODE_TASK == "deduplicate":
+ print(f"Duplicate type: {DIRECT_MODE_DUPLICATE_TYPE}")
+ print(f"Similarity threshold: {DEFAULT_DUPLICATE_DETECTION_THRESHOLD}")
+ print(f"Min word count: {DEFAULT_MIN_WORD_COUNT}")
+ if DEFAULT_SEARCH_QUERY:
+ print(f"Search query: {DEFAULT_SEARCH_QUERY}")
+ if DEFAULT_TEXT_COLUMNS:
+ print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
+ print(f"Remove duplicate rows: {REMOVE_DUPLICATE_ROWS}")
+
+ if DIRECT_MODE_TASK == "summarise":
+ print(
+ "Summarisation: supports PDF (extract text via ocr_method then summarise) or OCR CSV(s). "
+ "Use DIRECT_MODE_INPUT_FILE for a single path or comma-separated paths for multiple CSVs. "
+ "Options: summarisation_inference_method, summarisation_format, "
+ "summarisation_context, summarisation_additional_instructions; ocr_method applies when input is PDF."
+ )
+
+ if DIRECT_MODE_TASK == "combine_review_pdfs":
+ print(
+ "Combine review PDFs: merge redaction comments from multiple '_redactions_for_review' PDFs. "
+ "Set DIRECT_MODE_INPUT_FILE to comma-separated paths (at least 2)."
+ )
+
+ # Combine extraction options
+ extraction_options = (
+ list(direct_mode_args["handwrite_signature_extraction"])
+ if direct_mode_args["handwrite_signature_extraction"]
+ else list()
+ )
+ if direct_mode_args["extract_forms"]:
+ extraction_options.append("Extract forms")
+ if direct_mode_args["extract_tables"]:
+ extraction_options.append("Extract tables")
+ if direct_mode_args["extract_layout"]:
+ extraction_options.append("Extract layout")
+ direct_mode_args["handwrite_signature_extraction"] = extraction_options
+
+ # Run the CLI main function with direct mode arguments
+ main(direct_mode_args=direct_mode_args)
diff --git a/cdk/__init__.py b/cdk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/cdk/app.py b/cdk/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..71dc39fd153f3c061b8c1945d5b4f3fc2bf12d62
--- /dev/null
+++ b/cdk/app.py
@@ -0,0 +1,83 @@
+import os
+
+from aws_cdk import App, Environment
+from cdk_config import AWS_ACCOUNT_ID, AWS_REGION, RUN_USEAST_STACK, USE_CLOUDFRONT
+from cdk_functions import create_basic_config_env, load_context_from_file
+from cdk_stack import CdkStack, CdkStackCloudfront # , CdkStackMain
+
+# Assuming these are still relevant for you
+from check_resources import CONTEXT_FILE, check_and_set_context
+
+# Initialize the CDK app
+app = App()
+
+# --- ENHANCED CONTEXT GENERATION AND LOADING ---
+# 1. Always ensure the old context file is removed before generation
+if os.path.exists(CONTEXT_FILE):
+ try:
+ os.remove(CONTEXT_FILE)
+ print(f"Removed stale context file: {CONTEXT_FILE}")
+ except OSError as e:
+ print(f"Warning: Could not remove old context file {CONTEXT_FILE}: {e}")
+ # Proceed anyway, check_and_set_context might handle overwriting
+
+# 2. Always run the pre-check script to generate fresh context
+print("Running pre-check script to generate application context...")
+try:
+ check_and_set_context()
+ if not os.path.exists(CONTEXT_FILE):
+ raise RuntimeError(
+ f"check_and_set_context() finished, but {CONTEXT_FILE} was not created."
+ )
+ print(f"Context generated successfully at {CONTEXT_FILE}.")
+except Exception as e:
+ raise RuntimeError(f"Failed to generate context via check_and_set_context(): {e}")
+
+if os.path.exists(CONTEXT_FILE):
+ load_context_from_file(app, CONTEXT_FILE)
+else:
+ raise RuntimeError(f"Could not find {CONTEXT_FILE}.")
+
+# Create basic config.env file that user can use to run the app later. Input is the folder it is saved into.
+create_basic_config_env("config")
+
+# Define the environment for the regional stack (where ALB resides)
+aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)
+
+# Create the regional stack (ALB, SGs, etc.)
+# regional_stack = CdkStack(app,
+# "RedactionStackSubnets",
+# env=aws_env_regional,
+# cross_region_references=True)
+
+# regional_stack_main = CdkStackMain(app,
+# "RedactionStackMain",
+# env=aws_env_regional,
+# private_subnets=regional_stack.params["private_subnets"],
+# private_route_tables=regional_stack.params["private_route_tables"],
+# public_subnets=regional_stack.params["public_subnets"],
+# public_route_tables=regional_stack.params["public_route_tables"],
+# cross_region_references=True)
+
+regional_stack = CdkStack(
+ app, "RedactionStack", env=aws_env_regional, cross_region_references=True
+)
+
+if USE_CLOUDFRONT == "True" and RUN_USEAST_STACK == "True":
+ # Define the environment for the CloudFront stack (always us-east-1 for CF-level resources like WAFv2 WebACLs for CF)
+ aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")
+
+ # Create the CloudFront stack, passing the outputs from the regional stack
+ cloudfront_stack = CdkStackCloudfront(
+ app,
+ "RedactionStackCloudfront",
+ env=aws_env_us_east_1,
+ alb_arn=regional_stack.params["alb_arn_output"],
+ alb_sec_group_id=regional_stack.params["alb_security_group_id"],
+ alb_dns_name=regional_stack.params["alb_dns_name"],
+ cross_region_references=True,
+ )
+
+
+# Synthesize the CloudFormation template
+app.synth(validate_on_synthesis=True)
diff --git a/cdk/cdk_config.py b/cdk/cdk_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7d53c07f8c4026527c88c989e98c09fba1f6d21
--- /dev/null
+++ b/cdk/cdk_config.py
@@ -0,0 +1,362 @@
+import os
+import tempfile
+
+from dotenv import load_dotenv
+
+# Set or retrieve configuration variables for CDK redaction deployment
+
+
+def convert_string_to_boolean(value: str) -> bool:
+ """Convert string to boolean, handling various formats."""
+ if isinstance(value, bool):
+ return value
+ elif value in ["True", "1", "true", "TRUE"]:
+ return True
+ elif value in ["False", "0", "false", "FALSE"]:
+ return False
+ else:
+ raise ValueError(f"Invalid boolean value: {value}")
+
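+# e.g. convert_string_to_boolean("TRUE") -> True, convert_string_to_boolean("0") -> False; any other value raises ValueError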
+
+def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
+ """
+ Get an environmental variable, and set it to a default value if it doesn't exist
+ """
+ # Get the environment variable if it exists
+ value = os.environ.get(var_name)
+
+ # If it doesn't exist, set the environment variable to the default value
+ if value is None:
+ os.environ[var_name] = default_value
+ value = default_value
+
+ if print_val is True:
+ print(f"The value of {var_name} is {value}")
+
+ return value
+
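+# e.g. get_or_create_env_var("AWS_REGION", "") returns any existing AWS_REGION value, or sets the variable to "" and returns "" if it is unset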
+
+def ensure_folder_exists(output_folder: str):
+ """Checks if the specified folder exists, creates it if not."""
+
+ if not os.path.exists(output_folder):
+ # Create the folder if it doesn't exist
+ os.makedirs(output_folder, exist_ok=True)
+ print(f"Created the {output_folder} folder.")
+ else:
+ print(f"The {output_folder} folder already exists.")
+
+
+def add_folder_to_path(folder_path: str):
+ """
+ Check if a folder exists on the system. If so, resolve its absolute path and add it to the system PATH variable if it is not already there. Only relevant for locally-created executables of this app (PyInstaller creates an _internal folder containing tesseract and poppler, which must be on the system PATH for the app to run).
+ """
+
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
+ print(folder_path, "folder exists.")
+
+ # Resolve relative path to absolute path
+ absolute_path = os.path.abspath(folder_path)
+
+ current_path = os.environ["PATH"]
+ if absolute_path not in current_path.split(os.pathsep):
+ full_path_extension = absolute_path + os.pathsep + current_path
+ os.environ["PATH"] = full_path_extension
+ # print(f"Updated PATH with: ", full_path_extension)
+ else:
+ print(f"Directory {folder_path} already exists in PATH.")
+ else:
+ print(f"Folder not found at {folder_path} - not added to PATH")
+
+
+###
+# LOAD CONFIG FROM ENV FILE
+###
+CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
+
+ensure_folder_exists(CONFIG_FOLDER)
+
+# If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/cdk_config.env'
+CDK_CONFIG_PATH = get_or_create_env_var(
+ "CDK_CONFIG_PATH", "config/cdk_config.env"
+) # e.g. config/cdk_config.env
+
+if CDK_CONFIG_PATH:
+ if os.path.exists(CDK_CONFIG_PATH):
+ print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
+ load_dotenv(CDK_CONFIG_PATH)
+ else:
+ print("CDK config file not found at location:", CDK_CONFIG_PATH)
+
+###
+# AWS OPTIONS
+###
+AWS_REGION = get_or_create_env_var("AWS_REGION", "")
+AWS_ACCOUNT_ID = get_or_create_env_var("AWS_ACCOUNT_ID", "")
+
+###
+# CDK OPTIONS
+###
+CDK_PREFIX = get_or_create_env_var("CDK_PREFIX", "")
+CONTEXT_FILE = get_or_create_env_var(
+ "CONTEXT_FILE", "cdk.context.json"
+) # Define the CDK output context file name
+CDK_FOLDER = get_or_create_env_var(
+ "CDK_FOLDER", ""
+) # FULL_PATH_TO_CDK_FOLDER_HERE (with forward slash)
+RUN_USEAST_STACK = get_or_create_env_var("RUN_USEAST_STACK", "False")
+
+### VPC and connections
+VPC_NAME = get_or_create_env_var("VPC_NAME", "")
+NEW_VPC_DEFAULT_NAME = get_or_create_env_var("NEW_VPC_DEFAULT_NAME", f"{CDK_PREFIX}vpc")
+NEW_VPC_CIDR = get_or_create_env_var("NEW_VPC_CIDR", "") # "10.0.0.0/24"
+
+
+EXISTING_IGW_ID = get_or_create_env_var("EXISTING_IGW_ID", "")
+SINGLE_NAT_GATEWAY_ID = get_or_create_env_var("SINGLE_NAT_GATEWAY_ID", "")
+
+### SUBNETS / ROUTE TABLES / NAT GATEWAY
+PUBLIC_SUBNETS_TO_USE = get_or_create_env_var(
+ "PUBLIC_SUBNETS_TO_USE", ""
+) # e.g. ['PublicSubnet1', 'PublicSubnet2']
+PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
+ "PUBLIC_SUBNET_CIDR_BLOCKS", ""
+) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
+PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
+ "PUBLIC_SUBNET_AVAILABILITY_ZONES", ""
+) # e.g. ["eu-east-1b", "eu-east1b"]
+
+PRIVATE_SUBNETS_TO_USE = get_or_create_env_var(
+ "PRIVATE_SUBNETS_TO_USE", ""
+) # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
+PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
+ "PRIVATE_SUBNET_CIDR_BLOCKS", ""
+) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
+PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
+ "PRIVATE_SUBNET_AVAILABILITY_ZONES", ""
+) # e.g. ["eu-east-1b", "eu-east1b"]
+
+ROUTE_TABLE_BASE_NAME = get_or_create_env_var(
+ "ROUTE_TABLE_BASE_NAME", f"{CDK_PREFIX}PrivateRouteTable"
+)
+NAT_GATEWAY_EIP_NAME = get_or_create_env_var(
+ "NAT_GATEWAY_EIP_NAME", f"{CDK_PREFIX}NatGatewayEip"
+)
+NAT_GATEWAY_NAME = get_or_create_env_var("NAT_GATEWAY_NAME", f"{CDK_PREFIX}NatGateway")
+
+# IAM roles
+AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var(
+ "AWS_MANAGED_TASK_ROLES_LIST",
+ '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs", "AmazonBedrockFullAccess"]',
+)
+POLICY_FILE_LOCATIONS = get_or_create_env_var(
+ "POLICY_FILE_LOCATIONS", ""
+) # e.g. '["config/sts_permissions.json"]'
+POLICY_FILE_ARNS = get_or_create_env_var("POLICY_FILE_ARNS", "")
+
+# GITHUB REPO
+GITHUB_REPO_USERNAME = get_or_create_env_var("GITHUB_REPO_USERNAME", "seanpedrick-case")
+GITHUB_REPO_NAME = get_or_create_env_var("GITHUB_REPO_NAME", "doc_redaction")
+GITHUB_REPO_BRANCH = get_or_create_env_var("GITHUB_REPO_BRANCH", "main")
+
+### CODEBUILD
+CODEBUILD_ROLE_NAME = get_or_create_env_var(
+ "CODEBUILD_ROLE_NAME", f"{CDK_PREFIX}CodeBuildRole"
+)
+CODEBUILD_PROJECT_NAME = get_or_create_env_var(
+ "CODEBUILD_PROJECT_NAME", f"{CDK_PREFIX}CodeBuildProject"
+)
+
+### ECR
+ECR_REPO_NAME = get_or_create_env_var(
+ "ECR_REPO_NAME", "doc-redaction"
+) # Beware - cannot have underscores and must be lower case
+ECR_CDK_REPO_NAME = get_or_create_env_var(
+ "ECR_CDK_REPO_NAME", f"{CDK_PREFIX}{ECR_REPO_NAME}".lower()
+)
+
+### S3
+S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var(
+ "S3_LOG_CONFIG_BUCKET_NAME", f"{CDK_PREFIX}s3-logs".lower()
+) # S3 bucket names need to be lower case
+S3_OUTPUT_BUCKET_NAME = get_or_create_env_var(
+ "S3_OUTPUT_BUCKET_NAME", f"{CDK_PREFIX}s3-output".lower()
+)
+
+### KMS KEYS FOR S3 AND SECRETS MANAGER
+USE_CUSTOM_KMS_KEY = get_or_create_env_var("USE_CUSTOM_KMS_KEY", "1")
+CUSTOM_KMS_KEY_NAME = get_or_create_env_var(
+ "CUSTOM_KMS_KEY_NAME", f"alias/{CDK_PREFIX}kms-key".lower()
+)
+
+### ECS
+FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var(
+ "FARGATE_TASK_DEFINITION_NAME", f"{CDK_PREFIX}FargateTaskDefinition"
+)
+TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var(
+ "TASK_DEFINITION_FILE_LOCATION", CDK_FOLDER + CONFIG_FOLDER + "task_definition.json"
+)
+
+CLUSTER_NAME = get_or_create_env_var("CLUSTER_NAME", f"{CDK_PREFIX}Cluster")
+ECS_SERVICE_NAME = get_or_create_env_var("ECS_SERVICE_NAME", f"{CDK_PREFIX}ECSService")
+ECS_TASK_ROLE_NAME = get_or_create_env_var(
+ "ECS_TASK_ROLE_NAME", f"{CDK_PREFIX}TaskRole"
+)
+ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var(
+ "ECS_TASK_EXECUTION_ROLE_NAME", f"{CDK_PREFIX}ExecutionRole"
+)
+ECS_SECURITY_GROUP_NAME = get_or_create_env_var(
+ "ECS_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupECS"
+)
+ECS_LOG_GROUP_NAME = get_or_create_env_var(
+ "ECS_LOG_GROUP_NAME", f"/ecs/{ECS_SERVICE_NAME}-logs".lower()
+)
+
+ECS_TASK_CPU_SIZE = get_or_create_env_var("ECS_TASK_CPU_SIZE", "1024")
+ECS_TASK_MEMORY_SIZE = get_or_create_env_var("ECS_TASK_MEMORY_SIZE", "4096")
+ECS_USE_FARGATE_SPOT = get_or_create_env_var("USE_FARGATE_SPOT", "False")
+ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var("ECS_READ_ONLY_FILE_SYSTEM", "True")
+
+### Cognito
+COGNITO_USER_POOL_NAME = get_or_create_env_var(
+ "COGNITO_USER_POOL_NAME", f"{CDK_PREFIX}UserPool"
+)
+COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var(
+ "COGNITO_USER_POOL_CLIENT_NAME", f"{CDK_PREFIX}UserPoolClient"
+)
+COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var(
+ "COGNITO_USER_POOL_CLIENT_SECRET_NAME", f"{CDK_PREFIX}ParamCognitoSecret"
+)
+COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
+ "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
+) # Should change this to something unique or you'll probably hit an error
+
+COGNITO_REFRESH_TOKEN_VALIDITY = int(
+ get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
+) # Minutes
+COGNITO_ID_TOKEN_VALIDITY = int(
+ get_or_create_env_var("COGNITO_ID_TOKEN_VALIDITY", "60")
+) # Minutes
+COGNITO_ACCESS_TOKEN_VALIDITY = int(
+ get_or_create_env_var("COGNITO_ACCESS_TOKEN_VALIDITY", "60")
+) # Minutes
+
+# Application load balancer
+ALB_NAME = get_or_create_env_var(
+ "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
+) # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
+ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var(
+ "ALB_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupALB"
+)
+ALB_TARGET_GROUP_NAME = get_or_create_env_var(
+ "ALB_TARGET_GROUP_NAME", f"{CDK_PREFIX}-tg"[-32:]
+) # Max 32 characters
+EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var("EXISTING_LOAD_BALANCER_ARN", "")
+EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var(
+ "EXISTING_LOAD_BALANCER_ARN", "placeholder_load_balancer_dns.net"
+)
+
+## CLOUDFRONT
+USE_CLOUDFRONT = get_or_create_env_var("USE_CLOUDFRONT", "True")
+CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var(
+ "CLOUDFRONT_PREFIX_LIST_ID", "pl-93a247fa"
+)
+CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var(
+ "CLOUDFRONT_GEO_RESTRICTION", ""
+) # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
+CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var(
+ "CLOUDFRONT_DISTRIBUTION_NAME", f"{CDK_PREFIX}CfDist"
+)
+CLOUDFRONT_DOMAIN = get_or_create_env_var(
+ "CLOUDFRONT_DOMAIN", "cloudfront_placeholder.net"
+)
+
+
+# Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
+ACM_SSL_CERTIFICATE_ARN = get_or_create_env_var("ACM_SSL_CERTIFICATE_ARN", "")
+SSL_CERTIFICATE_DOMAIN = get_or_create_env_var(
+ "SSL_CERTIFICATE_DOMAIN", ""
+) # e.g. example.com or www.example.com
+
+# This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
+if USE_CLOUDFRONT == "True":
+ COGNITO_REDIRECTION_URL = get_or_create_env_var(
+ "COGNITO_REDIRECTION_URL", "https://" + CLOUDFRONT_DOMAIN
+ )
+elif SSL_CERTIFICATE_DOMAIN:
+ COGNITO_REDIRECTION_URL = get_or_create_env_var(
+ "COGNITO_REDIRECTION_URL", "https://" + SSL_CERTIFICATE_DOMAIN
+ )
+else:
+ COGNITO_REDIRECTION_URL = get_or_create_env_var(
+ "COGNITO_REDIRECTION_URL", "https://" + EXISTING_LOAD_BALANCER_DNS
+ )
+
+# Custom headers e.g. if routing traffic through Cloudfront
+CUSTOM_HEADER = get_or_create_env_var(
+ "CUSTOM_HEADER", ""
+) # Retrieving or setting CUSTOM_HEADER
+CUSTOM_HEADER_VALUE = get_or_create_env_var(
+ "CUSTOM_HEADER_VALUE", ""
+) # Retrieving or setting CUSTOM_HEADER_VALUE
+
+# Firewall on top of load balancer
+LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var(
+ "LOAD_BALANCER_WEB_ACL_NAME", f"{CDK_PREFIX}alb-web-acl"
+)
+
+# Firewall on top of CloudFront
+WEB_ACL_NAME = get_or_create_env_var("WEB_ACL_NAME", f"{CDK_PREFIX}cloudfront-web-acl")
+
+###
+# File I/O options
+###
+
+OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/") # 'output/'
+INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/") # 'input/'
+
+# Allow for files to be saved in a temporary folder for increased security in some instances
+if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
+ # Create a temporary directory that persists for the process lifetime
+ # (tempfile.TemporaryDirectory would delete the folder as soon as the with-block exits)
+ temp_dir = tempfile.mkdtemp()
+ print(f"Temporary directory created at: {temp_dir}")
+
+ if OUTPUT_FOLDER == "TEMP":
+ OUTPUT_FOLDER = temp_dir + "/"
+ if INPUT_FOLDER == "TEMP":
+ INPUT_FOLDER = temp_dir + "/"
+
+###
+# LOGGING OPTIONS
+###
+
+SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
+
+### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
+SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "True")
+ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-access-logs".lower()
+)
+FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-feedback-logs".lower()
+)
+USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
+ "USAGE_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-usage-logs".lower()
+)
+
+###
+# REDACTION OPTIONS
+###
+
+# Get some environment variables and Launch the Gradio app
+COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
+
+GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
+
+###
+# WHOLE DOCUMENT API OPTIONS
+###
+
+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var(
+ "DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7"
+) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
diff --git a/cdk/cdk_functions.py b/cdk/cdk_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3d51e84ec35a339536546d3c2531a7f442cc59d
--- /dev/null
+++ b/cdk/cdk_functions.py
@@ -0,0 +1,1482 @@
+import ipaddress
+import json
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import boto3
+import pandas as pd
+from aws_cdk import App, CfnOutput, CfnTag, Tags
+from aws_cdk import aws_cognito as cognito
+from aws_cdk import aws_ec2 as ec2
+from aws_cdk import aws_elasticloadbalancingv2 as elb
+from aws_cdk import aws_elasticloadbalancingv2_actions as elb_act
+from aws_cdk import aws_iam as iam
+from aws_cdk import aws_wafv2 as wafv2
+from botocore.exceptions import ClientError
+from cdk_config import (
+ ACCESS_LOG_DYNAMODB_TABLE_NAME,
+ AWS_REGION,
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ NAT_GATEWAY_EIP_NAME,
+ POLICY_FILE_LOCATIONS,
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
+ PRIVATE_SUBNET_CIDR_BLOCKS,
+ PRIVATE_SUBNETS_TO_USE,
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
+ PUBLIC_SUBNET_CIDR_BLOCKS,
+ PUBLIC_SUBNETS_TO_USE,
+ S3_LOG_CONFIG_BUCKET_NAME,
+ S3_OUTPUT_BUCKET_NAME,
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
+)
+from constructs import Construct
+from dotenv import set_key
+
+
+# --- Function to load context from file ---
+def load_context_from_file(app: App, file_path: str):
+ if os.path.exists(file_path):
+ with open(file_path, "r") as f:
+ context_data = json.load(f)
+ for key, value in context_data.items():
+ app.node.set_context(key, value)
+ print(f"Loaded context from {file_path}")
+ else:
+ print(f"Context file not found: {file_path}")
+
+
+# --- Helper to parse environment variables into lists ---
+def _get_env_list(env_var_value: str) -> List[str]:
+ """Parses a bracketed, comma-separated environment variable value (e.g. '["a", "b"]') into a list of strings."""
+ value = env_var_value[1:-1].strip().replace('"', "").replace("'", "")
+ if not value:
+ return []
+ # Split by comma and filter out any empty strings that might result from extra commas
+ return [s.strip() for s in value.split(",") if s.strip()]
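+# e.g. _get_env_list('["10.0.1.0/24", "10.0.2.0/24"]') -> ["10.0.1.0/24", "10.0.2.0/24"]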
+
+
+# 1. Try to load CIDR/AZs from environment variables
+if PUBLIC_SUBNETS_TO_USE:
+ PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
+if PRIVATE_SUBNETS_TO_USE:
+ PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
+
+if PUBLIC_SUBNET_CIDR_BLOCKS:
+ PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
+if PUBLIC_SUBNET_AVAILABILITY_ZONES:
+ PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
+if PRIVATE_SUBNET_CIDR_BLOCKS:
+ PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
+if PRIVATE_SUBNET_AVAILABILITY_ZONES:
+ PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
+
+if POLICY_FILE_LOCATIONS:
+ POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
+
+
+def check_for_existing_role(role_name: str):
+ try:
+ iam_client = boto3.client("iam")
+
+ response = iam_client.get_role(RoleName=role_name)
+ role = response["Role"]["Arn"]
+
+ print("Response Role:", role)
+
+ return True, role, ""
+ except iam_client.exceptions.NoSuchEntityException:
+ return False, "", ""
+ except Exception as e:
+ raise Exception(f"Getting information on IAM role failed due to: {e}")
+
+
+
+# Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
+# For example:
+# POLICY_FILE_LOCATIONS = ["./policies/my_read_policy.json", "./policies/my_write_policy.json"]
+
+
+def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
+ """
+ Adds individual policy statements from a parsed policy document to a CDK Role.
+
+ Args:
+ role: The CDK Role construct to attach policies to.
+ policy_document: A Python dictionary representing an IAM policy document.
+ """
+ # Ensure the loaded JSON is a valid policy document structure
+ if "Statement" not in policy_document or not isinstance(
+ policy_document["Statement"], list
+ ):
+ print("Warning: Policy document does not contain a 'Statement' list. Skipping.")
+ return # Do not return role, just log and exit
+
+ for statement_dict in policy_document["Statement"]:
+ try:
+ # Create a CDK PolicyStatement from the dictionary
+ cdk_policy_statement = iam.PolicyStatement.from_json(statement_dict)
+
+ # Add the policy statement to the role
+ role.add_to_policy(cdk_policy_statement)
+ print(f" - Added statement: {statement_dict.get('Sid', 'No Sid')}")
+ except Exception as e:
+ print(
+ f"Warning: Could not process policy statement: {statement_dict}. Error: {e}"
+ )
+
+
+def add_custom_policies(
+ scope: Construct, # Not strictly used here, but good practice if you expand to ManagedPolicies
+ role: iam.IRole,
+ policy_file_locations: Optional[List[str]] = None,
+ custom_policy_text: Optional[str] = None,
+) -> iam.IRole:
+ """
+ Loads custom policies from JSON files or a string and attaches them to a CDK Role.
+
+ Args:
+ scope: The scope in which to define constructs (if needed, e.g., for iam.ManagedPolicy).
+ role: The CDK Role construct to attach policies to.
+ policy_file_locations: List of file paths to JSON policy documents.
+ custom_policy_text: A JSON string representing a policy document.
+
+ Returns:
+ The modified CDK Role construct.
+ """
+ if policy_file_locations is None:
+ policy_file_locations = []
+
+ current_source = "unknown source" # For error messages
+
+ try:
+ if policy_file_locations:
+ print(f"Attempting to add policies from files to role {role.node.id}...")
+ for path in policy_file_locations:
+ current_source = f"file: {path}"
+ try:
+ with open(path, "r") as f:
+ policy_document = json.load(f)
+ print(f"Processing policy from {current_source}...")
+ add_statement_to_policy(role, policy_document)
+ except FileNotFoundError:
+ print(f"Warning: Policy file not found at {path}. Skipping.")
+ except json.JSONDecodeError as e:
+ print(
+ f"Warning: Invalid JSON in policy file {path}: {e}. Skipping."
+ )
+ except Exception as e:
+ print(
+ f"An unexpected error occurred processing policy from {path}: {e}. Skipping."
+ )
+
+ if custom_policy_text:
+ current_source = "custom policy text string"
+ print(
+ f"Attempting to add policy from custom text to role {role.node.id}..."
+ )
+ try:
+ # *** FIX: Parse the JSON string into a Python dictionary ***
+ policy_document = json.loads(custom_policy_text)
+ print(f"Processing policy from {current_source}...")
+ add_statement_to_policy(role, policy_document)
+ except json.JSONDecodeError as e:
+ print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
+ except Exception as e:
+ print(
+ f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping."
+ )
+
+ # You might want a final success message, but individual processing messages are also good.
+ print(f"Finished processing custom policies for role {role.node.id}.")
+
+ except Exception as e:
+ print(
+ f"An unhandled error occurred during policy addition for {current_source}: {e}"
+ )
+
+ return role
+
+
+# Import the S3 Bucket class if you intend to return a CDK object later
+# from aws_cdk import aws_s3 as s3
+
+
+def check_s3_bucket_exists(
+ bucket_name: str,
+): # Return type hint depends on what you return
+ """
+ Checks if an S3 bucket with the given name exists and is accessible.
+
+ Args:
+ bucket_name: The name of the S3 bucket to check.
+
+ Returns:
+ A tuple: (bool indicating existence, optional S3 Bucket object or None)
+ Note: Returning a Boto3 S3 Bucket object from here is NOT ideal
+ for direct use in CDK. You'll likely only need the boolean result
+ or the bucket name for CDK lookups/creations.
+ For this example, let's return the boolean and the name.
+ """
+ s3_client = boto3.client("s3")
+ try:
+ # Use head_bucket to check for existence and access
+ s3_client.head_bucket(Bucket=bucket_name)
+ print(f"Bucket '{bucket_name}' exists and is accessible.")
+ return True, bucket_name # Return True and the bucket name
+
+ except ClientError as e:
+ # If a ClientError occurs, check the error code.
+ # '404' means the bucket does not exist.
+ # '403' means the bucket exists but you don't have permission.
+ error_code = e.response["Error"]["Code"]
+ if error_code == "404":
+ print(f"Bucket '{bucket_name}' does not exist.")
+ return False, None
+ elif error_code == "403":
+            # A 403 can mean the bucket exists but is not accessible with the
+            # current credentials. In practice, head_bucket has also returned 403
+            # for buckets that do not exist, so existence is reported as False
+            # here to be safe, along with the bucket name for reference.
+            print(
+                f"Bucket '{bucket_name}' returned 403. It may exist but be inaccessible due to permissions, or it may not exist. Returning False for existence to be safe."
+            )
+            return False, bucket_name  # Not confirmed to exist
+ else:
+ # For other errors, it's better to raise the exception
+ # to indicate something unexpected happened.
+ print(
+ f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {e}"
+ )
+ # Decide how to handle other errors - raising might be safer
+ raise # Re-raise the original exception
+ except Exception as e:
+ print(
+ f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {e}"
+ )
+ # Decide how to handle other errors
+ raise # Re-raise the original exception
+
+
+# Example usage in your check_resources.py:
+# exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
+# context_data[f"exists:{log_bucket_name}"] = exists
+# # You don't necessarily need to store the name in context if using from_bucket_name
+
+
+# Delete an S3 bucket
+def delete_s3_bucket(bucket_name: str):
+ s3 = boto3.client("s3")
+
+ try:
+ # List and delete all objects
+ response = s3.list_object_versions(Bucket=bucket_name)
+ versions = response.get("Versions", []) + response.get("DeleteMarkers", [])
+ for version in versions:
+ s3.delete_object(
+ Bucket=bucket_name, Key=version["Key"], VersionId=version["VersionId"]
+ )
+
+ # Delete the bucket
+ s3.delete_bucket(Bucket=bucket_name)
+ return {"Status": "SUCCESS"}
+ except Exception as e:
+ return {"Status": "FAILED", "Reason": str(e)}
+
+
+# Function to get a subnet ID from its 'Name' tag
+def get_subnet_id(vpc, ec2_client, subnet_name: str) -> Optional[str]:
+ response = ec2_client.describe_subnets(
+ Filters=[{"Name": "vpc-id", "Values": [vpc.vpc_id]}]
+ )
+
+ for subnet in response["Subnets"]:
+        if subnet.get("Tags") and any(
+ tag["Key"] == "Name" and tag["Value"] == subnet_name
+ for tag in subnet["Tags"]
+ ):
+ return subnet["SubnetId"]
+
+ return None
+
+
+def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
+ """
+ Checks if an ECR repository with the given name exists.
+
+ Args:
+ repo_name: The name of the ECR repository to check.
+
+    Returns:
+        A tuple:
+        - True if the repository exists, False otherwise.
+        - The repository description (dictionary) if found, an empty dict otherwise.
+ """
+ ecr_client = boto3.client("ecr")
+ try:
+ print("ecr repo_name to check:", repo_name)
+ response = ecr_client.describe_repositories(repositoryNames=[repo_name])
+ # If describe_repositories succeeds and returns a list of repositories,
+ # and the list is not empty, the repository exists.
+ return len(response["repositories"]) > 0, response["repositories"][0]
+ except ClientError as e:
+ # Check for the specific error code indicating the repository doesn't exist
+ if e.response["Error"]["Code"] == "RepositoryNotFoundException":
+ return False, {}
+ else:
+ # Re-raise other exceptions to handle unexpected errors
+ raise
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}")
+ return False, {}
+
+
+def check_codebuild_project_exists(project_name: str) -> Tuple[bool, Optional[str]]:
+ """
+ Checks if a CodeBuild project with the given name exists.
+
+ Args:
+ project_name: The name of the CodeBuild project to check.
+
+ Returns:
+ A tuple:
+ - The first element is True if the project exists, False otherwise.
+        - The second element is the project ARN (string) if found,
+          None otherwise.
+ """
+ codebuild_client = boto3.client("codebuild")
+ try:
+ # Use batch_get_projects with a list containing the single project name
+ response = codebuild_client.batch_get_projects(names=[project_name])
+
+ # The response for batch_get_projects includes 'projects' (found)
+ # and 'projectsNotFound' (not found).
+ if response["projects"]:
+ # If the project is found in the 'projects' list
+ print(f"CodeBuild project '{project_name}' found.")
+ return (
+ True,
+ response["projects"][0]["arn"],
+            )  # Return True and the project ARN
+ elif (
+ response["projectsNotFound"]
+ and project_name in response["projectsNotFound"]
+ ):
+ # If the project name is explicitly in the 'projectsNotFound' list
+ print(f"CodeBuild project '{project_name}' not found.")
+ return False, None
+ else:
+ # This case is less expected for a single name lookup,
+ # but could happen if there's an internal issue or the response
+ # structure is slightly different than expected for an error.
+ # It's safer to assume it wasn't found if not in 'projects'.
+ print(
+ f"CodeBuild project '{project_name}' not found (not in 'projects' list)."
+ )
+ return False, None
+
+ except ClientError as e:
+ # Catch specific ClientErrors. batch_get_projects might not throw
+ # 'InvalidInputException' for a non-existent project name if the
+ # name format is valid. It typically just lists it in projectsNotFound.
+ # However, other ClientErrors are possible (e.g., permissions).
+ print(
+ f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {e}"
+ )
+ # Decide how to handle other ClientErrors - raising might be safer
+ raise # Re-raise the original exception
+ except Exception as e:
+ print(
+ f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {e}"
+ )
+ # Decide how to handle other errors
+ raise # Re-raise the original exception
+
+
+def get_vpc_id_by_name(vpc_name: str) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
+    """
+    Finds a VPC ID by its 'Name' tag and lists any NAT Gateways in that VPC.
+
+    Returns:
+        A tuple of (VPC ID, list of NAT Gateway descriptions) if the VPC is found,
+        None otherwise.
+    """
+ ec2_client = boto3.client("ec2")
+ try:
+ response = ec2_client.describe_vpcs(
+ Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]
+ )
+ if response and response["Vpcs"]:
+ vpc_id = response["Vpcs"][0]["VpcId"]
+ print(f"VPC '{vpc_name}' found with ID: {vpc_id}")
+
+            # Look for NAT Gateways in this VPC (reuse the EC2 client created above)
+ nat_gateways = []
+ try:
+ response = ec2_client.describe_nat_gateways(
+ Filters=[
+ {"Name": "vpc-id", "Values": [vpc_id]},
+ # Optional: Add a tag filter if you consistently tag your NATs
+ # {'Name': 'tag:Name', 'Values': [f"{prefix}-nat-gateway"]}
+ ]
+ )
+ nat_gateways = response.get("NatGateways", [])
+ except Exception as e:
+ print(
+ f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}"
+ )
+ # Decide how to handle this error - proceed or raise?
+
+ # Decide how to identify the specific NAT Gateway you want to check for.
+
+ return vpc_id, nat_gateways
+ else:
+ print(f"VPC '{vpc_name}' not found.")
+ return None
+ except Exception as e:
+ print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
+ raise
+
+
+# --- Helper to fetch all existing subnets in a VPC once ---
+def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
+ """
+ Fetches all subnets in a given VPC.
+ Returns a dictionary with 'by_name' (map of name to subnet data),
+ 'by_id' (map of id to subnet data), and 'cidr_networks' (list of ipaddress.IPv4Network).
+ """
+ ec2_client = boto3.client("ec2")
+ existing_subnets_data = {
+ "by_name": {}, # {subnet_name: {'id': 'subnet-id', 'cidr': 'x.x.x.x/x'}}
+ "by_id": {}, # {subnet_id: {'name': 'subnet-name', 'cidr': 'x.x.x.x/x'}}
+ "cidr_networks": [], # List of ipaddress.IPv4Network objects
+ }
+ try:
+ response = ec2_client.describe_subnets(
+ Filters=[{"Name": "vpc-id", "Values": [vpc_id]}]
+ )
+ for s in response.get("Subnets", []):
+ subnet_id = s["SubnetId"]
+ cidr_block = s.get("CidrBlock")
+ # Extract 'Name' tag, which is crucial for lookup by name
+ name_tag = next(
+ (tag["Value"] for tag in s.get("Tags", []) if tag["Key"] == "Name"),
+ None,
+ )
+
+ subnet_info = {"id": subnet_id, "cidr": cidr_block, "name": name_tag}
+
+ if name_tag:
+ existing_subnets_data["by_name"][name_tag] = subnet_info
+ existing_subnets_data["by_id"][subnet_id] = subnet_info
+
+ if cidr_block:
+ try:
+ existing_subnets_data["cidr_networks"].append(
+ ipaddress.ip_network(cidr_block, strict=False)
+ )
+ except ValueError:
+ print(
+ f"Warning: Existing subnet {subnet_id} has an invalid CIDR: {cidr_block}. Skipping for overlap check."
+ )
+
+ print(
+ f"Fetched {len(response.get('Subnets', []))} existing subnets from VPC '{vpc_id}'."
+ )
+ except Exception as e:
+ print(
+ f"Error describing existing subnets in VPC '{vpc_id}': {e}. Cannot perform full validation."
+ )
+ raise # Re-raise if this essential step fails
+
+ return existing_subnets_data
+
+
+# --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
+def validate_subnet_creation_parameters(
+ vpc_id: str,
+ proposed_subnets_data: List[
+ Dict[str, str]
+ ], # e.g., [{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}]
+ existing_aws_subnets_data: Dict[
+ str, Any
+ ], # Pre-fetched data from _get_existing_subnets_in_vpc
+) -> None:
+ """
+ Validates proposed subnet names and CIDR blocks against existing AWS subnets
+ in the specified VPC and against each other.
+ This function uses pre-fetched AWS subnet data.
+
+ Args:
+ vpc_id: The ID of the VPC (for logging/error messages).
+ proposed_subnets_data: A list of dictionaries, where each dict represents
+ a proposed subnet with 'name', 'cidr', and 'az'.
+ existing_aws_subnets_data: Dictionary containing existing AWS subnet data
+ (e.g., from _get_existing_subnets_in_vpc).
+
+ Raises:
+ ValueError: If any proposed subnet name or CIDR block
+ conflicts with existing AWS resources or other proposed resources.
+ """
+ if not proposed_subnets_data:
+ print("No proposed subnet data provided for validation. Skipping.")
+ return
+
+ print(
+ f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---"
+ )
+
+ print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data["by_name"]))
+
+ existing_aws_subnet_names = set(existing_aws_subnets_data["by_name"].keys())
+ existing_aws_cidr_networks = existing_aws_subnets_data["cidr_networks"]
+
+ # Sets to track names and list to track networks for internal batch consistency
+ proposed_names_seen: set[str] = set()
+ proposed_cidr_networks_seen: List[ipaddress.IPv4Network] = []
+
+ for i, proposed_subnet in enumerate(proposed_subnets_data):
+ subnet_name = proposed_subnet.get("name")
+ cidr_block_str = proposed_subnet.get("cidr")
+ availability_zone = proposed_subnet.get("az")
+
+ if not all([subnet_name, cidr_block_str, availability_zone]):
+ raise ValueError(
+ f"Proposed subnet at index {i} is incomplete. Requires 'name', 'cidr', and 'az'."
+ )
+
+ # 1. Check for duplicate names within the proposed batch
+ if subnet_name in proposed_names_seen:
+ raise ValueError(
+ f"Proposed subnet name '{subnet_name}' is duplicated within the input list."
+ )
+ proposed_names_seen.add(subnet_name)
+
+ # 2. Check for duplicate names against existing AWS subnets
+ if subnet_name in existing_aws_subnet_names:
+ print(
+ f"Proposed subnet name '{subnet_name}' already exists in VPC '{vpc_id}'."
+ )
+
+ # Parse proposed CIDR
+ try:
+ proposed_net = ipaddress.ip_network(cidr_block_str, strict=False)
+ except ValueError as e:
+ raise ValueError(
+ f"Invalid CIDR format '{cidr_block_str}' for proposed subnet '{subnet_name}': {e}"
+ )
+
+ # 3. Check for overlapping CIDRs within the proposed batch
+ for existing_proposed_net in proposed_cidr_networks_seen:
+ if proposed_net.overlaps(existing_proposed_net):
+ raise ValueError(
+ f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
+ f"overlaps with another proposed CIDR '{str(existing_proposed_net)}' "
+ f"within the same batch."
+ )
+
+ # 4. Check for overlapping CIDRs against existing AWS subnets
+ for existing_aws_net in existing_aws_cidr_networks:
+ if proposed_net.overlaps(existing_aws_net):
+ raise ValueError(
+ f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
+ f"overlaps with an existing AWS subnet CIDR '{str(existing_aws_net)}' "
+ f"in VPC '{vpc_id}'."
+ )
+
+ # If all checks pass for this subnet, add its network to the list for subsequent checks
+ proposed_cidr_networks_seen.append(proposed_net)
+ print(
+ f"Validation successful for proposed subnet '{subnet_name}' with CIDR '{cidr_block_str}'."
+ )
+
+ print(
+ f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---"
+ )
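+
+
+# Illustrative usage sketch (commented out): fetch existing subnets once, then
+# validate a proposed batch against them before synthesis. The VPC ID, names,
+# CIDRs, and AZs below are hypothetical placeholders.
+#
+# existing = _get_existing_subnets_in_vpc("vpc-0123456789abcdef0")
+# validate_subnet_creation_parameters(
+#     "vpc-0123456789abcdef0",
+#     [
+#         {"name": "my-public-subnet-1", "cidr": "10.0.1.0/24", "az": "us-east-1a"},
+#         {"name": "my-private-subnet-1", "cidr": "10.0.2.0/24", "az": "us-east-1b"},
+#     ],
+#     existing,
+# )  # Raises ValueError on duplicate or overlapping proposals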
+
+
+# --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
+def check_subnet_exists_by_name(
+ subnet_name: str, existing_aws_subnets_data: Dict[str, Any]
+) -> Tuple[bool, Optional[str]]:
+ """
+ Checks if a subnet with the given name exists within the pre-fetched data.
+
+ Args:
+ subnet_name: The 'Name' tag value of the subnet to check.
+ existing_aws_subnets_data: Dictionary containing existing AWS subnet data
+ (e.g., from _get_existing_subnets_in_vpc).
+
+ Returns:
+ A tuple:
+ - The first element is True if the subnet exists, False otherwise.
+ - The second element is the Subnet ID if found, None otherwise.
+ """
+ subnet_info = existing_aws_subnets_data["by_name"].get(subnet_name)
+ if subnet_info:
+ print(f"Subnet '{subnet_name}' found with ID: {subnet_info['id']}")
+ return True, subnet_info["id"]
+ else:
+ print(f"Subnet '{subnet_name}' not found.")
+ return False, None
+
+
+def create_nat_gateway(
+ scope: Construct,
+ public_subnet_for_nat: ec2.ISubnet, # Expects a proper ISubnet
+ nat_gateway_name: str,
+ nat_gateway_id_context_key: str,
+) -> str:
+ """
+ Creates a single NAT Gateway in the specified public subnet.
+ It does not handle lookup from context; the calling stack should do that.
+ Returns the CloudFormation Ref of the NAT Gateway ID.
+ """
+ print(
+ f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{public_subnet_for_nat.subnet_id}'."
+ )
+
+ # Create an Elastic IP for the NAT Gateway
+ eip = ec2.CfnEIP(
+ scope,
+ NAT_GATEWAY_EIP_NAME,
+ tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)],
+ )
+
+ # Create the NAT Gateway
+ nat_gateway_logical_id = nat_gateway_name.replace("-", "") + "NatGateway"
+ nat_gateway = ec2.CfnNatGateway(
+ scope,
+ nat_gateway_logical_id,
+ subnet_id=public_subnet_for_nat.subnet_id, # Associate with the public subnet
+ allocation_id=eip.attr_allocation_id, # Associate with the EIP
+ tags=[CfnTag(key="Name", value=nat_gateway_name)],
+ )
+ # The NAT GW depends on the EIP. The dependency on the subnet is implicit via subnet_id.
+ nat_gateway.add_dependency(eip)
+
+ # *** CRUCIAL: Use CfnOutput to export the ID after deployment ***
+ # This is how you will get the ID to put into cdk.context.json
+ CfnOutput(
+ scope,
+ "SingleNatGatewayIdOutput",
+ value=nat_gateway.ref,
+ description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
+ export_name=f"{scope.stack_name}-NatGatewayId", # Make export name unique
+ )
+
+ print(
+ f"CDK: Defined new NAT Gateway '{nat_gateway.ref}'. Its physical ID will be available in the stack outputs after deployment."
+ )
+ # Return the tokenised reference for use within this synthesis
+ return nat_gateway.ref
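+
+
+# Illustrative sketch: after the first deployment, the stack output above reports
+# the NAT Gateway's physical ID. Adding it to cdk.context.json under the key passed
+# as nat_gateway_id_context_key lets later synths reuse the gateway instead of
+# defining a new one. The key name and ID below are made-up examples:
+#
+# {
+#     "single_nat_gateway_id": "nat-0123456789abcdef0"
+# }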
+
+
+def create_subnets(
+ scope: Construct,
+ vpc: ec2.IVpc,
+ prefix: str,
+ subnet_names: List[str],
+ cidr_blocks: List[str],
+ availability_zones: List[str],
+ is_public: bool,
+ internet_gateway_id: Optional[str] = None,
+ single_nat_gateway_id: Optional[str] = None,
+) -> Tuple[List[ec2.Subnet], List[ec2.IRouteTable]]:
+    """
+    Creates subnets using L2 constructs and returns the L2 Subnet objects
+    together with their associated route tables.
+    """
+ # --- Validations remain the same ---
+ if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
+ raise ValueError(
+ "Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length."
+ )
+ if is_public and not internet_gateway_id:
+ raise ValueError("internet_gateway_id must be provided for public subnets.")
+ if not is_public and not single_nat_gateway_id:
+ raise ValueError(
+ "single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway."
+ )
+
+    # --- These lists collect the L2 subnets and route tables to return ---
+    created_subnets: List[ec2.Subnet] = []
+    created_route_tables: List[ec2.IRouteTable] = []
+
+ subnet_type_tag = "public" if is_public else "private"
+
+ for i, subnet_name in enumerate(subnet_names):
+ logical_id = f"{prefix}{subnet_type_tag.capitalize()}Subnet{i+1}"
+
+ # 1. Create the L2 Subnet (this is the easy part)
+ subnet = ec2.Subnet(
+ scope,
+ logical_id,
+ vpc_id=vpc.vpc_id,
+ cidr_block=cidr_blocks[i],
+ availability_zone=availability_zones[i],
+ map_public_ip_on_launch=is_public,
+ )
+ Tags.of(subnet).add("Name", subnet_name)
+ Tags.of(subnet).add("Type", subnet_type_tag)
+
+ if is_public:
+ # The subnet's route_table is automatically created by the L2 Subnet construct
+ try:
+ subnet.add_route(
+ "DefaultInternetRoute", # A logical ID for the CfnRoute resource
+ router_id=internet_gateway_id,
+ router_type=ec2.RouterType.GATEWAY,
+ # destination_cidr_block="0.0.0.0/0" is the default for this method
+ )
+ except Exception as e:
+ print("Could not create IGW route for public subnet due to:", e)
+ print(f"CDK: Defined public L2 subnet '{subnet_name}' and added IGW route.")
+ else:
+ try:
+ # Using .add_route() for private subnets as well for consistency
+ subnet.add_route(
+ "DefaultNatRoute", # A logical ID for the CfnRoute resource
+ router_id=single_nat_gateway_id,
+ router_type=ec2.RouterType.NAT_GATEWAY,
+ )
+ except Exception as e:
+                print("Could not create NAT gateway route for private subnet due to:", e)
+ print(
+ f"CDK: Defined private L2 subnet '{subnet_name}' and added NAT GW route."
+ )
+
+ route_table = subnet.route_table
+
+ created_subnets.append(subnet)
+ created_route_tables.append(route_table)
+
+ return created_subnets, created_route_tables
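+
+
+# Illustrative usage sketch (commented out): creating two public and two private
+# subnets from inside a stack. The prefix, names, CIDRs, AZs, IGW ID, and NAT
+# Gateway ID are hypothetical placeholders.
+#
+# public_subnets, public_rts = create_subnets(
+#     self, vpc, "MyApp", ["myapp-public-1", "myapp-public-2"],
+#     ["10.0.0.0/26", "10.0.0.64/26"], ["us-east-1a", "us-east-1b"],
+#     is_public=True, internet_gateway_id="igw-0123456789abcdef0",
+# )
+# private_subnets, private_rts = create_subnets(
+#     self, vpc, "MyApp", ["myapp-private-1", "myapp-private-2"],
+#     ["10.0.0.128/26", "10.0.0.192/26"], ["us-east-1a", "us-east-1b"],
+#     is_public=False, single_nat_gateway_id="nat-0123456789abcdef0",
+# )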
+
+
+def ingress_rule_exists(security_group: ec2.ISecurityGroup, peer: ec2.IPeer, port: Optional[ec2.Port] = None):
+ for rule in security_group.connections.security_groups:
+ if port:
+ if rule.peer == peer and rule.connection == port:
+ return True
+ else:
+ if rule.peer == peer:
+ return True
+ return False
+
+
+def check_for_existing_user_pool(user_pool_name: str):
+ cognito_client = boto3.client("cognito-idp")
+ list_pools_response = cognito_client.list_user_pools(
+ MaxResults=60
+ ) # MaxResults up to 60
+
+ # ListUserPools might require pagination if you have more than 60 pools
+ # This simple example doesn't handle pagination, which could miss your pool
+
+ existing_user_pool_id = ""
+
+ for pool in list_pools_response.get("UserPools", []):
+ if pool.get("Name") == user_pool_name:
+ existing_user_pool_id = pool["Id"]
+ print(
+ f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}"
+ )
+ break # Found the one we're looking for
+
+ if existing_user_pool_id:
+ return True, existing_user_pool_id, pool
+ else:
+ return False, "", ""
+
+
+def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
+ """
+ Checks if a Cognito User Pool Client with the given name exists in the specified User Pool.
+
+ Args:
+ user_pool_id: The ID of the Cognito User Pool.
+ user_pool_client_name: The name of the User Pool Client to check for.
+
+ Returns:
+ A tuple:
+ - True, client_id, client_details if the client exists.
+ - False, "", {} otherwise.
+ """
+ cognito_client = boto3.client("cognito-idp")
+    next_token = None
+
+    while True:
+        try:
+            # Only pass NextToken once a real pagination token has been returned
+            kwargs = {"UserPoolId": user_pool_id, "MaxResults": 60}
+            if next_token:
+                kwargs["NextToken"] = next_token
+            response = cognito_client.list_user_pool_clients(**kwargs)
+        except cognito_client.exceptions.ResourceNotFoundException:
+            print(f"Error: User pool with ID '{user_pool_id}' not found.")
+            return False, "", {}
+
+        except cognito_client.exceptions.InvalidParameterException:
+            print(f"Error: No app clients for '{user_pool_id}' found.")
+            return False, "", {}
+
+        except Exception as e:
+            print("Could not check User Pool clients due to:", e)
+            return False, "", {}
+
+ for client in response.get("UserPoolClients", []):
+ if client.get("ClientName") == user_pool_client_name:
+ print(
+ f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}"
+ )
+ return True, client["ClientId"], client
+
+ next_token = response.get("NextToken")
+ if not next_token:
+ break
+
+ return False, "", {}
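+
+
+# Illustrative usage sketch (commented out): check for an existing user pool and,
+# if present, for a matching app client. Pool and client names are hypothetical.
+#
+# pool_exists, pool_id, _pool = check_for_existing_user_pool("my-redaction-user-pool")
+# if pool_exists:
+#     client_exists, client_id, _client = check_for_existing_user_pool_client(
+#         pool_id, "my-redaction-app-client"
+#     )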
+
+
+def check_for_secret(secret_name: str, secret_value: dict = None):
+    """
+    Checks if a Secrets Manager secret with the given name exists.
+
+    Args:
+        secret_name: The name of the Secrets Manager secret.
+        secret_value: Placeholder for the secret's key-value pairs
+                      (not used by this function).
+
+    Returns:
+        A tuple: (True, the GetSecretValue response) if the secret exists,
+        (False, {}) otherwise.
+    """
+ secretsmanager_client = boto3.client("secretsmanager")
+
+ try:
+ # Try to get the secret. If it doesn't exist, a ResourceNotFoundException will be raised.
+ secret_value = secretsmanager_client.get_secret_value(SecretId=secret_name)
+ print("Secret already exists.")
+ return True, secret_value
+ except secretsmanager_client.exceptions.ResourceNotFoundException:
+ print("Secret not found")
+ return False, {}
+ except Exception as e:
+ # Handle other potential exceptions during the get operation
+ print(f"Error checking for secret: {e}")
+ return False, {}
+
+
+def check_alb_exists(
+ load_balancer_name: str, region_name: str = None
+) -> tuple[bool, dict]:
+ """
+ Checks if an Application Load Balancer (ALB) with the given name exists.
+
+ Args:
+ load_balancer_name: The name of the ALB to check.
+ region_name: The AWS region to check in. If None, uses the default
+ session region.
+
+ Returns:
+ A tuple:
+ - The first element is True if the ALB exists, False otherwise.
+        - The second element is the ALB object (dictionary) if found,
+          an empty dict otherwise. Specifically, it returns the first element of
+          the LoadBalancers list from the describe_load_balancers response.
+ """
+ if region_name:
+ elbv2_client = boto3.client("elbv2", region_name=region_name)
+ else:
+ elbv2_client = boto3.client("elbv2")
+ try:
+ response = elbv2_client.describe_load_balancers(Names=[load_balancer_name])
+ if response["LoadBalancers"]:
+ return (
+ True,
+ response["LoadBalancers"][0],
+ ) # Return True and the first ALB object
+ else:
+ return False, {}
+ except ClientError as e:
+ # If the error indicates the ALB doesn't exist, return False
+ if e.response["Error"]["Code"] == "LoadBalancerNotFound":
+ return False, {}
+ else:
+ # Re-raise other exceptions
+ raise
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}")
+ return False, {}
+
+
+def check_fargate_task_definition_exists(
+ task_definition_name: str, region_name: str = None
+) -> tuple[bool, dict]:
+ """
+ Checks if a Fargate task definition with the given name exists.
+
+ Args:
+ task_definition_name: The name or ARN of the task definition to check.
+ region_name: The AWS region to check in. If None, uses the default
+ session region.
+
+ Returns:
+ A tuple:
+ - The first element is True if the task definition exists, False otherwise.
+        - The second element is the task definition object (dictionary) if found,
+          an empty dict otherwise. Specifically, it returns the taskDefinition
+          object from the describe_task_definition response.
+ """
+ if region_name:
+ ecs_client = boto3.client("ecs", region_name=region_name)
+ else:
+ ecs_client = boto3.client("ecs")
+ try:
+ response = ecs_client.describe_task_definition(
+ taskDefinition=task_definition_name
+ )
+ # If describe_task_definition succeeds, it returns the task definition.
+ # We can directly return True and the task definition.
+ return True, response["taskDefinition"]
+ except ClientError as e:
+        # ECS raises a generic ClientException when the task definition cannot be
+        # found; read the message from the standard Error/Message path.
+        error = e.response.get("Error", {})
+        if error.get("Code") == "ClientException" and (
+            "does not exist" in error.get("Message", "")
+            or "Unable to describe task definition" in error.get("Message", "")
+        ):
+ return False, {}
+ else:
+ # Re-raise other exceptions.
+ raise
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}")
+ return False, {}
+
+
+def check_ecs_service_exists(
+ cluster_name: str, service_name: str, region_name: str = None
+) -> tuple[bool, dict]:
+ """
+ Checks if an ECS service with the given name exists in the specified cluster.
+
+ Args:
+ cluster_name: The name or ARN of the ECS cluster.
+ service_name: The name of the ECS service to check.
+ region_name: The AWS region to check in. If None, uses the default
+ session region.
+
+ Returns:
+ A tuple:
+ - The first element is True if the service exists, False otherwise.
+ - The second element is the service object (dictionary) if found,
+ None otherwise.
+ """
+ if region_name:
+ ecs_client = boto3.client("ecs", region_name=region_name)
+ else:
+ ecs_client = boto3.client("ecs")
+ try:
+ response = ecs_client.describe_services(
+ cluster=cluster_name, services=[service_name]
+ )
+ if response["services"]:
+ return (
+ True,
+ response["services"][0],
+ ) # Return True and the first service object
+ else:
+ return False, {}
+ except ClientError as e:
+ # Check for the error code indicating the service doesn't exist.
+ if e.response["Error"]["Code"] == "ClusterNotFoundException":
+ return False, {}
+ elif e.response["Error"]["Code"] == "ServiceNotFoundException":
+ return False, {}
+ else:
+ # Re-raise other exceptions.
+ raise
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}")
+ return False, {}
+
+
+def check_cloudfront_distribution_exists(
+ distribution_name: str, region_name: str = None
+) -> tuple[bool, dict | None]:
+ """
+ Checks if a CloudFront distribution with the given name exists.
+
+ Args:
+ distribution_name: The name of the CloudFront distribution to check.
+ region_name: The AWS region to check in. If None, uses the default
+ session region. Note: CloudFront is a global service,
+ so the region is usually 'us-east-1', but this parameter
+ is included for completeness.
+
+ Returns:
+ A tuple:
+ - The first element is True if the distribution exists, False otherwise.
+ - The second element is the distribution object (dictionary) if found,
+ None otherwise. Specifically, it returns the first element of the
+ DistributionList from the ListDistributions response.
+ """
+ if region_name:
+ cf_client = boto3.client("cloudfront", region_name=region_name)
+ else:
+ cf_client = boto3.client("cloudfront")
+ try:
+ response = cf_client.list_distributions()
+ if "Items" in response["DistributionList"]:
+ for distribution in response["DistributionList"]["Items"]:
+                # CloudFront doesn't filter by name directly, so iterate and match
+                # against the distribution's aliases (the "Aliases" key).
+                aliases = distribution.get("Aliases", {}).get("Items", [])
+                if aliases and distribution_name in aliases:
+                    return True, distribution
+ return False, None
+ else:
+ return False, None
+ except ClientError as e:
+ # If the error indicates the Distribution doesn't exist, return False
+ if e.response["Error"]["Code"] == "NoSuchDistribution":
+ return False, None
+ else:
+ # Re-raise other exceptions
+ raise
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}")
+ return False, None
+
+
+def create_web_acl_with_common_rules(
+ scope: Construct, web_acl_name: str, waf_scope: str = "CLOUDFRONT"
+):
+ """
+ Use CDK to create a web ACL based on an AWS common rule set with overrides.
+ This function now expects a 'scope' argument, typically 'self' from your stack,
+ as CfnWebACL requires a construct scope.
+ """
+
+ # Create full list of rules
+ rules = []
+ aws_ruleset_names = [
+ "AWSManagedRulesCommonRuleSet",
+ "AWSManagedRulesKnownBadInputsRuleSet",
+ "AWSManagedRulesAmazonIpReputationList",
+ ]
+
+ # Use a separate counter to assign unique priorities sequentially
+ priority_counter = 1
+
+ for aws_rule_name in aws_ruleset_names:
+ current_rule_action_overrides = None
+
+ # All managed rule groups need an override_action.
+ # 'none' means use the managed rule group's default action.
+ current_override_action = wafv2.CfnWebACL.OverrideActionProperty(none={})
+
+ current_priority = priority_counter
+ priority_counter += 1
+
+ if aws_rule_name == "AWSManagedRulesCommonRuleSet":
+ current_rule_action_overrides = [
+ wafv2.CfnWebACL.RuleActionOverrideProperty(
+ name="SizeRestrictions_BODY",
+ action_to_use=wafv2.CfnWebACL.RuleActionProperty(allow={}),
+ )
+ ]
+ # No need to set current_override_action here, it's already set above.
+ # If you wanted this specific rule to have a *fixed* priority, you'd handle it differently
+ # For now, it will get priority 1 from the counter.
+
+ rule_property = wafv2.CfnWebACL.RuleProperty(
+ name=aws_rule_name,
+ priority=current_priority,
+ statement=wafv2.CfnWebACL.StatementProperty(
+ managed_rule_group_statement=wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
+ vendor_name="AWS",
+ name=aws_rule_name,
+ rule_action_overrides=current_rule_action_overrides,
+ )
+ ),
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
+ cloud_watch_metrics_enabled=True,
+ metric_name=aws_rule_name,
+ sampled_requests_enabled=True,
+ ),
+            override_action=current_override_action,  # Required for all managed rule groups
+ )
+
+ rules.append(rule_property)
+
+ # Add the rate limit rule
+ rate_limit_priority = priority_counter # Use the next available priority
+ rules.append(
+ wafv2.CfnWebACL.RuleProperty(
+ name="RateLimitRule",
+ priority=rate_limit_priority,
+ statement=wafv2.CfnWebACL.StatementProperty(
+ rate_based_statement=wafv2.CfnWebACL.RateBasedStatementProperty(
+ limit=1000, aggregate_key_type="IP"
+ )
+ ),
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
+ cloud_watch_metrics_enabled=True,
+ metric_name="RateLimitRule",
+ sampled_requests_enabled=True,
+ ),
+ action=wafv2.CfnWebACL.RuleActionProperty(block={}),
+ )
+ )
+
+ web_acl = wafv2.CfnWebACL(
+ scope,
+ "WebACL",
+ name=web_acl_name,
+ default_action=wafv2.CfnWebACL.DefaultActionProperty(allow={}),
+ scope=waf_scope,
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
+ cloud_watch_metrics_enabled=True,
+ metric_name="webACL",
+ sampled_requests_enabled=True,
+ ),
+ rules=rules,
+ )
+
+ CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)
+
+ return web_acl
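+
+
+# Illustrative usage sketch (commented out, assumes the aws_cloudfront and
+# aws_cloudfront_origins modules are imported as cloudfront / origins): create the
+# web ACL inside a stack and attach it to a CloudFront distribution via its ARN.
+# The names and origin domain are hypothetical, and a CLOUDFRONT-scoped ACL must
+# be deployed in us-east-1.
+#
+# web_acl = create_web_acl_with_common_rules(self, "my-app-web-acl", waf_scope="CLOUDFRONT")
+# distribution = cloudfront.Distribution(
+#     self,
+#     "Distribution",
+#     default_behavior=cloudfront.BehaviorOptions(origin=origins.HttpOrigin("example.com")),
+#     web_acl_id=web_acl.attr_arn,
+# )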
+
+
+def check_web_acl_exists(
+ web_acl_name: str, scope: str, region_name: str = None
+) -> tuple[bool, dict]:
+ """
+ Checks if a Web ACL with the given name and scope exists.
+
+ Args:
+ web_acl_name: The name of the Web ACL to check.
+ scope: The scope of the Web ACL ('CLOUDFRONT' or 'REGIONAL').
+ region_name: The AWS region to check in. Required for REGIONAL scope.
+ If None, uses the default session region. For CLOUDFRONT,
+ the region should be 'us-east-1'.
+
+ Returns:
+ A tuple:
+ - The first element is True if the Web ACL exists, False otherwise.
+ - The second element is the Web ACL object (dictionary) if found,
+ None otherwise.
+ """
+ if scope not in ["CLOUDFRONT", "REGIONAL"]:
+ raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")
+
+ if scope == "REGIONAL" and not region_name:
+ raise ValueError("Region name is required for REGIONAL scope")
+
+ if scope == "CLOUDFRONT":
+ region_name = "us-east-1" # CloudFront scope requires us-east-1
+
+ if region_name:
+ waf_client = boto3.client("wafv2", region_name=region_name)
+ else:
+ waf_client = boto3.client("wafv2")
+ try:
+ response = waf_client.list_web_acls(Scope=scope)
+ if "WebACLs" in response:
+ for web_acl in response["WebACLs"]:
+ if web_acl["Name"] == web_acl_name:
+                    # Fetch the full Web ACL object (get_web_acl needs the ACL Id)
+                    describe_response = waf_client.get_web_acl(
+                        Name=web_acl_name, Scope=scope, Id=web_acl["Id"]
+                    )
+ return True, describe_response["WebACL"]
+ return False, {}
+ else:
+ return False, {}
+ except ClientError as e:
+ # Check for the error code indicating the web ACL doesn't exist.
+ if e.response["Error"]["Code"] == "ResourceNotFoundException":
+ return False, {}
+ else:
+ # Re-raise other exceptions.
+ raise
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}")
+ return False, {}
+
+
+def add_alb_https_listener_with_cert(
+ scope: Construct,
+ logical_id: str, # A unique ID for this listener construct
+ alb: elb.ApplicationLoadBalancer,
+ acm_certificate_arn: Optional[
+ str
+ ], # Optional: If None, no HTTPS listener will be created
+ default_target_group: elb.ITargetGroup, # Mandatory: The target group to forward traffic to
+ listener_port_https: int = 443,
+ listener_open_to_internet: bool = False, # Be cautious with True, ensure ALB security group restricts access
+ # --- Cognito Authentication Parameters ---
+ enable_cognito_auth: bool = False,
+ cognito_user_pool: Optional[cognito.IUserPool] = None,
+ cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
+ cognito_user_pool_domain: Optional[
+ str
+ ] = None, # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
+ cognito_auth_scope: Optional[
+ str
+ ] = "openid profile email", # Default recommended scope
+ cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
+ stickiness_cookie_duration=None,
+ # --- End Cognito Parameters ---
+) -> Optional[elb.ApplicationListener]:
+ """
+ Conditionally adds an HTTPS listener to an ALB with an ACM certificate,
+ and optionally enables Cognito User Pool authentication.
+
+ Args:
+ scope (Construct): The scope in which to define this construct (e.g., your CDK Stack).
+ logical_id (str): A unique logical ID for the listener construct within the stack.
+ alb (elb.ApplicationLoadBalancer): The Application Load Balancer to add the listener to.
+ acm_certificate_arn (Optional[str]): The ARN of the ACM certificate to attach.
+ If None, the HTTPS listener will NOT be created.
+ default_target_group (elb.ITargetGroup): The default target group for the listener to forward traffic to.
+ This is mandatory for a functional listener.
+ listener_port_https (int): The HTTPS port to listen on (default: 443).
+ listener_open_to_internet (bool): Whether the listener should allow connections from all sources.
+ If False (recommended), ensure your ALB's security group allows
+ inbound traffic on this port from desired sources.
+ enable_cognito_auth (bool): Set to True to enable Cognito User Pool authentication.
+ cognito_user_pool (Optional[cognito.IUserPool]): The Cognito User Pool object. Required if enable_cognito_auth is True.
+ cognito_user_pool_client (Optional[cognito.IUserPoolClient]): The Cognito User Pool App Client object. Required if enable_cognito_auth is True.
+ cognito_user_pool_domain (Optional[str]): The domain prefix for your Cognito User Pool. Required if enable_cognito_auth is True.
+ cognito_auth_scope (Optional[str]): The scope for the Cognito authentication.
+ cognito_auth_on_unauthenticated_request (elb.UnauthenticatedAction): Action for unauthenticated requests.
+ Defaults to AUTHENTICATE (redirect to login).
+
+ Returns:
+ Optional[elb.ApplicationListener]: The created ApplicationListener if successful,
+ None if no ACM certificate ARN was provided.
+ """
+ https_listener = None
+ if acm_certificate_arn:
+ certificates_list = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
+ print(
+ f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}"
+ )
+
+ # Determine the default action based on whether Cognito auth is enabled
+ default_action = None
+ if enable_cognito_auth is True:
+ if not all(
+ [cognito_user_pool, cognito_user_pool_client, cognito_user_pool_domain]
+ ):
+ raise ValueError(
+ "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
+ )
+ print(
+ f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}"
+ )
+
+ default_action = elb_act.AuthenticateCognitoAction(
+ next=elb.ListenerAction.forward(
+ [default_target_group]
+ ), # After successful auth, forward to TG
+ user_pool=cognito_user_pool,
+ user_pool_client=cognito_user_pool_client,
+ user_pool_domain=cognito_user_pool_domain,
+ scope=cognito_auth_scope,
+ on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
+ session_timeout=stickiness_cookie_duration,
+ # Additional options you might want to configure:
+ # session_cookie_name="AWSELBCookies"
+ )
+ else:
+ default_action = elb.ListenerAction.forward([default_target_group])
+ print("Cognito authentication is NOT enabled for this listener.")
+
+ # Add the HTTPS listener
+ https_listener = alb.add_listener(
+ logical_id,
+ port=listener_port_https,
+ open=listener_open_to_internet,
+ certificates=certificates_list,
+ default_action=default_action, # Use the determined default action
+ )
+ print(f"ALB HTTPS listener on port {listener_port_https} defined.")
+ else:
+ print("ACM_CERTIFICATE_ARN is not provided. Skipping HTTPS listener creation.")
+
+ return https_listener
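+
+
+# Illustrative usage sketch (commented out): add an HTTPS listener with Cognito
+# authentication in front of a target group. The alb, target_group, user_pool and
+# user_pool_client constructs are assumed to exist in the calling stack, the
+# certificate ARN and domain prefix are placeholders, and Duration is assumed to
+# be imported from aws_cdk.
+#
+# https_listener = add_alb_https_listener_with_cert(
+#     self,
+#     "HttpsListener",
+#     alb,
+#     acm_certificate_arn="arn:aws:acm:us-east-1:123456789012:certificate/example",
+#     default_target_group=target_group,
+#     enable_cognito_auth=True,
+#     cognito_user_pool=user_pool,
+#     cognito_user_pool_client=user_pool_client,
+#     cognito_user_pool_domain="my-app-domain",
+#     stickiness_cookie_duration=Duration.hours(8),
+# )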
+
+
+def ensure_folder_exists(output_folder: str):
+ """Checks if the specified folder exists, creates it if not."""
+
+ if not os.path.exists(output_folder):
+ # Create the folder if it doesn't exist
+ os.makedirs(output_folder, exist_ok=True)
+ print(f"Created the {output_folder} folder.")
+ else:
+ print(f"The {output_folder} folder already exists.")
+
+
+def create_basic_config_env(
+ out_dir: str = "config",
+ S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME,
+ S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME,
+ ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME,
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME,
+):
+ """
+ Create a basic config.env file for the user to use with their newly deployed redaction app.
+ """
+ variables = {
+ "COGNITO_AUTH": "True",
+ "RUN_AWS_FUNCTIONS": "True",
+ "DISPLAY_FILE_NAMES_IN_LOGS": "False",
+ "SESSION_OUTPUT_FOLDER": "True",
+ "SAVE_LOGS_TO_DYNAMODB": "True",
+ "SHOW_COSTS": "True",
+ "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS": "True",
+ "LOAD_PREVIOUS_TEXTRACT_JOBS_S3": "True",
+ "DOCUMENT_REDACTION_BUCKET": S3_LOG_CONFIG_BUCKET_NAME,
+ "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET": S3_OUTPUT_BUCKET_NAME,
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME": ACCESS_LOG_DYNAMODB_TABLE_NAME,
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME": FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ "USAGE_LOG_DYNAMODB_TABLE_NAME": USAGE_LOG_DYNAMODB_TABLE_NAME,
+ }
+
+ # Write variables to .env file
+ ensure_folder_exists(out_dir + "/")
+ env_file_path = os.path.abspath(os.path.join(out_dir, "config.env"))
+
+ # It's good practice to ensure the file exists before calling set_key repeatedly.
+ # set_key will create it, but for a loop, it might be cleaner to ensure it's empty/exists once.
+ if not os.path.exists(env_file_path):
+ with open(env_file_path, "w"):
+ pass # Create empty file
+
+ for key, value in variables.items():
+ set_key(env_file_path, key, str(value), quote_mode="never")
+
+ return variables
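+
+
+# Illustrative sketch of a few lines the function above writes to config/config.env
+# (set_key with quote_mode="never" writes plain KEY=value pairs; bucket and table
+# names come from the cdk_config values passed in):
+#
+# COGNITO_AUTH=True
+# RUN_AWS_FUNCTIONS=True
+# DOCUMENT_REDACTION_BUCKET=<your S3_LOG_CONFIG_BUCKET_NAME>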
+
+
+def start_codebuild_build(PROJECT_NAME: str, AWS_REGION: str = AWS_REGION):
+ """
+ Start an existing Codebuild project build
+ """
+
+ # --- Initialize CodeBuild client ---
+ client = boto3.client("codebuild", region_name=AWS_REGION)
+
+ try:
+ print(f"Attempting to start build for project: {PROJECT_NAME}")
+
+ response = client.start_build(projectName=PROJECT_NAME)
+
+ build_id = response["build"]["id"]
+ print(f"Successfully started build with ID: {build_id}")
+ print(f"Build ARN: {response['build']['arn']}")
+ print("Build URL (approximate - construct based on region and ID):")
+ print(
+ f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_id.split(':')[-1]}/detail"
+ )
+
+ # You can inspect the full response if needed
+ # print("\nFull response:")
+ # import json
+ # print(json.dumps(response, indent=2))
+
+ except client.exceptions.ResourceNotFoundException:
+ print(f"Error: Project '{PROJECT_NAME}' not found in region '{AWS_REGION}'.")
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}")
+
+
+def upload_file_to_s3(
+ local_file_paths: List[str],
+ s3_key: str,
+ s3_bucket: str,
+ RUN_AWS_FUNCTIONS: str = "1",
+):
+ """
+ Uploads a file from local machine to Amazon S3.
+
+ Args:
+    - local_file_paths: Local file path(s) of the file(s) to upload.
+    - s3_key: Key prefix in the S3 bucket; the file name is appended to it.
+    - s3_bucket: Name of the S3 bucket.
+    - RUN_AWS_FUNCTIONS: "1" to perform the upload, anything else to skip.
+
+ Returns:
+ - Message as variable/printed to console
+ """
+ final_out_message = []
+ final_out_message_str = ""
+
+ if RUN_AWS_FUNCTIONS == "1":
+ try:
+ if s3_bucket and local_file_paths:
+
+ s3_client = boto3.client("s3", region_name=AWS_REGION)
+
+ if isinstance(local_file_paths, str):
+ local_file_paths = [local_file_paths]
+
+ for file in local_file_paths:
+ if s3_client:
+ # print(s3_client)
+ try:
+ # Get file name off file path
+ file_name = os.path.basename(file)
+
+ s3_key_full = s3_key + file_name
+ print("S3 key: ", s3_key_full)
+
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
+ out_message = (
+ "File " + file_name + " uploaded successfully!"
+ )
+ print(out_message)
+
+ except Exception as e:
+ out_message = f"Error uploading file(s): {e}"
+ print(out_message)
+
+ final_out_message.append(out_message)
+ final_out_message_str = "\n".join(final_out_message)
+
+ else:
+ final_out_message_str = "Could not connect to AWS."
+ else:
+ final_out_message_str = (
+ "At least one essential variable is empty, could not upload to S3"
+ )
+ except Exception as e:
+ final_out_message_str = "Could not upload files to S3 due to: " + str(e)
+ print(final_out_message_str)
+ else:
+ final_out_message_str = "App not set to run AWS functions"
+
+ return final_out_message_str
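+
+
+# Illustrative usage sketch (commented out): upload a generated config file to the
+# config bucket. Bucket name and key prefix are hypothetical placeholders; the file
+# name is appended to the key prefix inside the function.
+#
+# message = upload_file_to_s3(
+#     ["config/config.env"], s3_key="config/", s3_bucket="my-log-config-bucket"
+# )
+# print(message)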
+
+
+# Start an ECS service by setting its desired task count to 1
+def start_ecs_task(cluster_name, service_name):
+ ecs_client = boto3.client("ecs")
+
+ try:
+ # Update the service to set the desired count to 1
+ ecs_client.update_service(
+ cluster=cluster_name, service=service_name, desiredCount=1
+ )
+ return {
+ "statusCode": 200,
+ "body": f"Service {service_name} in cluster {cluster_name} has been updated to 1 task.",
+ }
+ except Exception as e:
+ return {"statusCode": 500, "body": f"Error updating service: {str(e)}"}
diff --git a/cdk/cdk_stack.py b/cdk/cdk_stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..73d51c134eb90dc96f8d685113599c4b09e91da8
--- /dev/null
+++ b/cdk/cdk_stack.py
@@ -0,0 +1,1869 @@
+import json # You might still need json if loading task_definition.json
+import os
+from typing import Any, Dict, List
+
+from aws_cdk import (
+ CfnOutput, # <-- Import CfnOutput directly
+ Duration,
+ RemovalPolicy,
+ SecretValue,
+ Stack,
+)
+from aws_cdk import aws_cloudfront as cloudfront
+from aws_cdk import aws_cloudfront_origins as origins
+from aws_cdk import aws_codebuild as codebuild
+from aws_cdk import aws_cognito as cognito
+from aws_cdk import aws_dynamodb as dynamodb # Import the DynamoDB module
+from aws_cdk import aws_ec2 as ec2
+from aws_cdk import aws_ecr as ecr
+from aws_cdk import aws_ecs as ecs
+from aws_cdk import aws_elasticloadbalancingv2 as elbv2
+from aws_cdk import aws_iam as iam
+from aws_cdk import aws_kms as kms
+from aws_cdk import aws_logs as logs
+from aws_cdk import aws_s3 as s3
+from aws_cdk import aws_secretsmanager as secretsmanager
+from aws_cdk import aws_wafv2 as wafv2
+from cdk_config import (
+ ACCESS_LOG_DYNAMODB_TABLE_NAME,
+ ACM_SSL_CERTIFICATE_ARN,
+ ALB_NAME,
+ ALB_NAME_SECURITY_GROUP_NAME,
+ ALB_TARGET_GROUP_NAME,
+ AWS_ACCOUNT_ID,
+ AWS_MANAGED_TASK_ROLES_LIST,
+ AWS_REGION,
+ CDK_PREFIX,
+ CLOUDFRONT_DISTRIBUTION_NAME,
+ CLOUDFRONT_GEO_RESTRICTION,
+ CLUSTER_NAME,
+ CODEBUILD_PROJECT_NAME,
+ CODEBUILD_ROLE_NAME,
+ COGNITO_ACCESS_TOKEN_VALIDITY,
+ COGNITO_ID_TOKEN_VALIDITY,
+ COGNITO_REDIRECTION_URL,
+ COGNITO_REFRESH_TOKEN_VALIDITY,
+ COGNITO_USER_POOL_CLIENT_NAME,
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME,
+ COGNITO_USER_POOL_DOMAIN_PREFIX,
+ COGNITO_USER_POOL_NAME,
+ CUSTOM_HEADER,
+ CUSTOM_HEADER_VALUE,
+ CUSTOM_KMS_KEY_NAME,
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
+ ECR_CDK_REPO_NAME,
+ ECS_LOG_GROUP_NAME,
+ ECS_READ_ONLY_FILE_SYSTEM,
+ ECS_SECURITY_GROUP_NAME,
+ ECS_SERVICE_NAME,
+ ECS_TASK_CPU_SIZE,
+ ECS_TASK_EXECUTION_ROLE_NAME,
+ ECS_TASK_MEMORY_SIZE,
+ ECS_TASK_ROLE_NAME,
+ ECS_USE_FARGATE_SPOT,
+ EXISTING_IGW_ID,
+ FARGATE_TASK_DEFINITION_NAME,
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ GITHUB_REPO_BRANCH,
+ GITHUB_REPO_NAME,
+ GITHUB_REPO_USERNAME,
+ GRADIO_SERVER_PORT,
+ LOAD_BALANCER_WEB_ACL_NAME,
+ NAT_GATEWAY_NAME,
+ NEW_VPC_CIDR,
+ NEW_VPC_DEFAULT_NAME,
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
+ PRIVATE_SUBNET_CIDR_BLOCKS,
+ PRIVATE_SUBNETS_TO_USE,
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
+ PUBLIC_SUBNET_CIDR_BLOCKS,
+ PUBLIC_SUBNETS_TO_USE,
+ S3_LOG_CONFIG_BUCKET_NAME,
+ S3_OUTPUT_BUCKET_NAME,
+ SAVE_LOGS_TO_DYNAMODB,
+ SINGLE_NAT_GATEWAY_ID,
+ TASK_DEFINITION_FILE_LOCATION,
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
+ USE_CLOUDFRONT,
+ USE_CUSTOM_KMS_KEY,
+ VPC_NAME,
+ WEB_ACL_NAME,
+)
+from cdk_functions import ( # Only keep CDK-native functions
+ add_alb_https_listener_with_cert,
+ add_custom_policies,
+ create_nat_gateway,
+ create_subnets,
+ create_web_acl_with_common_rules,
+)
+from constructs import Construct
+
+
+def _get_env_list(env_var_value: str) -> List[str]:
+    """Parses a bracketed, comma-separated environment variable value into a list of strings."""
+    # Strip the surrounding brackets and any quote characters
+    value = env_var_value[1:-1].strip().replace('"', "").replace("'", "")
+ if not value:
+ return []
+ # Split by comma and filter out any empty strings that might result from extra commas
+ return [s.strip() for s in value.split(",") if s.strip()]
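+
+
+# Illustrative examples of the parsing above (values are made up):
+# _get_env_list("['subnet-a', 'subnet-b']") -> ["subnet-a", "subnet-b"]
+# _get_env_list("[10.0.1.0/24, 10.0.2.0/24]") -> ["10.0.1.0/24", "10.0.2.0/24"]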
+
+
+# 1. Try to load CIDR/AZs from environment variables
+if PUBLIC_SUBNETS_TO_USE:
+ PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
+if PRIVATE_SUBNETS_TO_USE:
+ PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
+
+if PUBLIC_SUBNET_CIDR_BLOCKS:
+    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
+if PUBLIC_SUBNET_AVAILABILITY_ZONES:
+    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
+if PRIVATE_SUBNET_CIDR_BLOCKS:
+    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
+if PRIVATE_SUBNET_AVAILABILITY_ZONES:
+    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
+
+if AWS_MANAGED_TASK_ROLES_LIST:
+ AWS_MANAGED_TASK_ROLES_LIST = _get_env_list(AWS_MANAGED_TASK_ROLES_LIST)
+
+
+class CdkStack(Stack):
+
+ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
+ super().__init__(scope, construct_id, **kwargs)
+
+ # --- Helper to get context values ---
+ def get_context_bool(key: str, default: bool = False) -> bool:
+ return self.node.try_get_context(key) or default
+
+ def get_context_str(key: str, default: str = None) -> str:
+ return self.node.try_get_context(key) or default
+
+ def get_context_dict(key: str, default: dict = None) -> dict:
+ return self.node.try_get_context(key) or default
+
+ def get_context_list_of_dicts(key: str) -> List[Dict[str, Any]]:
+ ctx_value = self.node.try_get_context(key)
+ if not isinstance(ctx_value, list):
+ print(
+ f"Warning: Context key '{key}' not found or not a list. Returning empty list."
+ )
+ return []
+ # Optional: Add validation that all items in the list are dicts
+ return ctx_value
+
+ self.template_options.description = "Deployment of the 'doc_redaction' PDF, image, and XLSX/CSV redaction app. Git repo available at: https://github.com/seanpedrick-case/doc_redaction."
+
+ # --- VPC and Subnets (Assuming VPC is always lookup, Subnets are created/returned by create_subnets) ---
+ new_vpc_created = False
+ if VPC_NAME:
+ print("Looking for current VPC:", VPC_NAME)
+ try:
+ vpc = ec2.Vpc.from_lookup(self, "VPC", vpc_name=VPC_NAME)
+ print("Successfully looked up VPC:", vpc.vpc_id)
+ except Exception as e:
+ raise Exception(
+ f"Could not look up VPC with name '{VPC_NAME}' due to: {e}"
+ )
+
+ elif NEW_VPC_DEFAULT_NAME:
+ new_vpc_created = True
+ print(
+ f"NEW_VPC_DEFAULT_NAME ('{NEW_VPC_DEFAULT_NAME}') is set. Creating a new VPC."
+ )
+
+ # Configuration for the new VPC
+ # You can make these configurable via context as well, e.g.,
+ # new_vpc_cidr = self.node.try_get_context("new_vpc_cidr") or "10.0.0.0/24"
+ # new_vpc_max_azs = self.node.try_get_context("new_vpc_max_azs") or 2 # Use 2 AZs by default for HA
+ # new_vpc_nat_gateways = self.node.try_get_context("new_vpc_nat_gateways") or new_vpc_max_azs # One NAT GW per AZ for HA
+ # or 1 for cost savings if acceptable
+ if not NEW_VPC_CIDR:
+                raise Exception(
+                    "App has been instructed to create a new VPC but no VPC CIDR range was provided in the NEW_VPC_CIDR variable"
+                )
+
+ print("Provided NEW_VPC_CIDR range:", NEW_VPC_CIDR)
+
+ new_vpc_cidr = NEW_VPC_CIDR
+ new_vpc_max_azs = 2 # Creates resources in 2 AZs. Adjust as needed.
+
+ # For "a NAT gateway", you can set nat_gateways=1.
+ # For resilience (NAT GW per AZ), set nat_gateways=new_vpc_max_azs.
+ # The Vpc construct will create NAT Gateway(s) if subnet_type PRIVATE_WITH_EGRESS is used
+ # and nat_gateways > 0.
+ new_vpc_nat_gateways = (
+ 1 # Creates a single NAT Gateway for cost-effectiveness.
+ )
+ # If you need one per AZ for higher availability, set this to new_vpc_max_azs.
+
+ vpc = ec2.Vpc(
+ self,
+ "MyNewLogicalVpc", # This is the CDK construct ID
+ vpc_name=NEW_VPC_DEFAULT_NAME,
+ ip_addresses=ec2.IpAddresses.cidr(new_vpc_cidr),
+ max_azs=new_vpc_max_azs,
+ nat_gateways=new_vpc_nat_gateways, # Number of NAT gateways to create
+ subnet_configuration=[
+ ec2.SubnetConfiguration(
+ name="Public", # Name prefix for public subnets
+ subnet_type=ec2.SubnetType.PUBLIC,
+ cidr_mask=28, # Adjust CIDR mask as needed (e.g., /24 provides ~250 IPs per subnet)
+ ),
+ ec2.SubnetConfiguration(
+ name="Private", # Name prefix for private subnets
+ subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, # Ensures these subnets have NAT Gateway access
+ cidr_mask=28, # Adjust CIDR mask as needed
+ ),
+ # You could also add ec2.SubnetType.PRIVATE_ISOLATED if needed
+ ],
+ # Internet Gateway is created and configured automatically for PUBLIC subnets.
+ # Route tables for public subnets will point to the IGW.
+ # Route tables for PRIVATE_WITH_EGRESS subnets will point to the NAT Gateway(s).
+ )
+ print(
+ f"Successfully created new VPC: {vpc.vpc_id} with name '{NEW_VPC_DEFAULT_NAME}'"
+ )
+ # If nat_gateways > 0, vpc.nat_gateway_ips will contain EIPs if Vpc created them.
+ # vpc.public_subnets, vpc.private_subnets, vpc.isolated_subnets are populated.
+
+ else:
+            raise Exception(
+                "VPC_NAME for an existing VPC was not provided, and NEW_VPC_DEFAULT_NAME was not provided to create a new one"
+            )
+
+ # --- Subnet Handling (Check Context and Create/Import) ---
+ # Initialize lists to hold ISubnet objects (L2) and CfnSubnet/CfnRouteTable (L1)
+ # We will store ISubnet for consistency, as CfnSubnet has a .subnet_id property
+ self.public_subnets: List[ec2.ISubnet] = []
+ self.private_subnets: List[ec2.ISubnet] = []
+ # Store L1 CfnRouteTables explicitly if you need to reference them later
+ self.private_route_tables_cfn: List[ec2.CfnRouteTable] = []
+ self.public_route_tables_cfn: List[ec2.CfnRouteTable] = (
+ []
+ ) # New: to store public RTs
+
+ names_to_create_private = []
+ names_to_create_public = []
+
+ if not PUBLIC_SUBNETS_TO_USE and not PRIVATE_SUBNETS_TO_USE:
+ print(
+ "Warning: No public or private subnets specified in *_SUBNETS_TO_USE. Attempting to select from existing VPC subnets."
+ )
+
+ print("vpc.public_subnets:", vpc.public_subnets)
+ print("vpc.private_subnets:", vpc.private_subnets)
+
+ if (
+ vpc.public_subnets
+ ): # These are already one_per_az if max_azs was used and Vpc created them
+ self.public_subnets.extend(vpc.public_subnets)
+ else:
+ self.node.add_warning("No public subnets found in the VPC.")
+
+ # Get private subnets with egress specifically
+ # selected_private_subnets_with_egress = vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
+
+ print(
+ f"Selected from VPC: {len(self.public_subnets)} public, {len(self.private_subnets)} private_with_egress subnets."
+ )
+
+ if (
+ len(self.public_subnets) < 1 or len(self.private_subnets) < 1
+ ): # Simplified check for new VPC
+ # If new_vpc_max_azs was 1, you'd have 1 of each. If 2, then 2 of each.
+ # The original check ' < 2' might be too strict if new_vpc_max_azs=1
+ pass # For new VPC, allow single AZ setups if configured that way. The VPC construct ensures one per AZ up to max_azs.
+
+ if not self.public_subnets and not self.private_subnets:
+ print(
+ "Error: No public or private subnets could be found in the VPC for automatic selection. "
+ "You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets."
+ )
+ raise RuntimeError("No suitable subnets found for automatic selection.")
+ else:
+ print(
+ f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC properties."
+ )
+
+ selected_public_subnets = vpc.select_subnets(
+ subnet_type=ec2.SubnetType.PUBLIC, one_per_az=True
+ )
+ private_subnets_egress = vpc.select_subnets(
+ subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, one_per_az=True
+ )
+
+ if private_subnets_egress.subnets:
+ self.private_subnets.extend(private_subnets_egress.subnets)
+ else:
+ self.node.add_warning(
+ "No PRIVATE_WITH_EGRESS subnets found in the VPC."
+ )
+
+ try:
+ private_subnets_isolated = vpc.select_subnets(
+ subnet_type=ec2.SubnetType.PRIVATE_ISOLATED, one_per_az=True
+ )
+ except Exception as e:
+ private_subnets_isolated = []
+ print("Could not find any isolated subnets due to:", e)
+
+ ###
+ combined_subnet_objects = []
+
+ if private_subnets_isolated:
+ if private_subnets_egress.subnets:
+ # Add the first PRIVATE_WITH_EGRESS subnet
+ combined_subnet_objects.append(private_subnets_egress.subnets[0])
+ elif not private_subnets_isolated:
+ if private_subnets_egress.subnets:
+ # Add the first PRIVATE_WITH_EGRESS subnet
+ combined_subnet_objects.extend(private_subnets_egress.subnets)
+ else:
+ self.node.add_warning(
+ "No PRIVATE_WITH_EGRESS subnets found to select the first one."
+ )
+
+ # Add all PRIVATE_ISOLATED subnets *except* the first one (if they exist)
+ try:
+ if len(private_subnets_isolated.subnets) > 1:
+ combined_subnet_objects.extend(private_subnets_isolated.subnets[1:])
+ elif (
+ private_subnets_isolated.subnets
+ ): # Only 1 isolated subnet, add a warning if [1:] was desired
+ self.node.add_warning(
+ "Only one PRIVATE_ISOLATED subnet found, private_subnets_isolated.subnets[1:] will be empty."
+ )
+ else:
+ self.node.add_warning("No PRIVATE_ISOLATED subnets found.")
+ except Exception as e:
+ print("Could not identify private isolated subnets due to:", e)
+
+ # Create an ec2.SelectedSubnets object from the combined private subnet list.
+ selected_private_subnets = vpc.select_subnets(
+ subnets=combined_subnet_objects
+ )
+
+ print("selected_public_subnets:", selected_public_subnets)
+ print("selected_private_subnets:", selected_private_subnets)
+
+ if (
+ len(selected_public_subnets.subnet_ids) < 2
+ or len(selected_private_subnets.subnet_ids) < 2
+ ):
+                raise Exception(
+                    "Need at least two public and two private subnets, in different availability zones"
+                )
+
+ if not selected_public_subnets and not selected_private_subnets:
+ # If no subnets could be found even with automatic selection, raise an error.
+ # This ensures the stack doesn't proceed if it absolutely needs subnets.
+ print(
+ "Error: No existing public or private subnets could be found in the VPC for automatic selection. "
+ "You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets."
+ )
+ raise RuntimeError("No suitable subnets found for automatic selection.")
+ else:
+ self.public_subnets = selected_public_subnets.subnets
+ self.private_subnets = selected_private_subnets.subnets
+ print(
+ f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC discovery."
+ )
+
+ print("self.public_subnets:", self.public_subnets)
+ print("self.private_subnets:", self.private_subnets)
+ # Since subnets are now assigned, we can exit this processing block.
+ # The rest of the original code (which iterates *_SUBNETS_TO_USE) will be skipped.
+
+ checked_public_subnets_ctx = get_context_dict("checked_public_subnets")
+        checked_private_subnets_ctx = get_context_dict("checked_private_subnets")
+
+ public_subnets_data_for_creation_ctx = get_context_list_of_dicts(
+ "public_subnets_to_create"
+ )
+ private_subnets_data_for_creation_ctx = get_context_list_of_dicts(
+ "private_subnets_to_create"
+ )
+
+ # --- 3. Process Public Subnets ---
+ print("\n--- Processing Public Subnets ---")
+ # Import existing public subnets
+ if checked_public_subnets_ctx:
+ for i, subnet_name in enumerate(PUBLIC_SUBNETS_TO_USE):
+ subnet_info = checked_public_subnets_ctx.get(subnet_name)
+ if subnet_info and subnet_info.get("exists"):
+ subnet_id = subnet_info.get("id")
+ if not subnet_id:
+ raise RuntimeError(
+ f"Context for existing public subnet '{subnet_name}' is missing 'id'."
+ )
+ try:
+ ec2.Subnet.from_subnet_id(
+ self,
+ f"ImportedPublicSubnet{subnet_name.replace('-', '')}{i}",
+ subnet_id,
+ )
+ # self.public_subnets.append(imported_subnet)
+ print(
+ f"Imported existing public subnet: {subnet_name} (ID: {subnet_id})"
+ )
+ except Exception as e:
+ raise RuntimeError(
+ f"Failed to import public subnet '{subnet_name}' with ID '{subnet_id}'. Error: {e}"
+ )
+
+ # Create new public subnets based on public_subnets_data_for_creation_ctx
+ names_to_create_public = []
+ if public_subnets_data_for_creation_ctx:
+ names_to_create_public = [
+ s["name"] for s in public_subnets_data_for_creation_ctx
+ ]
+ cidrs_to_create_public = [
+ s["cidr"] for s in public_subnets_data_for_creation_ctx
+ ]
+ azs_to_create_public = [
+ s["az"] for s in public_subnets_data_for_creation_ctx
+ ]
+
+ if names_to_create_public:
+ print(
+ f"Attempting to create {len(names_to_create_public)} new public subnets: {names_to_create_public}"
+ )
+ newly_created_public_subnets, newly_created_public_rts_cfn = (
+ create_subnets(
+ self,
+ vpc,
+ CDK_PREFIX,
+ names_to_create_public,
+ cidrs_to_create_public,
+ azs_to_create_public,
+ is_public=True,
+ internet_gateway_id=EXISTING_IGW_ID,
+ )
+ )
+ self.public_subnets.extend(newly_created_public_subnets)
+ self.public_route_tables_cfn.extend(newly_created_public_rts_cfn)
+
+ if (
+ not self.public_subnets
+ and not names_to_create_public
+ and not PUBLIC_SUBNETS_TO_USE
+ ):
+ raise Exception("No public subnets found or created, exiting.")
+
+ # --- NAT Gateway Creation/Lookup ---
+ print("Creating NAT gateway/located existing")
+ self.single_nat_gateway_id = None
+
+ nat_gw_id_from_context = SINGLE_NAT_GATEWAY_ID
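+ # If a NAT gateway ID is supplied via SINGLE_NAT_GATEWAY_ID, reuse that gateway instead of creating a new one.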
+
+ if nat_gw_id_from_context:
+ print(
+ f"Using existing NAT Gateway ID from context: {nat_gw_id_from_context}"
+ )
+ self.single_nat_gateway_id = nat_gw_id_from_context
+
+ elif (
+ new_vpc_created
+ and new_vpc_nat_gateways > 0
+ and hasattr(vpc, "nat_gateways")
+ and vpc.nat_gateways
+ ):
+ self.single_nat_gateway_id = vpc.nat_gateways[0].gateway_id
+ print(
+ f"Using NAT Gateway {self.single_nat_gateway_id} created by the new VPC construct."
+ )
+
+ if not self.single_nat_gateway_id:
+ print("Creating a new NAT gateway")
+
+ if hasattr(vpc, "nat_gateways") and vpc.nat_gateways:
+ print("Existing NAT gateway found in vpc")
+ pass
+
+ # If not in context, create a new one, but only if we have a public subnet.
+ elif self.public_subnets:
+ print("NAT Gateway ID not found in context. Creating a new one.")
+ # Place the NAT GW in the first available public subnet
+ first_public_subnet = self.public_subnets[0]
+
+ self.single_nat_gateway_id = create_nat_gateway(
+ self,
+ first_public_subnet,
+ nat_gateway_name=NAT_GATEWAY_NAME,
+ nat_gateway_id_context_key=SINGLE_NAT_GATEWAY_ID,
+ )
+ else:
+ print(
+ "WARNING: No public subnets available and NAT gateway not found in existing VPC. Cannot create a NAT Gateway."
+ )
+
+ # --- 4. Process Private Subnets ---
+ print("\n--- Processing Private Subnets ---")
+ # (Importing of any existing private subnets from checked_private_subnets_ctx follows the
+ # same pattern as the public subnet import above and is unchanged.)
+
+ # Create new private subnets
+ names_to_create_private = []
+ if private_subnets_data_for_creation_ctx:
+ names_to_create_private = [
+ s["name"] for s in private_subnets_data_for_creation_ctx
+ ]
+ cidrs_to_create_private = [
+ s["cidr"] for s in private_subnets_data_for_creation_ctx
+ ]
+ azs_to_create_private = [
+ s["az"] for s in private_subnets_data_for_creation_ctx
+ ]
+
+ if names_to_create_private:
+ print(
+ f"Attempting to create {len(names_to_create_private)} new private subnets: {names_to_create_private}"
+ )
+ # --- CALL THE NEW CREATE_SUBNETS FUNCTION FOR PRIVATE ---
+ # Ensure self.single_nat_gateway_id is available before this call
+ if not self.single_nat_gateway_id:
+ raise ValueError(
+ "A single NAT Gateway ID is required for private subnets but was not resolved."
+ )
+
+ newly_created_private_subnets_cfn, newly_created_private_rts_cfn = (
+ create_subnets(
+ self,
+ vpc,
+ CDK_PREFIX,
+ names_to_create_private,
+ cidrs_to_create_private,
+ azs_to_create_private,
+ is_public=False,
+ single_nat_gateway_id=self.single_nat_gateway_id, # Pass the single NAT Gateway ID
+ )
+ )
+ self.private_subnets.extend(newly_created_private_subnets_cfn)
+ self.private_route_tables_cfn.extend(newly_created_private_rts_cfn)
+ print(
+ f"Successfully defined {len(newly_created_private_subnets_cfn)} new private subnets and their route tables for creation."
+ )
+ else:
+ print(
+ "No private subnets specified for creation in context ('private_subnets_to_create')."
+ )
+
+ # if not self.private_subnets:
+ # raise Exception("No private subnets found or created, exiting.")
+
+ if (
+ not self.private_subnets
+ and not names_to_create_private
+ and not PRIVATE_SUBNETS_TO_USE
+ ):
+ # This condition might need adjustment for new VPCs.
+ raise Exception("No private subnets found or created, exiting.")
+
+ # --- 5. Sanity Check and Output ---
+ # Output the single NAT Gateway ID for verification
+ if self.single_nat_gateway_id:
+ CfnOutput(
+ self,
+ "SingleNatGatewayId",
+ value=self.single_nat_gateway_id,
+ description="ID of the single NAT Gateway resolved or created.",
+ )
+ elif (
+ NEW_VPC_DEFAULT_NAME
+ and (self.node.try_get_context("new_vpc_nat_gateways") or 1) > 0
+ ):
+ print(
+ "INFO: A new VPC was created with NAT Gateway(s). Their routing is handled by the VPC construct. No single_nat_gateway_id was explicitly set for separate output."
+ )
+ else:
+ out_message = "WARNING: No single NAT Gateway was resolved or created explicitly by the script's logic after VPC setup."
+ print(out_message)
+ raise Exception(out_message)
+
+ # --- Outputs for other stacks/regions ---
+ # These are crucial for cross-stack, cross-region referencing
+
+ self.params = dict()
+ self.params["vpc_id"] = vpc.vpc_id
+ self.params["private_subnets"] = self.private_subnets
+ self.params["private_route_tables"] = self.private_route_tables_cfn
+ self.params["public_subnets"] = self.public_subnets
+ self.params["public_route_tables"] = self.public_route_tables_cfn
+
+ private_subnet_selection = ec2.SubnetSelection(subnets=self.private_subnets)
+ public_subnet_selection = ec2.SubnetSelection(subnets=self.public_subnets)
+
+ for sub in private_subnet_selection.subnets:
+ print(
+ "private subnet:",
+ sub.subnet_id,
+ "is in availability zone:",
+ sub.availability_zone,
+ )
+
+ for sub in public_subnet_selection.subnets:
+ print(
+ "public subnet:",
+ sub.subnet_id,
+ "is in availability zone:",
+ sub.availability_zone,
+ )
+
+ print("Private subnet route tables:", self.private_route_tables_cfn)
+
+ # Add the S3 Gateway Endpoint to the VPC
+ s3_gateway_endpoint = None
+ if names_to_create_private:
+ try:
+ s3_gateway_endpoint = vpc.add_gateway_endpoint(
+ "S3GatewayEndpoint",
+ service=ec2.GatewayVpcEndpointAwsService.S3,
+ subnets=[private_subnet_selection],
+ )
+ except Exception as e:
+ print("Could not add S3 gateway endpoint to subnets due to:", e)
+
+ # Output some useful information
+ CfnOutput(
+ self,
+ "VpcIdOutput",
+ value=vpc.vpc_id,
+ description="The ID of the VPC where the S3 Gateway Endpoint is deployed.",
+ )
+ if s3_gateway_endpoint:
+ CfnOutput(
+ self,
+ "S3GatewayEndpointService",
+ value=s3_gateway_endpoint.vpc_endpoint_id,
+ description="The ID of the S3 Gateway Endpoint.",
+ )
+
+ # --- IAM Roles ---
+ if USE_CUSTOM_KMS_KEY == "1":
+ kms_key = kms.Key(
+ self,
+ "RedactionSharedKmsKey",
+ alias=CUSTOM_KMS_KEY_NAME,
+ removal_policy=RemovalPolicy.DESTROY,
+ )
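+ # The key is reused below to encrypt the log/config and output S3 buckets and the Cognito client secret.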
+
+ custom_sts_kms_policy_dict = {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Sid": "STSCallerIdentity",
+ "Effect": "Allow",
+ "Action": ["sts:GetCallerIdentity"],
+ "Resource": "*",
+ },
+ {
+ "Sid": "KMSAccess",
+ "Effect": "Allow",
+ "Action": ["kms:Encrypt", "kms:Decrypt", "kms:GenerateDataKey"],
+ "Resource": kms_key.key_arn, # Use key_arn, as it's the full ARN, safer than key_id
+ },
+ ],
+ }
+ else:
+ kms_key = None
+
+ custom_sts_kms_policy_dict = {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Sid": "STSCallerIdentity",
+ "Effect": "Allow",
+ "Action": ["sts:GetCallerIdentity"],
+ "Resource": "*",
+ },
+ {
+ "Sid": "KMSSecretsManagerDecrypt", # Explicitly add decrypt for default key
+ "Effect": "Allow",
+ "Action": ["kms:Decrypt"],
+ "Resource": f"arn:aws:kms:{AWS_REGION}:{AWS_ACCOUNT_ID}:key/aws/secretsmanager",
+ },
+ ],
+ }
+ custom_sts_kms_policy = json.dumps(custom_sts_kms_policy_dict, indent=4)
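+ # This policy text is attached below (via add_custom_policies) to the ECS task and execution
+ # roles so the container can call sts:GetCallerIdentity and decrypt its secrets.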
+
+ try:
+ codebuild_role_name = CODEBUILD_ROLE_NAME
+
+ if get_context_bool(f"exists:{codebuild_role_name}"):
+ # If exists, lookup/import the role using ARN from context
+ role_arn = get_context_str(f"arn:{codebuild_role_name}")
+ if not role_arn:
+ raise ValueError(
+ f"Context value 'arn:{codebuild_role_name}' is required if role exists."
+ )
+ codebuild_role = iam.Role.from_role_arn(
+ self, "CodeBuildRole", role_arn=role_arn
+ )
+ print("Using existing CodeBuild role")
+ else:
+ # If not exists, create the role
+ codebuild_role = iam.Role(
+ self,
+ "CodeBuildRole", # Logical ID
+ role_name=codebuild_role_name, # Explicit resource name
+ assumed_by=iam.ServicePrincipal("codebuild.amazonaws.com"),
+ )
+ codebuild_role.add_managed_policy(
+ iam.ManagedPolicy.from_aws_managed_policy_name(
+ "EC2InstanceProfileForImageBuilderECRContainerBuilds"
+ )
+ )
+ print("Successfully created new CodeBuild role")
+
+ task_role_name = ECS_TASK_ROLE_NAME
+ if get_context_bool(f"exists:{task_role_name}"):
+ role_arn = get_context_str(f"arn:{task_role_name}")
+ if not role_arn:
+ raise ValueError(
+ f"Context value 'arn:{task_role_name}' is required if role exists."
+ )
+ task_role = iam.Role.from_role_arn(self, "TaskRole", role_arn=role_arn)
+ print("Using existing ECS task role")
+ else:
+ task_role = iam.Role(
+ self,
+ "TaskRole", # Logical ID
+ role_name=task_role_name, # Explicit resource name
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
+ )
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
+ print(f"Adding {role} to policy")
+ task_role.add_managed_policy(
+ iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}")
+ )
+ task_role = add_custom_policies(
+ self, task_role, custom_policy_text=custom_sts_kms_policy
+ )
+ print("Successfully created new ECS task role")
+
+ execution_role_name = ECS_TASK_EXECUTION_ROLE_NAME
+ if get_context_bool(f"exists:{execution_role_name}"):
+ role_arn = get_context_str(f"arn:{execution_role_name}")
+ if not role_arn:
+ raise ValueError(
+ f"Context value 'arn:{execution_role_name}' is required if role exists."
+ )
+ execution_role = iam.Role.from_role_arn(
+ self, "ExecutionRole", role_arn=role_arn
+ )
+ print("Using existing ECS execution role")
+ else:
+ execution_role = iam.Role(
+ self,
+ "ExecutionRole", # Logical ID
+ role_name=execution_role_name, # Explicit resource name
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
+ )
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
+ execution_role.add_managed_policy(
+ iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}")
+ )
+ execution_role = add_custom_policies(
+ self, execution_role, custom_policy_text=custom_sts_kms_policy
+ )
+ print("Successfully created new ECS execution role")
+
+ except Exception as e:
+ raise Exception("Failed at IAM role step due to:", e)
+
+ # --- S3 Buckets ---
+ try:
+ log_bucket_name = S3_LOG_CONFIG_BUCKET_NAME
+ if get_context_bool(f"exists:{log_bucket_name}"):
+ bucket = s3.Bucket.from_bucket_name(
+ self, "LogConfigBucket", bucket_name=log_bucket_name
+ )
+ print("Using existing S3 bucket", log_bucket_name)
+ else:
+ if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
+ bucket = s3.Bucket(
+ self,
+ "LogConfigBucket",
+ bucket_name=log_bucket_name,
+ versioned=False,
+ removal_policy=RemovalPolicy.DESTROY,
+ auto_delete_objects=True,
+ encryption=s3.BucketEncryption.KMS,
+ encryption_key=kms_key,
+ )
+ else:
+ bucket = s3.Bucket(
+ self,
+ "LogConfigBucket",
+ bucket_name=log_bucket_name,
+ versioned=False,
+ removal_policy=RemovalPolicy.DESTROY,
+ auto_delete_objects=True,
+ )
+
+ print("Created S3 bucket", log_bucket_name)
+
+ # Add policies - this will apply to both created and imported buckets
+ # CDK handles idempotent policy additions
+ bucket.add_to_resource_policy(
+ iam.PolicyStatement(
+ effect=iam.Effect.ALLOW,
+ principals=[task_role], # Pass the role object directly
+ actions=["s3:GetObject", "s3:PutObject"],
+ resources=[f"{bucket.bucket_arn}/*"],
+ )
+ )
+ bucket.add_to_resource_policy(
+ iam.PolicyStatement(
+ effect=iam.Effect.ALLOW,
+ principals=[task_role],
+ actions=["s3:ListBucket"],
+ resources=[bucket.bucket_arn],
+ )
+ )
+
+ output_bucket_name = S3_OUTPUT_BUCKET_NAME
+ if get_context_bool(f"exists:{output_bucket_name}"):
+ output_bucket = s3.Bucket.from_bucket_name(
+ self, "OutputBucket", bucket_name=output_bucket_name
+ )
+ print("Using existing Output bucket", output_bucket_name)
+ else:
+ if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
+ output_bucket = s3.Bucket(
+ self,
+ "OutputBucket",
+ bucket_name=output_bucket_name,
+ lifecycle_rules=[
+ s3.LifecycleRule(
+ expiration=Duration.days(
+ int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
+ )
+ )
+ ],
+ versioned=False,
+ removal_policy=RemovalPolicy.DESTROY,
+ auto_delete_objects=True,
+ encryption=s3.BucketEncryption.KMS,
+ encryption_key=kms_key,
+ )
+ else:
+ output_bucket = s3.Bucket(
+ self,
+ "OutputBucket",
+ bucket_name=output_bucket_name,
+ lifecycle_rules=[
+ s3.LifecycleRule(
+ expiration=Duration.days(
+ int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
+ )
+ )
+ ],
+ versioned=False,
+ removal_policy=RemovalPolicy.DESTROY,
+ auto_delete_objects=True,
+ )
+
+ print("Created Output bucket:", output_bucket_name)
+
+ # Add policies to output bucket
+ output_bucket.add_to_resource_policy(
+ iam.PolicyStatement(
+ effect=iam.Effect.ALLOW,
+ principals=[task_role],
+ actions=["s3:GetObject", "s3:PutObject"],
+ resources=[f"{output_bucket.bucket_arn}/*"],
+ )
+ )
+ output_bucket.add_to_resource_policy(
+ iam.PolicyStatement(
+ effect=iam.Effect.ALLOW,
+ principals=[task_role],
+ actions=["s3:ListBucket"],
+ resources=[output_bucket.bucket_arn],
+ )
+ )
+
+ except Exception as e:
+ raise Exception("Could not handle S3 buckets due to:", e)
+
+ # --- Elastic Container Registry ---
+ try:
+ full_ecr_repo_name = ECR_CDK_REPO_NAME
+ if get_context_bool(f"exists:{full_ecr_repo_name}"):
+ ecr_repo = ecr.Repository.from_repository_name(
+ self, "ECRRepo", repository_name=full_ecr_repo_name
+ )
+ print("Using existing ECR repository")
+ else:
+ ecr_repo = ecr.Repository(
+ self, "ECRRepo", repository_name=full_ecr_repo_name
+ ) # Explicitly set repository_name
+ print("Created ECR repository", full_ecr_repo_name)
+
+ ecr_image_loc = ecr_repo.repository_uri
+ except Exception as e:
+ raise Exception("Could not handle ECR repo due to:", e)
+
+ # --- CODEBUILD ---
+ try:
+ codebuild_project_name = CODEBUILD_PROJECT_NAME
+ if get_context_bool(f"exists:{codebuild_project_name}"):
+ # Lookup CodeBuild project by ARN from context
+ project_arn = get_context_str(f"arn:{codebuild_project_name}")
+ if not project_arn:
+ raise ValueError(
+ f"Context value 'arn:{codebuild_project_name}' is required if project exists."
+ )
+ codebuild_project = codebuild.Project.from_project_arn(
+ self, "CodeBuildProject", project_arn=project_arn
+ )
+ print("Using existing CodeBuild project")
+ else:
+ codebuild_project = codebuild.Project(
+ self,
+ "CodeBuildProject", # Logical ID
+ project_name=codebuild_project_name, # Explicit resource name
+ source=codebuild.Source.git_hub(
+ owner=GITHUB_REPO_USERNAME,
+ repo=GITHUB_REPO_NAME,
+ branch_or_ref=GITHUB_REPO_BRANCH,
+ ),
+ environment=codebuild.BuildEnvironment(
+ build_image=codebuild.LinuxBuildImage.STANDARD_7_0,
+ privileged=True,
+ environment_variables={
+ "ECR_REPO_NAME": codebuild.BuildEnvironmentVariable(
+ value=full_ecr_repo_name
+ ),
+ "AWS_DEFAULT_REGION": codebuild.BuildEnvironmentVariable(
+ value=AWS_REGION
+ ),
+ "AWS_ACCOUNT_ID": codebuild.BuildEnvironmentVariable(
+ value=AWS_ACCOUNT_ID
+ ),
+ "APP_MODE": codebuild.BuildEnvironmentVariable(
+ value="gradio"
+ ),
+ },
+ ),
+ build_spec=codebuild.BuildSpec.from_object(
+ {
+ "version": "0.2",
+ "phases": {
+ "pre_build": {
+ "commands": [
+ "echo Logging in to Amazon ECR",
+ "aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com",
+ ]
+ },
+ "build": {
+ "commands": [
+ "echo Building the Docker image",
+ "docker build --build-args APP_MODE=$APP_MODE --target $APP_MODE -t $ECR_REPO_NAME:latest .",
+ "docker tag $ECR_REPO_NAME:latest $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest",
+ ]
+ },
+ "post_build": {
+ "commands": [
+ "echo Pushing the Docker image",
+ "docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest",
+ ]
+ },
+ },
+ }
+ ),
+ )
+ print("Successfully created CodeBuild project", codebuild_project_name)
+
+ # Grant permissions - applies to both created and imported project role
+ ecr_repo.grant_pull_push(codebuild_project.role)
+
+ except Exception as e:
+ raise Exception("Could not handle Codebuild project due to:", e)
+
+ # --- Security Groups ---
+ try:
+ ecs_security_group_name = ECS_SECURITY_GROUP_NAME
+
+ try:
+ ecs_security_group = ec2.SecurityGroup(
+ self,
+ "ECSSecurityGroup", # Logical ID
+ security_group_name=ecs_security_group_name, # Explicit resource name
+ vpc=vpc,
+ )
+ print(f"Created Security Group: {ecs_security_group_name}")
+ except Exception as e: # Creation failed (for example, a group with this name already exists)
+ print("Failed to create ECS security group due to:", e)
+
+ alb_security_group_name = ALB_NAME_SECURITY_GROUP_NAME
+
+ try:
+ alb_security_group = ec2.SecurityGroup(
+ self,
+ "ALBSecurityGroup", # Logical ID
+ security_group_name=alb_security_group_name, # Explicit resource name
+ vpc=vpc,
+ )
+ print(f"Created Security Group: {alb_security_group_name}")
+ except Exception as e: # Creation failed (for example, a group with this name already exists)
+ print("Failed to create ALB security group due to:", e)
+
+ # Define Ingress Rules - CDK will manage adding/removing these as needed
+ ec2_port_gradio_server_port = ec2.Port.tcp(
+ int(GRADIO_SERVER_PORT)
+ ) # Ensure port is int
+ ecs_security_group.add_ingress_rule(
+ peer=alb_security_group,
+ connection=ec2_port_gradio_server_port,
+ description="ALB traffic",
+ )
+
+ alb_security_group.add_ingress_rule(
+ peer=ec2.Peer.prefix_list("pl-93a247fa"),
+ connection=ec2.Port.all_traffic(),
+ description="CloudFront traffic",
+ )
+
+ except Exception as e:
+ raise Exception("Could not handle security groups due to:", e)
+
+ # --- DynamoDB tables for logs (optional) ---
+
+ if SAVE_LOGS_TO_DYNAMODB == "True":
+ try:
+ print("Creating DynamoDB tables for logs")
+
+ dynamodb.Table(
+ self,
+ "RedactionAccessDataTable",
+ table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME,
+ partition_key=dynamodb.Attribute(
+ name="id", type=dynamodb.AttributeType.STRING
+ ),
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
+ removal_policy=RemovalPolicy.DESTROY,
+ )
+
+ dynamodb.Table(
+ self,
+ "RedactionFeedbackDataTable",
+ table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+ partition_key=dynamodb.Attribute(
+ name="id", type=dynamodb.AttributeType.STRING
+ ),
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
+ removal_policy=RemovalPolicy.DESTROY,
+ )
+
+ dynamodb.Table(
+ self,
+ "RedactionUsageDataTable",
+ table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ partition_key=dynamodb.Attribute(
+ name="id", type=dynamodb.AttributeType.STRING
+ ),
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
+ removal_policy=RemovalPolicy.DESTROY,
+ )
+
+ except Exception as e:
+ raise Exception("Could not create DynamoDB tables due to:", e)
+
+ # --- ALB ---
+ try:
+ load_balancer_name = ALB_NAME
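+ # ALB names are limited to 32 characters, so keep only the trailing 32 characters if the configured name is longer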
+ if len(load_balancer_name) > 32:
+ load_balancer_name = load_balancer_name[-32:]
+ if get_context_bool(f"exists:{load_balancer_name}"):
+ # Lookup ALB by ARN from context
+ alb_arn = get_context_str(f"arn:{load_balancer_name}")
+ if not alb_arn:
+ raise ValueError(
+ f"Context value 'arn:{load_balancer_name}' is required if ALB exists."
+ )
+ alb = elbv2.ApplicationLoadBalancer.from_lookup(
+ self, "ALB", load_balancer_arn=alb_arn # Logical ID
+ )
+ print(f"Using existing Application Load Balancer {load_balancer_name}.")
+ else:
+ alb = elbv2.ApplicationLoadBalancer(
+ self,
+ "ALB", # Logical ID
+ load_balancer_name=load_balancer_name, # Explicit resource name
+ vpc=vpc,
+ internet_facing=True,
+ security_group=alb_security_group, # Link to SG
+ vpc_subnets=public_subnet_selection, # Link to subnets
+ )
+ print("Successfully created new Application Load Balancer")
+ except Exception as e:
+ raise Exception("Could not handle application load balancer due to:", e)
+
+ # --- Cognito User Pool ---
+ try:
+ if get_context_bool(f"exists:{COGNITO_USER_POOL_NAME}"):
+ # Lookup by ID from context
+ user_pool_id = get_context_str(f"id:{COGNITO_USER_POOL_NAME}")
+ if not user_pool_id:
+ raise ValueError(
+ f"Context value 'id:{COGNITO_USER_POOL_NAME}' is required if User Pool exists."
+ )
+ user_pool = cognito.UserPool.from_user_pool_id(
+ self, "UserPool", user_pool_id=user_pool_id
+ )
+ print(f"Using existing user pool {user_pool_id}.")
+ else:
+ user_pool = cognito.UserPool(
+ self,
+ "UserPool",
+ user_pool_name=COGNITO_USER_POOL_NAME,
+ mfa=cognito.Mfa.OFF, # Adjust as needed
+ sign_in_aliases=cognito.SignInAliases(email=True),
+ removal_policy=RemovalPolicy.DESTROY,
+ ) # Adjust as needed
+ print(f"Created new user pool {user_pool.user_pool_id}.")
+
+ # If an ACM certificate is supplied, the ALB's built-in Cognito authentication is assumed,
+ # which requires the /oauth2/idpresponse callback URL in addition to the base redirect URL.
+ if ACM_SSL_CERTIFICATE_ARN:
+ redirect_uris = [
+ COGNITO_REDIRECTION_URL,
+ COGNITO_REDIRECTION_URL + "/oauth2/idpresponse",
+ ]
+ else:
+ redirect_uris = [COGNITO_REDIRECTION_URL]
+
+ user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
+ if get_context_bool(f"exists:{user_pool_client_name}"):
+ # Lookup by ID from context (requires User Pool object)
+ user_pool_client_id = get_context_str(f"id:{user_pool_client_name}")
+ if not user_pool_client_id:
+ raise ValueError(
+ f"Context value 'id:{user_pool_client_name}' is required if User Pool Client exists."
+ )
+ user_pool_client = cognito.UserPoolClient.from_user_pool_client_id(
+ self, "UserPoolClient", user_pool_client_id=user_pool_client_id
+ )
+ print(f"Using existing user pool client {user_pool_client_id}.")
+ else:
+ user_pool_client = cognito.UserPoolClient(
+ self,
+ "UserPoolClient",
+ auth_flows=cognito.AuthFlow(
+ user_srp=True, user_password=True
+ ), # Example: enable SRP for secure sign-in
+ user_pool=user_pool,
+ generate_secret=True,
+ user_pool_client_name=user_pool_client_name,
+ supported_identity_providers=[
+ cognito.UserPoolClientIdentityProvider.COGNITO
+ ],
+ o_auth=cognito.OAuthSettings(
+ flows=cognito.OAuthFlows(authorization_code_grant=True),
+ scopes=[
+ cognito.OAuthScope.OPENID,
+ cognito.OAuthScope.EMAIL,
+ cognito.OAuthScope.PROFILE,
+ ],
+ callback_urls=redirect_uris,
+ ),
+ refresh_token_validity=Duration.minutes(
+ COGNITO_REFRESH_TOKEN_VALIDITY
+ ),
+ id_token_validity=Duration.minutes(COGNITO_ID_TOKEN_VALIDITY),
+ access_token_validity=Duration.minutes(
+ COGNITO_ACCESS_TOKEN_VALIDITY
+ ),
+ )
+
+ CfnOutput(
+ self, "CognitoAppClientId", value=user_pool_client.user_pool_client_id
+ )
+
+ print(
+ f"Created new user pool client {user_pool_client.user_pool_client_id}."
+ )
+
+ # Add a domain to the User Pool (crucial for ALB integration)
+ user_pool_domain = user_pool.add_domain(
+ "UserPoolDomain",
+ cognito_domain=cognito.CognitoDomainOptions(
+ domain_prefix=COGNITO_USER_POOL_DOMAIN_PREFIX
+ ),
+ )
+
+ # Apply removal_policy to the created UserPoolDomain construct
+ user_pool_domain.apply_removal_policy(policy=RemovalPolicy.DESTROY)
+
+ CfnOutput(
+ self, "CognitoUserPoolLoginUrl", value=user_pool_domain.base_url()
+ )
+
+ except Exception as e:
+ raise Exception("Could not handle Cognito resources due to:", e)
+
+ # --- Secrets Manager Secret ---
+ try:
+ secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
+ if get_context_bool(f"exists:{secret_name}"):
+ # Lookup by name
+ secret = secretsmanager.Secret.from_secret_name_v2(
+ self, "CognitoSecret", secret_name=secret_name
+ )
+ print("Using existing Secret.")
+ else:
+ if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
+ secret = secretsmanager.Secret(
+ self,
+ "CognitoSecret", # Logical ID
+ secret_name=secret_name, # Explicit resource name
+ secret_object_value={
+ "REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(
+ user_pool.user_pool_id
+ ), # Use the CDK attribute
+ "REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(
+ user_pool_client.user_pool_client_id
+ ), # Use the CDK attribute
+ "REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret, # Use the CDK attribute
+ },
+ encryption_key=kms_key,
+ )
+ else:
+ secret = secretsmanager.Secret(
+ self,
+ "CognitoSecret", # Logical ID
+ secret_name=secret_name, # Explicit resource name
+ secret_object_value={
+ "REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(
+ user_pool.user_pool_id
+ ), # Use the CDK attribute
+ "REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(
+ user_pool_client.user_pool_client_id
+ ), # Use the CDK attribute
+ "REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret, # Use the CDK attribute
+ },
+ )
+
+ print(
+ "Created new secret in Secrets Manager for Cognito user pool and related details."
+ )
+
+ except Exception as e:
+ raise Exception("Could not handle Secrets Manager secret due to:", e)
+
+ # --- Fargate Task Definition ---
+ try:
+ fargate_task_definition_name = FARGATE_TASK_DEFINITION_NAME
+
+ read_only_file_system = ECS_READ_ONLY_FILE_SYSTEM == "True"
+
+ if os.path.exists(TASK_DEFINITION_FILE_LOCATION):
+ with open(TASK_DEFINITION_FILE_LOCATION) as f: # Use correct path
+ task_def_params = json.load(f)
+ # Need to ensure taskRoleArn and executionRoleArn in JSON are correct ARN strings
+ else:
+ epheremal_storage_volume_name = "appEphemeralVolume"
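+ # Every writable path in the container is mapped onto this single ephemeral volume
+ # (see the mount points below) so the root filesystem can remain read-only.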
+
+ task_def_params = {}
+ task_def_params["taskRoleArn"] = (
+ task_role.role_arn
+ ) # Use CDK role object ARN
+ task_def_params["executionRoleArn"] = (
+ execution_role.role_arn
+ ) # Use CDK role object ARN
+ task_def_params["memory"] = ECS_TASK_MEMORY_SIZE
+ task_def_params["cpu"] = ECS_TASK_CPU_SIZE
+ container_def = {
+ "name": full_ecr_repo_name,
+ "image": ecr_image_loc + ":latest",
+ "essential": True,
+ "portMappings": [
+ {
+ "containerPort": int(GRADIO_SERVER_PORT),
+ "hostPort": int(GRADIO_SERVER_PORT),
+ "protocol": "tcp",
+ "appProtocol": "http",
+ }
+ ],
+ "logConfiguration": {
+ "logDriver": "awslogs",
+ "options": {
+ "awslogs-group": ECS_LOG_GROUP_NAME,
+ "awslogs-region": AWS_REGION,
+ "awslogs-stream-prefix": "ecs",
+ },
+ },
+ "environmentFiles": [
+ {"value": bucket.bucket_arn + "/config.env", "type": "s3"}
+ ],
+ "memoryReservation": int(task_def_params["memory"])
+ - 512, # Reserve some memory for the container
+ "mountPoints": [
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/home/user/app/logs",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/home/user/app/feedback",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/home/user/app/usage",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/home/user/app/input",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/home/user/app/output",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/home/user/app/tmp",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/home/user/app/config",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/tmp/matplotlib_cache",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/tmp",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/var/tmp",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/tmp/tld",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/tmp/gradio_tmp",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/home/user/.paddlex",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/home/user/.local/share/spacy/data",
+ "readOnly": False,
+ },
+ {
+ "sourceVolume": epheremal_storage_volume_name,
+ "containerPath": "/usr/share/tessdata",
+ "readOnly": False,
+ },
+ ],
+ "readonlyRootFilesystem": read_only_file_system,
+ }
+ task_def_params["containerDefinitions"] = [container_def]
+
+ log_group_name_from_config = task_def_params["containerDefinitions"][0][
+ "logConfiguration"
+ ]["options"]["awslogs-group"]
+
+ cdk_managed_log_group = logs.LogGroup(
+ self,
+ "MyTaskLogGroup", # CDK Logical ID
+ log_group_name=log_group_name_from_config,
+ retention=logs.RetentionDays.ONE_MONTH,
+ removal_policy=RemovalPolicy.DESTROY,
+ )
+
+ epheremal_storage_volume_cdk_obj = ecs.Volume(
+ name=epheremal_storage_volume_name
+ )
+
+ fargate_task_definition = ecs.FargateTaskDefinition(
+ self,
+ "FargateTaskDefinition", # Logical ID
+ family=fargate_task_definition_name,
+ cpu=int(task_def_params["cpu"]),
+ memory_limit_mib=int(task_def_params["memory"]),
+ task_role=task_role,
+ execution_role=execution_role,
+ runtime_platform=ecs.RuntimePlatform(
+ cpu_architecture=ecs.CpuArchitecture.X86_64,
+ operating_system_family=ecs.OperatingSystemFamily.LINUX,
+ ),
+ ephemeral_storage_gib=21, # Minimum is 21 GiB
+ volumes=[epheremal_storage_volume_cdk_obj],
+ )
+ print("Fargate task definition defined.")
+
+ # Add container definitions to the task definition object
+ if task_def_params["containerDefinitions"]:
+ container_def_params = task_def_params["containerDefinitions"][0]
+
+ if container_def_params.get("environmentFiles"):
+ env_files = []
+ for env_file_param in container_def_params["environmentFiles"]:
+ # Need to parse the ARN to get the bucket object and key
+ env_file_arn_parts = env_file_param["value"].split(":::")
+ bucket_name_and_key = env_file_arn_parts[-1]
+ env_bucket_name, env_key = bucket_name_and_key.split("/", 1)
+
+ env_file = ecs.EnvironmentFile.from_bucket(bucket, env_key)
+
+ env_files.append(env_file)
+
+ container = fargate_task_definition.add_container(
+ container_def_params["name"],
+ image=ecs.ContainerImage.from_registry(
+ container_def_params["image"]
+ ),
+ logging=ecs.LogDriver.aws_logs(
+ stream_prefix=container_def_params["logConfiguration"][
+ "options"
+ ]["awslogs-stream-prefix"],
+ log_group=cdk_managed_log_group,
+ ),
+ secrets={
+ "AWS_USER_POOL_ID": ecs.Secret.from_secrets_manager(
+ secret, "REDACTION_USER_POOL_ID"
+ ),
+ "AWS_CLIENT_ID": ecs.Secret.from_secrets_manager(
+ secret, "REDACTION_CLIENT_ID"
+ ),
+ "AWS_CLIENT_SECRET": ecs.Secret.from_secrets_manager(
+ secret, "REDACTION_CLIENT_SECRET"
+ ),
+ },
+ environment_files=env_files,
+ readonly_root_filesystem=read_only_file_system,
+ )
+
+ for port_mapping in container_def_params["portMappings"]:
+ container.add_port_mappings(
+ ecs.PortMapping(
+ container_port=int(port_mapping["containerPort"]),
+ host_port=int(port_mapping["hostPort"]),
+ name="port-" + str(port_mapping["containerPort"]),
+ app_protocol=ecs.AppProtocol.http,
+ protocol=ecs.Protocol.TCP,
+ )
+ )
+
+ container.add_port_mappings(
+ ecs.PortMapping(
+ container_port=80,
+ host_port=80,
+ name="port-80",
+ app_protocol=ecs.AppProtocol.http,
+ protocol=ecs.Protocol.TCP,
+ )
+ )
+
+ if container_def_params.get("mountPoints"):
+ mount_points = []
+ for mount_point in container_def_params["mountPoints"]:
+ mount_points.append(
+ ecs.MountPoint(
+ container_path=mount_point["containerPath"],
+ read_only=mount_point["readOnly"],
+ source_volume=epheremal_storage_volume_name,
+ )
+ )
+ container.add_mount_points(*mount_points)
+
+ except Exception as e:
+ raise Exception("Could not handle Fargate task definition due to:", e)
+
+ # --- ECS Cluster ---
+ try:
+ cluster = ecs.Cluster(
+ self,
+ "ECSCluster", # Logical ID
+ cluster_name=CLUSTER_NAME, # Explicit resource name
+ enable_fargate_capacity_providers=True,
+ vpc=vpc,
+ )
+ print("Successfully created new ECS cluster")
+ except Exception as e:
+ raise Exception("Could not handle ECS cluster due to:", e)
+
+ # --- ECS Service ---
+ try:
+ ecs_service_name = ECS_SERVICE_NAME
+
+ if ECS_USE_FARGATE_SPOT == "True":
+ use_fargate_spot = "FARGATE_SPOT"
+ else:
+ use_fargate_spot = "FARGATE"
+
+ # Check if service exists - from_service_arn or from_service_name (needs cluster)
+ try:
+ # from_service_name is useful if you have the cluster object
+ ecs_service = ecs.FargateService.from_service_attributes(
+ self,
+ "ECSService", # Logical ID
+ cluster=cluster, # Requires the cluster object
+ service_name=ecs_service_name,
+ )
+ print(f"Using existing ECS service {ecs_service_name}.")
+ except Exception:
+ # Service will be created with a count of 0, because you haven't yet actually built the initial Docker container with CodeBuild
+ ecs_service = ecs.FargateService(
+ self,
+ "ECSService", # Logical ID
+ service_name=ecs_service_name, # Explicit resource name
+ platform_version=ecs.FargatePlatformVersion.LATEST,
+ capacity_provider_strategies=[
+ ecs.CapacityProviderStrategy(
+ capacity_provider=use_fargate_spot, base=0, weight=1
+ )
+ ],
+ cluster=cluster,
+ task_definition=fargate_task_definition, # Link to TD
+ security_groups=[ecs_security_group], # Link to SG
+ vpc_subnets=ec2.SubnetSelection(
+ subnets=self.private_subnets
+ ), # Link to subnets
+ min_healthy_percent=0,
+ max_healthy_percent=100,
+ desired_count=0,
+ )
+ print("Successfully created new ECS service")
+
+ # Note: Auto-scaling setup would typically go here if needed for the service
+
+ except Exception as e:
+ raise Exception("Could not handle ECS service due to:", e)
+
+ # --- Grant Secret Read Access (Applies to both created and imported roles) ---
+ try:
+ secret.grant_read(task_role)
+ secret.grant_read(execution_role)
+ except Exception as e:
+ raise Exception("Could not grant access to Secrets Manager due to:", e)
+
+ # --- ALB TARGET GROUPS AND LISTENERS ---
+ # This section should primarily define the resources if they are managed by this stack.
+ # CDK handles adding/removing targets and actions on updates.
+ # If they might pre-exist outside the stack, you need lookups.
+ cookie_duration = Duration.hours(12)
+ target_group_name = ALB_TARGET_GROUP_NAME # Explicit resource name
+ cloudfront_distribution_url = "cloudfront_placeholder.net" # Need to replace this afterwards with the actual cloudfront_distribution.domain_name
+
+ try:
+ # --- CREATING TARGET GROUPS AND ADDING THE CLOUDFRONT LISTENER RULE ---
+
+ target_group = elbv2.ApplicationTargetGroup(
+ self,
+ "AppTargetGroup", # Logical ID
+ target_group_name=target_group_name, # Explicit resource name
+ port=int(GRADIO_SERVER_PORT), # Ensure port is int
+ protocol=elbv2.ApplicationProtocol.HTTP,
+ targets=[ecs_service], # Link to ECS Service
+ stickiness_cookie_duration=cookie_duration,
+ vpc=vpc, # Target Groups need VPC
+ )
+ print(f"ALB target group {target_group_name} defined.")
+
+ # First HTTP
+ listener_port = 80
+ # Check if Listener exists - from_listener_arn or lookup by port/ALB
+
+ http_listener = alb.add_listener(
+ "HttpListener", # Logical ID
+ port=listener_port,
+ open=False, # Be cautious with open=True, usually restrict source SG
+ )
+ print(f"ALB listener on port {listener_port} defined.")
+
+ if ACM_SSL_CERTIFICATE_ARN:
+ http_listener.add_action(
+ "DefaultAction", # Logical ID for the default action
+ action=elbv2.ListenerAction.redirect(
+ protocol="HTTPS",
+ host="#{host}",
+ port="443",
+ path="/#{path}",
+ query="#{query}",
+ ),
+ )
+ else:
+ if USE_CLOUDFRONT == "True":
+
+ # Default action: return 403 for requests that do not match the CloudFront host header rule added below.
+ http_listener.add_action(
+ "DefaultAction", # Logical ID for the default action
+ action=elbv2.ListenerAction.fixed_response(
+ status_code=403,
+ content_type="text/plain",
+ message_body="Access denied",
+ ),
+ )
+
+ # Add the Listener Rule for the specific CloudFront Host Header
+ http_listener.add_action(
+ "CloudFrontHostHeaderRule",
+ action=elbv2.ListenerAction.forward(
+ target_groups=[target_group],
+ stickiness_duration=cookie_duration,
+ ),
+ priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
+ conditions=[
+ elbv2.ListenerCondition.host_headers(
+ [cloudfront_distribution_url]
+ ) # May have to redefine url in console afterwards if not specified in config file
+ ],
+ )
+
+ else:
+ # CloudFront not in use: forward all HTTP traffic directly to the target group
+ http_listener.add_action(
+ "CloudFrontHostHeaderRule",
+ action=elbv2.ListenerAction.forward(
+ target_groups=[target_group],
+ stickiness_duration=cookie_duration,
+ ),
+ )
+
+ print("Added targets and actions to ALB HTTP listener.")
+
+ # Now the same for HTTPS if you have an ACM certificate
+ if ACM_SSL_CERTIFICATE_ARN:
+ listener_port_https = 443
+ # Check if Listener exists - from_listener_arn or lookup by port/ALB
+
+ https_listener = add_alb_https_listener_with_cert(
+ self,
+ "MyHttpsListener", # Logical ID for the HTTPS listener
+ alb,
+ acm_certificate_arn=ACM_SSL_CERTIFICATE_ARN,
+ default_target_group=target_group,
+ enable_cognito_auth=True,
+ cognito_user_pool=user_pool,
+ cognito_user_pool_client=user_pool_client,
+ cognito_user_pool_domain=user_pool_domain,
+ listener_open_to_internet=True,
+ stickiness_cookie_duration=cookie_duration,
+ )
+
+ if https_listener:
+ CfnOutput(
+ self, "HttpsListenerArn", value=https_listener.listener_arn
+ )
+
+ print(f"ALB listener on port {listener_port_https} defined.")
+
+ # if USE_CLOUDFRONT == 'True':
+ # # Add default action to the listener
+ # https_listener.add_action(
+ # "DefaultAction", # Logical ID for the default action
+ # action=elbv2.ListenerAction.fixed_response(
+ # status_code=403,
+ # content_type="text/plain",
+ # message_body="Access denied",
+ # ),
+ # )
+
+ # # Add the Listener Rule for the specific CloudFront Host Header
+ # https_listener.add_action(
+ # "CloudFrontHostHeaderRuleHTTPS",
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration),
+ # priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
+ # conditions=[
+ # elbv2.ListenerCondition.host_headers([cloudfront_distribution_url])
+ # ]
+ # )
+ # else:
+ # https_listener.add_action(
+ # "CloudFrontHostHeaderRuleHTTPS",
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration))
+
+ print("Added targets and actions to ALB HTTPS listener.")
+
+ except Exception as e:
+ raise Exception(
+ "Could not handle ALB target groups and listeners due to:", e
+ )
+
+ # Create WAF to attach to load balancer
+ try:
+ web_acl_name = LOAD_BALANCER_WEB_ACL_NAME
+ if get_context_bool(f"exists:{web_acl_name}"):
+ # Lookup WAF ACL by ARN from context
+ web_acl_arn = get_context_str(f"arn:{web_acl_name}")
+ if not web_acl_arn:
+ raise ValueError(
+ f"Context value 'arn:{web_acl_name}' is required if Web ACL exists."
+ )
+
+ web_acl = create_web_acl_with_common_rules(
+ self, web_acl_name, waf_scope="REGIONAL"
+ ) # Assuming it takes scope and name
+ print(f"Handled ALB WAF web ACL {web_acl_name}.")
+ else:
+ web_acl = create_web_acl_with_common_rules(
+ self, web_acl_name, waf_scope="REGIONAL"
+ ) # Assuming it takes scope and name
+ print(f"Created ALB WAF web ACL {web_acl_name}.")
+
+ wafv2.CfnWebACLAssociation(
+ self,
+ id="alb_waf_association",
+ resource_arn=alb.load_balancer_arn,
+ web_acl_arn=web_acl.attr_arn,
+ )
+
+ except Exception as e:
+ raise Exception("Could not handle create ALB WAF web ACL due to:", e)
+
+ # --- Outputs for other stacks/regions ---
+
+ self.params = dict()
+ self.params["alb_arn_output"] = alb.load_balancer_arn
+ self.params["alb_security_group_id"] = alb_security_group.security_group_id
+ self.params["alb_dns_name"] = alb.load_balancer_dns_name
+
+ CfnOutput(
+ self,
+ "AlbArnOutput",
+ value=alb.load_balancer_arn,
+ description="ARN of the Application Load Balancer",
+ export_name=f"{self.stack_name}-AlbArn",
+ ) # Export name must be unique within the account/region
+
+ CfnOutput(
+ self,
+ "AlbSecurityGroupIdOutput",
+ value=alb_security_group.security_group_id,
+ description="ID of the ALB's Security Group",
+ export_name=f"{self.stack_name}-AlbSgId",
+ )
+ CfnOutput(self, "ALBName", value=alb.load_balancer_name)
+
+ CfnOutput(self, "RegionalAlbDnsName", value=alb.load_balancer_dns_name)
+
+ CfnOutput(self, "CognitoPoolId", value=user_pool.user_pool_id)
+ # Add other outputs if needed
+
+ CfnOutput(self, "ECRRepoUri", value=ecr_repo.repository_uri)
+
+
+# --- CLOUDFRONT DISTRIBUTION in separate stack (us-east-1 required) ---
+class CdkStackCloudfront(Stack):
+
+ def __init__(
+ self,
+ scope: Construct,
+ construct_id: str,
+ alb_arn: str,
+ alb_sec_group_id: str,
+ alb_dns_name: str,
+ **kwargs,
+ ) -> None:
+ super().__init__(scope, construct_id, **kwargs)
+
+ # --- Helper to get context values ---
+ def get_context_bool(key: str, default: bool = False) -> bool:
+ return self.node.try_get_context(key) or default
+
+ def get_context_str(key: str, default: str = None) -> str:
+ return self.node.try_get_context(key) or default
+
+ def get_context_dict(scope: Construct, key: str, default: dict = None) -> dict:
+ return scope.node.try_get_context(key) or default
+
+ print(f"CloudFront Stack: Received ALB ARN: {alb_arn}")
+ print(f"CloudFront Stack: Received ALB Security Group ID: {alb_sec_group_id}")
+
+ if not alb_arn:
+ raise ValueError("ALB ARN must be provided to CloudFront stack")
+ if not alb_sec_group_id:
+ raise ValueError(
+ "ALB Security Group ID must be provided to CloudFront stack"
+ )
+
+ # 2. Import the ALB using its ARN
+ # This imports an existing ALB as a construct in the CloudFront stack's context.
+ # CloudFormation will understand this reference at deploy time.
+ alb = elbv2.ApplicationLoadBalancer.from_application_load_balancer_attributes(
+ self,
+ "ImportedAlb",
+ load_balancer_arn=alb_arn,
+ security_group_id=alb_sec_group_id,
+ load_balancer_dns_name=alb_dns_name,
+ )
+
+ try:
+ web_acl_name = WEB_ACL_NAME
+ if get_context_bool(f"exists:{web_acl_name}"):
+ # Lookup WAF ACL by ARN from context
+ web_acl_arn = get_context_str(f"arn:{web_acl_name}")
+ if not web_acl_arn:
+ raise ValueError(
+ f"Context value 'arn:{web_acl_name}' is required if Web ACL exists."
+ )
+
+ web_acl = create_web_acl_with_common_rules(
+ self, web_acl_name
+ ) # Assuming it takes scope and name
+ print(f"Handled Cloudfront WAF web ACL {web_acl_name}.")
+ else:
+ web_acl = create_web_acl_with_common_rules(
+ self, web_acl_name
+ ) # Assuming it takes scope and name
+ print(f"Created Cloudfront WAF web ACL {web_acl_name}.")
+
+ # Add ALB as CloudFront Origin
+ origin = origins.LoadBalancerV2Origin(
+ alb, # Use the created or looked-up ALB object
+ custom_headers={CUSTOM_HEADER: CUSTOM_HEADER_VALUE},
+ origin_shield_enabled=False,
+ protocol_policy=cloudfront.OriginProtocolPolicy.HTTP_ONLY,
+ )
+
+ if CLOUDFRONT_GEO_RESTRICTION:
+ geo_restrict = cloudfront.GeoRestriction.allowlist(
+ CLOUDFRONT_GEO_RESTRICTION
+ )
+ else:
+ geo_restrict = None
+
+ cloudfront_distribution = cloudfront.Distribution(
+ self,
+ "CloudFrontDistribution", # Logical ID
+ comment=CLOUDFRONT_DISTRIBUTION_NAME, # Use name as comment for easier identification
+ geo_restriction=geo_restrict,
+ default_behavior=cloudfront.BehaviorOptions(
+ origin=origin,
+ viewer_protocol_policy=cloudfront.ViewerProtocolPolicy.REDIRECT_TO_HTTPS,
+ allowed_methods=cloudfront.AllowedMethods.ALLOW_ALL,
+ cache_policy=cloudfront.CachePolicy.CACHING_DISABLED,
+ origin_request_policy=cloudfront.OriginRequestPolicy.ALL_VIEWER,
+ ),
+ web_acl_id=web_acl.attr_arn,
+ )
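+ # Caching is disabled and all viewer headers, cookies and query strings are forwarded,
+ # since the Gradio app served through the ALB is fully dynamic.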
+ print(f"Cloudfront distribution {CLOUDFRONT_DISTRIBUTION_NAME} defined.")
+
+ except Exception as e:
+ raise Exception("Could not handle Cloudfront distribution due to:", e)
+
+ # --- Outputs ---
+ CfnOutput(
+ self, "CloudFrontDistributionURL", value=cloudfront_distribution.domain_name
+ )
diff --git a/cdk/check_resources.py b/cdk/check_resources.py
new file mode 100644
index 0000000000000000000000000000000000000000..297e26a4f1d898cf293e8de553a3706fe3593018
--- /dev/null
+++ b/cdk/check_resources.py
@@ -0,0 +1,375 @@
+import json
+import os
+from typing import Any, Dict, List
+
+from cdk_config import ( # Import necessary config
+ ALB_NAME,
+ AWS_REGION,
+ CDK_CONFIG_PATH,
+ CDK_FOLDER,
+ CODEBUILD_PROJECT_NAME,
+ CODEBUILD_ROLE_NAME,
+ COGNITO_USER_POOL_CLIENT_NAME,
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME,
+ COGNITO_USER_POOL_NAME,
+ CONTEXT_FILE,
+ ECR_CDK_REPO_NAME,
+ ECS_TASK_EXECUTION_ROLE_NAME,
+ ECS_TASK_ROLE_NAME,
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
+ PRIVATE_SUBNET_CIDR_BLOCKS,
+ PRIVATE_SUBNETS_TO_USE,
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
+ PUBLIC_SUBNET_CIDR_BLOCKS,
+ PUBLIC_SUBNETS_TO_USE,
+ S3_LOG_CONFIG_BUCKET_NAME,
+ S3_OUTPUT_BUCKET_NAME,
+ VPC_NAME,
+ WEB_ACL_NAME,
+)
+from cdk_functions import ( # Import your check functions (assuming they use Boto3)
+ _get_existing_subnets_in_vpc,
+ check_alb_exists,
+ check_codebuild_project_exists,
+ check_ecr_repo_exists,
+ check_for_existing_role,
+ check_for_existing_user_pool,
+ check_for_existing_user_pool_client,
+ check_for_secret,
+ check_s3_bucket_exists,
+ check_subnet_exists_by_name,
+ check_web_acl_exists,
+ get_vpc_id_by_name,
+ validate_subnet_creation_parameters,
+ # Add other check functions as needed
+)
+
+cdk_folder = CDK_FOLDER
+
+# Full path needed to find config file
+os.environ["CDK_CONFIG_PATH"] = cdk_folder + CDK_CONFIG_PATH
+
+
+# --- Helper to parse environment variables into lists ---
+def _get_env_list(env_var_value: str) -> List[str]:
+ """Parse a bracketed, comma-separated config string into a list of strings."""
+ value = env_var_value[1:-1].strip().replace('"', "").replace("'", "")
+ if not value:
+ return []
+ # Split by comma and filter out any empty strings that might result from extra commas
+ return [s.strip() for s in value.split(",") if s.strip()]
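+# For example (a sketch, assuming the configured value keeps its surrounding brackets):
+#   _get_env_list('["subnet-public-1", "subnet-public-2"]') -> ["subnet-public-1", "subnet-public-2"]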
+
+
+if PUBLIC_SUBNETS_TO_USE and not isinstance(PUBLIC_SUBNETS_TO_USE, list):
+ PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
+if PRIVATE_SUBNETS_TO_USE and not isinstance(PRIVATE_SUBNETS_TO_USE, list):
+ PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
+if PUBLIC_SUBNET_CIDR_BLOCKS and not isinstance(PUBLIC_SUBNET_CIDR_BLOCKS, list):
+ PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
+if PUBLIC_SUBNET_AVAILABILITY_ZONES and not isinstance(
+ PUBLIC_SUBNET_AVAILABILITY_ZONES, list
+):
+ PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
+if PRIVATE_SUBNET_CIDR_BLOCKS and not isinstance(PRIVATE_SUBNET_CIDR_BLOCKS, list):
+ PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
+if PRIVATE_SUBNET_AVAILABILITY_ZONES and not isinstance(
+ PRIVATE_SUBNET_AVAILABILITY_ZONES, list
+):
+ PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
+
+# Check whether the required resources already exist in the AWS account so the CDK app knows which ones it still needs to create
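+# The collected values are written to CONTEXT_FILE as a flat JSON map, for example (illustrative values):
+#   {"vpc_id": "vpc-0123456789abcdef0", "exists:<resource-name>": true, "arn:<resource-name>": "arn:aws:..."}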
+
+
+def check_and_set_context():
+ context_data = {}
+
+ # --- Find the VPC ID first ---
+ if VPC_NAME:
+ print("VPC_NAME:", VPC_NAME)
+ vpc_id, nat_gateways = get_vpc_id_by_name(VPC_NAME)
+
+ # If you expect only one, or one per AZ and you're creating one per AZ in CDK:
+ if nat_gateways:
+ # For simplicity, let's just check if *any* NAT exists in the VPC
+ # A more robust check would match by subnet, AZ, or a specific tag.
+ context_data["exists:NatGateway"] = True
+ context_data["id:NatGateway"] = nat_gateways[0][
+ "NatGatewayId"
+ ] # Store the ID of the first one found
+ else:
+ context_data["exists:NatGateway"] = False
+ context_data["id:NatGateway"] = None
+
+ if not vpc_id:
+ # If the VPC doesn't exist, you might not be able to check/create subnets.
+ # Decide how to handle this: raise an error, set a flag, etc.
+ raise RuntimeError(
+ f"Required VPC '{VPC_NAME}' not found. Cannot proceed with subnet checks."
+ )
+
+ context_data["vpc_id"] = vpc_id # Store VPC ID in context
+
+ # SUBNET CHECKS
+ # Note: context_data already holds the VPC ID and NAT gateway results gathered above,
+ # so it must not be re-initialised here.
+ all_proposed_subnets_data: List[Dict[str, str]] = []
+
+ # Flag to indicate if full validation mode (with CIDR/AZs) is active
+ full_validation_mode = False
+
+ # Determine if full validation mode is possible/desired
+ # It's 'desired' if CIDR/AZs are provided, and their lengths match the name lists.
+ public_ready_for_full_validation = (
+ len(PUBLIC_SUBNETS_TO_USE) > 0
+ and len(PUBLIC_SUBNET_CIDR_BLOCKS) == len(PUBLIC_SUBNETS_TO_USE)
+ and len(PUBLIC_SUBNET_AVAILABILITY_ZONES) == len(PUBLIC_SUBNETS_TO_USE)
+ )
+ private_ready_for_full_validation = (
+ len(PRIVATE_SUBNETS_TO_USE) > 0
+ and len(PRIVATE_SUBNET_CIDR_BLOCKS) == len(PRIVATE_SUBNETS_TO_USE)
+ and len(PRIVATE_SUBNET_AVAILABILITY_ZONES) == len(PRIVATE_SUBNETS_TO_USE)
+ )
+
+ # Activate full validation if *any* type of subnet (public or private) has its full details provided.
+ # You might adjust this logic if you require ALL subnet types to have CIDRs, or NONE.
+ if public_ready_for_full_validation or private_ready_for_full_validation:
+ full_validation_mode = True
+
+ # If some are ready but others aren't, print a warning or raise an error based on your strictness
+ if (
+ public_ready_for_full_validation
+ and not private_ready_for_full_validation
+ and PRIVATE_SUBNETS_TO_USE
+ ):
+ print(
+ "Warning: Public subnets have CIDRs/AZs, but private subnets do not. Only public will be fully validated/created with CIDRs."
+ )
+ if (
+ private_ready_for_full_validation
+ and not public_ready_for_full_validation
+ and PUBLIC_SUBNETS_TO_USE
+ ):
+ print(
+ "Warning: Private subnets have CIDRs/AZs, but public subnets do not. Only private will be fully validated/created with CIDRs."
+ )
+
+ # Prepare data for validate_subnet_creation_parameters for all subnets that have full details
+ if public_ready_for_full_validation:
+ for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
+ all_proposed_subnets_data.append(
+ {
+ "name": name,
+ "cidr": PUBLIC_SUBNET_CIDR_BLOCKS[i],
+ "az": PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
+ }
+ )
+ if private_ready_for_full_validation:
+ for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
+ all_proposed_subnets_data.append(
+ {
+ "name": name,
+ "cidr": PRIVATE_SUBNET_CIDR_BLOCKS[i],
+ "az": PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
+ }
+ )
+
+ print(f"Target VPC ID for Boto3 lookup: {vpc_id}")
+
+ # Fetch all existing subnets in the target VPC once to avoid repeated API calls
+ try:
+ existing_aws_subnets = _get_existing_subnets_in_vpc(vpc_id)
+ except Exception as e:
+ print(f"Failed to fetch existing VPC subnets. Aborting. Error: {e}")
+ raise SystemExit(1) # Exit immediately if we can't get baseline data
+
+ print("\n--- Running Name-Only Subnet Existence Check Mode ---")
+ # Fallback: check only by name using the existing data
+ checked_public_subnets = {}
+ if PUBLIC_SUBNETS_TO_USE:
+ for subnet_name in PUBLIC_SUBNETS_TO_USE:
+ print("subnet_name:", subnet_name)
+ exists, subnet_id = check_subnet_exists_by_name(
+ subnet_name, existing_aws_subnets
+ )
+ checked_public_subnets[subnet_name] = {
+ "exists": exists,
+ "id": subnet_id,
+ }
+
+ # If the subnet exists, remove it from the proposed subnets list
+ if checked_public_subnets[subnet_name]["exists"] is True:
+ all_proposed_subnets_data = [
+ subnet
+ for subnet in all_proposed_subnets_data
+ if subnet["name"] != subnet_name
+ ]
+
+ context_data["checked_public_subnets"] = checked_public_subnets
+
+ checked_private_subnets = {}
+ if PRIVATE_SUBNETS_TO_USE:
+ for subnet_name in PRIVATE_SUBNETS_TO_USE:
+ print("subnet_name:", subnet_name)
+ exists, subnet_id = check_subnet_exists_by_name(
+ subnet_name, existing_aws_subnets
+ )
+ checked_private_subnets[subnet_name] = {
+ "exists": exists,
+ "id": subnet_id,
+ }
+
+ # If the subnet exists, remove it from the proposed subnets list
+ if checked_private_subnets[subnet_name]["exists"] is True:
+ all_proposed_subnets_data = [
+ subnet
+ for subnet in all_proposed_subnets_data
+ if subnet["name"] != subnet_name
+ ]
+
+ context_data["checked_private_subnets"] = checked_private_subnets
+
+ print("\nName-only existence subnet check complete.\n")
+
+ if full_validation_mode:
+ print(
+ "\n--- Running in Full Subnet Validation Mode (CIDR/AZs provided) ---"
+ )
+ try:
+ validate_subnet_creation_parameters(
+ vpc_id, all_proposed_subnets_data, existing_aws_subnets
+ )
+ print("\nPre-synth validation successful. Proceeding with CDK synth.\n")
+
+ # Populate context_data for downstream CDK construct creation
+ context_data["public_subnets_to_create"] = []
+ if public_ready_for_full_validation:
+ for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
+ context_data["public_subnets_to_create"].append(
+ {
+ "name": name,
+ "cidr": PUBLIC_SUBNET_CIDR_BLOCKS[i],
+ "az": PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
+ "is_public": True,
+ }
+ )
+ context_data["private_subnets_to_create"] = []
+ if private_ready_for_full_validation:
+ for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
+ context_data["private_subnets_to_create"].append(
+ {
+ "name": name,
+ "cidr": PRIVATE_SUBNET_CIDR_BLOCKS[i],
+ "az": PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
+ "is_public": False,
+ }
+ )
+
+ except Exception as e:
+ print(f"\nFATAL ERROR: Subnet parameter validation failed: {e}\n")
+ raise SystemExit(1) # Exit if validation fails
+
+ # Example checks and setting context values
+ # IAM Roles
+ role_name = CODEBUILD_ROLE_NAME
+ exists, role_arn, _ = check_for_existing_role(role_name)
+ context_data[f"exists:{role_name}"] = exists # Use boolean
+ if exists:
+ context_data[f"arn:{role_name}"] = role_arn
+
+ role_name = ECS_TASK_ROLE_NAME
+ exists, role_arn, _ = check_for_existing_role(role_name)
+ context_data[f"exists:{role_name}"] = exists
+ if exists:
+ context_data[f"arn:{role_name}"] = role_arn
+
+ role_name = ECS_TASK_EXECUTION_ROLE_NAME
+ exists, role_arn, _ = check_for_existing_role(role_name)
+ context_data[f"exists:{role_name}"] = exists
+ if exists:
+ context_data[f"arn:{role_name}"] = role_arn
+
+ # S3 Buckets
+ bucket_name = S3_LOG_CONFIG_BUCKET_NAME
+ exists, _ = check_s3_bucket_exists(bucket_name)
+ context_data[f"exists:{bucket_name}"] = exists
+ if exists:
+ # You might not need the ARN if using from_bucket_name
+ pass
+
+ output_bucket_name = S3_OUTPUT_BUCKET_NAME
+ exists, _ = check_s3_bucket_exists(output_bucket_name)
+ context_data[f"exists:{output_bucket_name}"] = exists
+ if exists:
+ pass
+
+ # ECR Repository
+ repo_name = ECR_CDK_REPO_NAME
+ exists, _ = check_ecr_repo_exists(repo_name)
+ context_data[f"exists:{repo_name}"] = exists
+ if exists:
+ pass # from_repository_name is sufficient
+
+ # CodeBuild Project
+ project_name = CODEBUILD_PROJECT_NAME
+ exists, _ = check_codebuild_project_exists(project_name)
+ context_data[f"exists:{project_name}"] = exists
+ if exists:
+ # Need a way to get the ARN from the check function
+ _, project_arn = check_codebuild_project_exists(
+ project_name
+ ) # Assuming it returns ARN
+ context_data[f"arn:{project_name}"] = project_arn
+
+ # ALB (by name lookup)
+ alb_name = ALB_NAME
+ exists, _ = check_alb_exists(alb_name, region_name=AWS_REGION)
+ context_data[f"exists:{alb_name}"] = exists
+ if exists:
+ _, alb_object = check_alb_exists(
+ alb_name, region_name=AWS_REGION
+ ) # Assuming check returns object
+ print("alb_object:", alb_object)
+ context_data[f"arn:{alb_name}"] = alb_object["LoadBalancerArn"]
+
+ # Cognito User Pool (by name)
+ user_pool_name = COGNITO_USER_POOL_NAME
+ exists, user_pool_id, _ = check_for_existing_user_pool(user_pool_name)
+ context_data[f"exists:{user_pool_name}"] = exists
+ if exists:
+ context_data[f"id:{user_pool_name}"] = user_pool_id
+
+ # Cognito User Pool Client (by name and pool ID) - requires User Pool ID from check
+ if user_pool_id:
+ user_pool_id_for_client_check = user_pool_id # context_data.get(f"id:{user_pool_name}") # Use ID from context
+ user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
+ if user_pool_id_for_client_check:
+ exists, client_id, _ = check_for_existing_user_pool_client(
+ user_pool_client_name, user_pool_id_for_client_check
+ )
+ context_data[f"exists:{user_pool_client_name}"] = exists
+ if exists:
+ context_data[f"id:{user_pool_client_name}"] = client_id
+
+ # Secrets Manager Secret (by name)
+ secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
+ exists, _ = check_for_secret(secret_name)
+ context_data[f"exists:{secret_name}"] = exists
+ # You might not need the ARN if using from_secret_name_v2
+
+ # WAF Web ACL (by name and scope)
+ web_acl_name = WEB_ACL_NAME
+ exists, existing_web_acl = check_web_acl_exists(
+ web_acl_name, scope="CLOUDFRONT"
+ ) # Assuming the check returns the Web ACL object
+ context_data[f"exists:{web_acl_name}"] = exists
+ if exists:
+ context_data[f"arn:{web_acl_name}"] = existing_web_acl.attr_arn
+
+ # Write the context data to the file
+ with open(CONTEXT_FILE, "w") as f:
+ json.dump(context_data, f, indent=2)
+
+ print(f"Context data written to {CONTEXT_FILE}")
diff --git a/cdk/lambda_load_dynamo_logs.py b/cdk/lambda_load_dynamo_logs.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ada4d875b11fe69d7786a2ec38b6f59445d4822
--- /dev/null
+++ b/cdk/lambda_load_dynamo_logs.py
@@ -0,0 +1,321 @@
+"""
+Lambda handler to export DynamoDB usage log table to CSV and upload to S3.
+
+All inputs are read from environment variables (no argparse).
+Intended to run as an AWS Lambda function; can also be invoked locally
+by setting env vars and calling lambda_handler({}, None).
+
+Environment variables (same semantics as load_dynamo_logs.py CLI):
+ DYNAMODB_TABLE_NAME - DynamoDB table name (default: redaction_usage)
+ AWS_REGION - AWS region (optional; if unset, uses AWS_DEFAULT_REGION,
+ then region from Lambda context ARN, then eu-west-2)
+ OUTPUT_FOLDER - Local output directory, e.g. /tmp (optional)
+ OUTPUT_FILENAME - Local output file name (default: dynamodb_logs_export.csv)
+ OUTPUT - Full local output path (overrides folder + filename if set).
+ In Lambda only /tmp is writable; relative paths are auto-resolved to /tmp.
+ FROM_DATE - Only include entries on/after this date YYYY-MM-DD (optional)
+ TO_DATE - Only include entries on/before this date YYYY-MM-DD (optional)
+ DATE_ATTRIBUTE - Attribute name for date filtering (default: timestamp)
+ S3_OUTPUT_BUCKET - S3 bucket for the output CSV (required for upload)
+ S3_OUTPUT_KEY - S3 object key/path for the output CSV (required for upload)
+"""
+
+import csv
+import datetime
+import os
+from decimal import Decimal
+from io import StringIO
+
+import boto3
+
+
+def _get_region_from_context(context):
+ """Extract region from Lambda context invoked_function_arn (arn:aws:lambda:REGION:ACCOUNT:function:NAME)."""
+ if context is None:
+ return None
+ arn = getattr(context, "invoked_function_arn", None)
+ if not arn or not isinstance(arn, str):
+ return None
+ parts = arn.split(":")
+ if len(parts) >= 4:
+ return parts[3] # region is 4th segment
+ return None
+
+
+def get_config_from_env(context=None):
+ """Read all settings from environment variables (same inputs as load_dynamo_logs.py).
+ When running in Lambda, context can be passed to derive region from the function ARN if env is not set.
+ """
+ today = datetime.datetime.now().date()
+ one_year_ago = today - datetime.timedelta(days=365)
+
+ table_name = os.environ.get("DYNAMODB_TABLE_NAME") or os.environ.get(
+ "USAGE_LOG_DYNAMODB_TABLE_NAME", "redaction_usage"
+ )
+ region = (
+ os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") or ""
+ ).strip()
+ output = os.environ.get("OUTPUT")
+ output_folder = os.environ.get("OUTPUT_FOLDER", "output/")
+ output_filename = os.environ.get("OUTPUT_FILENAME", "dynamodb_logs_export.csv")
+ from_date_str = os.environ.get("FROM_DATE")
+ to_date_str = os.environ.get("TO_DATE")
+ date_attribute = os.environ.get("DATE_ATTRIBUTE", "timestamp")
+ s3_output_bucket = os.environ.get("S3_OUTPUT_BUCKET")
+ s3_output_key = os.environ.get("S3_OUTPUT_KEY")
+
+ if output:
+ local_output_path = output
+ else:
+ folder = output_folder.rstrip("/").rstrip("\\")
+ local_output_path = os.path.join(folder, output_filename)
+
+ # In AWS Lambda only /tmp is writable; resolve relative paths to /tmp to avoid read-only FS errors
+ if os.environ.get("AWS_LAMBDA_FUNCTION_NAME"):
+ resolved = os.path.abspath(local_output_path)
+ if not resolved.startswith("/tmp"):
+ local_output_path = os.path.join(
+ "/tmp", os.path.basename(local_output_path)
+ )
+
+ # Region: env (AWS_REGION / AWS_DEFAULT_REGION) → Lambda context ARN → hardcoded fallback
+ if not region and context is not None:
+ region = _get_region_from_context(context) or ""
+ if not region:
+ region = "FILL IN DEFAULT REGION HERE"
+
+ from_date = None
+ to_date = None
+ if from_date_str:
+ from_date = datetime.datetime.strptime(from_date_str, "%Y-%m-%d").date()
+ if to_date_str:
+ to_date = datetime.datetime.strptime(to_date_str, "%Y-%m-%d").date()
+ if from_date is None and to_date is None:
+ from_date = one_year_ago
+ to_date = today
+ elif from_date is None:
+ from_date = one_year_ago
+ elif to_date is None:
+ to_date = today
+
+ return {
+ "table_name": table_name,
+ "region": region,
+ "local_output_path": local_output_path,
+ "from_date": from_date,
+ "to_date": to_date,
+ "date_attribute": date_attribute,
+ "s3_output_bucket": s3_output_bucket,
+ "s3_output_key": s3_output_key,
+ }
+
+
+# Helper function to convert Decimal to float or int
+def convert_types(item):
+ new_item = {}
+ for key, value in item.items():
+ if isinstance(value, Decimal):
+ new_item[key] = int(value) if value % 1 == 0 else float(value)
+ elif isinstance(value, str):
+ try:
+ dt_obj = datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
+ new_item[key] = dt_obj.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+ except (ValueError, TypeError):
+ new_item[key] = value
+ else:
+ new_item[key] = value
+ return new_item
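+ # For example (illustrative values): {"count": Decimal("3"), "timestamp": "2024-05-01T12:00:00Z"}
+ # would become {"count": 3, "timestamp": "2024-05-01 12:00:00.000"}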
+
+
+def _parse_item_date(value):
+ """Parse a DynamoDB attribute value to datetime for comparison. Returns None if unparseable."""
+ if value is None:
+ return None
+ if isinstance(value, Decimal):
+ try:
+ return datetime.datetime.utcfromtimestamp(float(value))
+ except (ValueError, OSError):
+ return None
+ if isinstance(value, (int, float)):
+ try:
+ return datetime.datetime.utcfromtimestamp(float(value))
+ except (ValueError, OSError):
+ return None
+ if isinstance(value, str):
+ for fmt in (
+ "%Y-%m-%d %H:%M:%S.%f",
+ "%Y-%m-%d %H:%M:%S",
+ "%Y-%m-%d",
+ "%Y-%m-%dT%H:%M:%S",
+ ):
+ try:
+ return datetime.datetime.strptime(value, fmt)
+ except (ValueError, TypeError):
+ continue
+ try:
+ return datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
+ except (ValueError, TypeError):
+ pass
+ return None
+
+
+def filter_items_by_date(items, from_date, to_date, date_attribute: str):
+ """Return items whose date attribute falls within [from_date, to_date] (inclusive)."""
+ if from_date is None and to_date is None:
+ return items
+ start = datetime.datetime.combine(from_date, datetime.time.min)
+ end = datetime.datetime.combine(to_date, datetime.time.max)
+ filtered = []
+ for item in items:
+ raw = item.get(date_attribute)
+ dt = _parse_item_date(raw)
+ if dt is None:
+ continue
+ if dt.tzinfo:
+ dt = dt.replace(tzinfo=None)
+ if start <= dt <= end:
+ filtered.append(item)
+ return filtered
+
+
+def scan_table(table):
+ """Paginated scan of DynamoDB table."""
+ items = []
+ response = table.scan()
+ items.extend(response["Items"])
+ while "LastEvaluatedKey" in response:
+ response = table.scan(ExclusiveStartKey=response["LastEvaluatedKey"])
+ items.extend(response["Items"])
+ return items
+
+
+def export_to_csv_buffer(items, fields_to_drop=None):
+ """
+ Write items to a CSV in memory; return (csv_string, fieldnames).
+ Use for uploading to S3 without writing to disk.
+ """
+ if not items:
+ return "", []
+
+ drop_set = set(fields_to_drop or [])
+ all_keys = set()
+ for item in items:
+ all_keys.update(item.keys())
+ fieldnames = sorted(list(all_keys - drop_set))
+
+ buf = StringIO()
+ writer = csv.DictWriter(
+ buf, fieldnames=fieldnames, extrasaction="ignore", restval=""
+ )
+ writer.writeheader()
+ for item in items:
+ writer.writerow(convert_types(item))
+ return buf.getvalue(), fieldnames
+
+
+def export_to_csv_file(items, output_path, fields_to_drop=None):
+ """Write items to a CSV file (for optional /tmp or local path)."""
+ csv_string, _ = export_to_csv_buffer(items, fields_to_drop)
+ if not csv_string:
+ return
+ os.makedirs(os.path.dirname(os.path.abspath(output_path)) or ".", exist_ok=True)
+ with open(output_path, "w", newline="", encoding="utf-8-sig") as f:
+ f.write(csv_string)
+
+
+def run_export(config):
+ """
+ Run the full export: scan DynamoDB, filter by date, write CSV (buffer and/or file), upload to S3.
+ """
+ table_name = config["table_name"]
+ region = config["region"]
+ local_output_path = config["local_output_path"]
+ from_date = config["from_date"]
+ to_date = config["to_date"]
+ date_attribute = config["date_attribute"]
+ s3_output_bucket = config["s3_output_bucket"]
+ s3_output_key = config["s3_output_key"]
+
+ if from_date > to_date:
+ raise ValueError("FROM_DATE must be on or before TO_DATE")
+
+ dynamodb = boto3.resource("dynamodb", region_name=region or None)
+ table = dynamodb.Table(table_name)
+
+ items = scan_table(table)
+ items = filter_items_by_date(items, from_date, to_date, date_attribute)
+
+ csv_string, fieldnames = export_to_csv_buffer(items, fields_to_drop=[])
+ result = {
+ "item_count": len(items),
+ "from_date": str(from_date),
+ "to_date": str(to_date),
+ "columns": fieldnames,
+ }
+
+ if csv_string:
+ # Optional: write to local path (e.g. /tmp in Lambda)
+ try:
+ export_to_csv_file(items, local_output_path, fields_to_drop=[])
+ result["local_path"] = local_output_path
+ except Exception as e:
+ result["local_write_error"] = str(e)
+
+ # Upload to S3 if bucket and key are set
+ if s3_output_bucket and s3_output_key:
+ s3 = boto3.client("s3", region_name=region or None)
+ s3.put_object(
+ Bucket=s3_output_bucket,
+ Key=s3_output_key,
+ Body=csv_string.encode("utf-8-sig"),
+ ContentType="text/csv; charset=utf-8",
+ )
+ result["s3_uri"] = f"s3://{s3_output_bucket}/{s3_output_key}"
+ elif s3_output_bucket or s3_output_key:
+ result["s3_skip_reason"] = (
+ "Both S3_OUTPUT_BUCKET and S3_OUTPUT_KEY must be set"
+ )
+
+ return result
+
+
+def lambda_handler(event, context):
+ """
+ AWS Lambda entrypoint. Config is read from environment variables.
+
+ Event is not required for config; it can be used to override env vars
+ (e.g. pass table_name, from_date, to_date, s3_output_bucket, s3_output_key).
+ """
+ config = get_config_from_env(context=context)
+
+ # Optional: allow event to override env-based config
+ if isinstance(event, dict):
+ if event.get("table_name"):
+ config["table_name"] = event["table_name"]
+ if event.get("region"):
+ config["region"] = event["region"]
+ if event.get("from_date"):
+ config["from_date"] = datetime.datetime.strptime(
+ event["from_date"], "%Y-%m-%d"
+ ).date()
+ if event.get("to_date"):
+ config["to_date"] = datetime.datetime.strptime(
+ event["to_date"], "%Y-%m-%d"
+ ).date()
+ if event.get("date_attribute"):
+ config["date_attribute"] = event["date_attribute"]
+ if event.get("s3_output_bucket"):
+ config["s3_output_bucket"] = event["s3_output_bucket"]
+ if event.get("s3_output_key"):
+ config["s3_output_key"] = event["s3_output_key"]
+
+ result = run_export(config)
+ return {"statusCode": 200, "body": result}
+
+
+if __name__ == "__main__":
+ # Allow running locally with env vars set
+ import json
+
+ result = lambda_handler({}, None)
+ print(json.dumps(result, indent=2))
diff --git a/cdk/post_cdk_build_quickstart.py b/cdk/post_cdk_build_quickstart.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c20a1b5b3edd7a6d50b7234c9ca6429e0be40d2
--- /dev/null
+++ b/cdk/post_cdk_build_quickstart.py
@@ -0,0 +1,40 @@
+import time
+
+from cdk_config import (
+ CLUSTER_NAME,
+ CODEBUILD_PROJECT_NAME,
+ ECS_SERVICE_NAME,
+ S3_LOG_CONFIG_BUCKET_NAME,
+)
+from cdk_functions import (
+ create_basic_config_env,
+ start_codebuild_build,
+ start_ecs_task,
+ upload_file_to_s3,
+)
+from tqdm import tqdm
+
+# Create basic config.env file that user can use to run the app later. Input is the folder it is saved into.
+create_basic_config_env("config")
+
+# Start codebuild build
+print("Starting CodeBuild project.")
+start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)
+
+# Upload config.env file to S3 bucket
+upload_file_to_s3(
+ local_file_paths="config/config.env", s3_key="", s3_bucket=S3_LOG_CONFIG_BUCKET_NAME
+)
+
+total_seconds = 660 # 11 minutes
+update_interval = 1 # Update every second
+
+print("Waiting 11 minutes for the CodeBuild container to build.")
+
+# tqdm displays progress while the script sleeps for one second per iteration
+for i in tqdm(range(total_seconds), desc="Building container"):
+ time.sleep(update_interval)
+
+# Start task on ECS
+print("Starting ECS task")
+start_ecs_task(cluster_name=CLUSTER_NAME, service_name=ECS_SERVICE_NAME)
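+
+# Note: the fixed 11-minute wait above is a simple heuristic. A more robust approach would poll
+# CodeBuild directly; a rough sketch is below, assuming start_codebuild_build can be made to
+# return the build id (an assumption about cdk_functions, not current behaviour):
+#
+#   import boto3
+#   codebuild = boto3.client("codebuild")
+#   while True:
+#       status = codebuild.batch_get_builds(ids=[build_id])["builds"][0]["buildStatus"]
+#       if status != "IN_PROGRESS":
+#           break
+#       time.sleep(30)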
diff --git a/cdk/requirements.txt b/cdk/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bda05036c0bd99b31b7750672b89e527a39ebfb5
--- /dev/null
+++ b/cdk/requirements.txt
@@ -0,0 +1,5 @@
+aws-cdk-lib==2.243.0
+boto3==1.42.61
+pandas==2.3.3
+nodejs==0.1.1
+python-dotenv==1.0.1
\ No newline at end of file
diff --git a/cli_redact.py b/cli_redact.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c3b3f10f71d681f4caa7d8cb0f17405488d243c
--- /dev/null
+++ b/cli_redact.py
@@ -0,0 +1,2447 @@
+import argparse
+import os
+import re
+import time
+import uuid
+from datetime import datetime
+
+import pandas as pd
+
+from tools.aws_functions import download_file_from_s3, export_outputs_to_s3
+from tools.config import (
+ ACCESS_LOGS_FOLDER,
+ ALLOW_LIST_PATH,
+ AWS_ACCESS_KEY,
+ AWS_LLM_PII_OPTION,
+ AWS_PII_OPTION,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ AZURE_OPENAI_API_KEY,
+ AZURE_OPENAI_INFERENCE_ENDPOINT,
+ CHOSEN_COMPREHEND_ENTITIES,
+ CHOSEN_LLM_ENTITIES,
+ CHOSEN_LLM_PII_INFERENCE_METHOD,
+ CHOSEN_REDACT_ENTITIES,
+ CLOUD_LLM_PII_MODEL_CHOICE,
+ CLOUD_VLM_MODEL_CHOICE,
+ COMPRESS_REDACTED_PDF,
+ CUSTOM_ENTITIES,
+ DEFAULT_COMBINE_PAGES,
+ DEFAULT_COST_CODE,
+ DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
+ DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
+ DEFAULT_INFERENCE_SERVER_PII_MODEL,
+ DEFAULT_INFERENCE_SERVER_VLM_MODEL,
+ DEFAULT_LANGUAGE,
+ DEFAULT_LOCAL_OCR_MODEL,
+ DEFAULT_MIN_CONSECUTIVE_PAGES,
+ DEFAULT_MIN_WORD_COUNT,
+ DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
+ DENY_LIST_PATH,
+ DIRECT_MODE_DEFAULT_USER,
+ DISPLAY_FILE_NAMES_IN_LOGS,
+ DO_INITIAL_TABULAR_DATA_CLEAN,
+ DOCUMENT_REDACTION_BUCKET,
+ EFFICIENT_OCR,
+ EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION,
+ EFFICIENT_OCR_MIN_WORDS,
+ FEEDBACK_LOGS_FOLDER,
+ FULL_COMPREHEND_ENTITY_LIST,
+ FULL_ENTITY_LIST,
+ FULL_LLM_ENTITY_LIST,
+ GEMINI_API_KEY,
+ GRADIO_TEMP_DIR,
+ HYBRID_TEXTRACT_BEDROCK_VLM,
+ IMAGES_DPI,
+ INFERENCE_SERVER_API_URL,
+ INFERENCE_SERVER_PII_OPTION,
+ INPUT_FOLDER,
+ LLM_MAX_NEW_TOKENS,
+ LLM_PII_INFERENCE_METHODS,
+ LLM_TEMPERATURE,
+ LOCAL_OCR_MODEL_OPTIONS,
+ LOCAL_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ OCR_FIRST_PASS_MAX_WORKERS,
+ OUTPUT_FOLDER,
+ OVERWRITE_EXISTING_OCR_RESULTS,
+ PADDLE_MODEL_PATH,
+ PREPROCESS_LOCAL_OCR_IMAGES,
+ REMOVE_DUPLICATE_ROWS,
+ RETURN_REDACTED_PDF,
+ RUN_AWS_FUNCTIONS,
+ S3_OUTPUTS_BUCKET,
+ S3_OUTPUTS_FOLDER,
+ S3_USAGE_LOGS_FOLDER,
+ SAVE_LOGS_TO_CSV,
+ SAVE_LOGS_TO_DYNAMODB,
+ SAVE_OUTPUTS_TO_S3,
+ SAVE_PAGE_OCR_VISUALISATIONS,
+ SESSION_OUTPUT_FOLDER,
+ SPACY_MODEL_PATH,
+ SUMMARY_PAGE_GROUP_MAX_WORKERS,
+ TEXTRACT_JOBS_LOCAL_LOC,
+ TEXTRACT_JOBS_S3_LOC,
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
+ USAGE_LOGS_FOLDER,
+ USE_GREEDY_DUPLICATE_DETECTION,
+ WHOLE_PAGE_REDACTION_LIST_PATH,
+ convert_string_to_boolean,
+)
+
+
+def _generate_session_hash() -> str:
+ """Generate a unique session hash for logging purposes."""
+ return str(uuid.uuid4())[:8]
+
+
+def _sanitize_folder_name(folder_name: str, max_length: int = 50) -> str:
+ """
+ Sanitize folder name for S3 compatibility.
+
+ Replaces 'strange' characters (anything that's not alphanumeric, dash, underscore, or full stop)
+ with underscores, and limits the length to max_length characters.
+
+ Args:
+ folder_name: Original folder name to sanitize
+ max_length: Maximum length for the folder name (default: 50)
+
+ Returns:
+ Sanitized folder name
+ """
+ if not folder_name:
+ return folder_name
+
+ # Replace any character that's not alphanumeric, dash, underscore, or full stop with underscore
+ # This handles @, commas, exclamation marks, spaces, etc.
+ sanitized = re.sub(r"[^a-zA-Z0-9._-]", "_", folder_name)
+
+ # Limit length to max_length
+ if len(sanitized) > max_length:
+ sanitized = sanitized[:max_length]
+
+ return sanitized
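+ # e.g. _sanitize_folder_name("j.smith@example.com!") -> "j.smith_example.com_"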
+
+
+def get_username_and_folders(
+ username: str = "",
+ output_folder_textbox: str = OUTPUT_FOLDER,
+ input_folder_textbox: str = INPUT_FOLDER,
+ session_output_folder: bool = SESSION_OUTPUT_FOLDER,
+ textract_document_upload_input_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
+ textract_document_upload_output_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
+ s3_textract_document_logs_subfolder: str = TEXTRACT_JOBS_S3_LOC,
+ local_textract_document_logs_subfolder: str = TEXTRACT_JOBS_LOCAL_LOC,
+):
+
+ # Generate session hash for logging. Either from input user name or generated
+ if username:
+ out_session_hash = username
+ else:
+ out_session_hash = _generate_session_hash()
+
+ # Sanitize session hash for S3 compatibility (especially important for S3 folder paths)
+ sanitized_session_hash = _sanitize_folder_name(out_session_hash)
+
+ if session_output_folder:
+ output_folder = output_folder_textbox + sanitized_session_hash + "/"
+ input_folder = input_folder_textbox + sanitized_session_hash + "/"
+
+ textract_document_upload_input_folder = (
+ textract_document_upload_input_folder + "/" + sanitized_session_hash
+ )
+ textract_document_upload_output_folder = (
+ textract_document_upload_output_folder + "/" + sanitized_session_hash
+ )
+
+ s3_textract_document_logs_subfolder = (
+ s3_textract_document_logs_subfolder + "/" + sanitized_session_hash
+ )
+ local_textract_document_logs_subfolder = (
+ local_textract_document_logs_subfolder + "/" + sanitized_session_hash + "/"
+ )
+
+ else:
+ output_folder = output_folder_textbox
+ input_folder = input_folder_textbox
+
+ # makedirs creates nested session folders and does not fail if the parent is missing
+ os.makedirs(output_folder, exist_ok=True)
+ os.makedirs(input_folder, exist_ok=True)
+
+ return (
+ out_session_hash,
+ output_folder,
+ out_session_hash,
+ input_folder,
+ textract_document_upload_input_folder,
+ textract_document_upload_output_folder,
+ s3_textract_document_logs_subfolder,
+ local_textract_document_logs_subfolder,
+ )
+
+
+def _get_env_list(env_var_name: str) -> list[str]:
+ """Parses a comma-separated environment variable into a list of strings."""
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
+ if not value:
+ return []
+ # Split by comma and filter out any empty strings that might result from extra commas
+ return [s.strip() for s in value.split(",") if s.strip()]
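+ # e.g. _get_env_list('["TITLES", "PERSON"]') -> ["TITLES", "PERSON"]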
+
+
+def _download_s3_file_if_needed(
+ file_path: str, default_filename: str = "downloaded_file"
+) -> str:
+ """
+ Download a file from S3 if the path starts with 's3://' or 'S3://', otherwise return the path as-is.
+
+ Args:
+ file_path: File path (either local or S3 URL)
+ default_filename: Default filename to use if S3 key doesn't have a filename
+
+ Returns:
+ Local file path (downloaded from S3 or original path)
+ """
+ if not file_path:
+ return file_path
+
+ # Check for S3 URL (case-insensitive)
+ file_path_stripped = file_path.strip()
+ file_path_upper = file_path_stripped.upper()
+ if not file_path_upper.startswith("S3://"):
+ return file_path
+
+ # Use GRADIO_TEMP_DIR if available, otherwise use INPUT_FOLDER as fallback
+ temp_dir = GRADIO_TEMP_DIR if GRADIO_TEMP_DIR else INPUT_FOLDER
+ os.makedirs(temp_dir, exist_ok=True)
+
+ # Parse S3 URL: s3://bucket/key (preserve original case for bucket/key)
+ # Remove 's3://' prefix (case-insensitive)
+ s3_path = (
+ file_path_stripped.split("://", 1)[1]
+ if "://" in file_path_stripped
+ else file_path_stripped
+ )
+ # Split bucket and key (first '/' separates bucket from key)
+ if "/" in s3_path:
+ bucket_name_s3, s3_key = s3_path.split("/", 1)
+ else:
+ # If no key provided, use bucket name as key (unlikely but handle it)
+ bucket_name_s3 = s3_path
+ s3_key = ""
+
+ # Get the filename from the S3 key
+ filename = os.path.basename(s3_key) if s3_key else bucket_name_s3
+ if not filename:
+ filename = default_filename
+
+ # Create local file path in temp directory
+ local_file_path = os.path.join(temp_dir, filename)
+
+ # Download file from S3
+ try:
+ download_file_from_s3(
+ bucket_name=bucket_name_s3,
+ key=s3_key,
+ local_file_path_and_name=local_file_path,
+ )
+ print(f"S3 file downloaded successfully: {file_path} -> {local_file_path}")
+ return local_file_path
+ except Exception as e:
+ print(f"Error downloading file from S3 ({file_path}): {e}")
+ raise Exception(f"Failed to download file from S3: {e}")
+
+
+def _build_s3_output_folder(
+ s3_outputs_folder: str,
+ session_hash: str,
+ save_to_user_folders: bool,
+) -> str:
+ """
+ Build the S3 output folder path with session hash and date suffix if needed.
+
+ Args:
+ s3_outputs_folder: Base S3 folder path
+ session_hash: Session hash/username
+ save_to_user_folders: Whether to append session hash to folder path
+
+ Returns:
+ Final S3 folder path with session hash and date suffix
+ """
+ if not s3_outputs_folder:
+ return ""
+
+ # Append session hash if save_to_user_folders is enabled
+ if save_to_user_folders and session_hash:
+ sanitized_session_hash = _sanitize_folder_name(session_hash)
+ s3_outputs_folder = (
+ s3_outputs_folder.rstrip("/") + "/" + sanitized_session_hash + "/"
+ )
+ else:
+ # Ensure trailing slash
+ if not s3_outputs_folder.endswith("/"):
+ s3_outputs_folder = s3_outputs_folder + "/"
+
+ # Append today's date (YYYYMMDD/)
+ today_suffix = datetime.now().strftime("%Y%m%d") + "/"
+ s3_outputs_folder = s3_outputs_folder.rstrip("/") + "/" + today_suffix
+
+ return s3_outputs_folder
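+ # e.g. with save_to_user_folders=True, s3_outputs_folder="outputs/" and session_hash="jsmith",
+ # the result is "outputs/jsmith/20240501/" (the date suffix shown is illustrative)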
+
+
+# Add custom spaCy recognisers to the Comprehend list, so that the local spaCy model can be used to pick up e.g. titles, street names, and UK postcodes that are sometimes missed by Comprehend
+CHOSEN_COMPREHEND_ENTITIES.extend(CUSTOM_ENTITIES)
+FULL_COMPREHEND_ENTITY_LIST.extend(CUSTOM_ENTITIES)
+
+chosen_redact_entities = CHOSEN_REDACT_ENTITIES
+full_entity_list = FULL_ENTITY_LIST
+chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
+full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
+chosen_llm_entities = CHOSEN_LLM_ENTITIES
+full_llm_entity_list = FULL_LLM_ENTITY_LIST
+default_handwrite_signature_checkbox = DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX
+
+
+# --- Main CLI Function ---
+def main(direct_mode_args={}):
+ """
+ A unified command-line interface to prepare, redact, and anonymise various document types.
+
+ Args:
+ direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution.
+ If provided, uses these instead of parsing command line arguments.
+ """
+ parser = argparse.ArgumentParser(
+ description="A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.",
+ formatter_class=argparse.RawTextHelpFormatter,
+ epilog="""
+Examples:
+
+To run these, you need to do the following:
+
+- Open a terminal window
+
+- CD to the app folder that contains this file (cli_redact.py)
+
+- Load the virtual environment using either conda or venv depending on your setup
+
+- Run one of the example commands below
+
+- Look in the output/ folder to see output files:
+
+# Redaction
+
+## Redact a PDF with default settings (local OCR):
+python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+
+## Extract text from a PDF only (i.e. no redaction), using local OCR:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None
+
+## Extract text from a PDF only (i.e. no redaction), using local OCR, with a whole page redaction list:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector Local --local_redact_entities CUSTOM
+
+## Redact a PDF with allow list (local OCR) and custom list of redaction entities:
+python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME
+
+## Redact a PDF with limited pages and text extraction method (local text) with custom fuzzy matching:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --ocr_method "Local text" --fuzzy_mistakes 3
+
+## Redaction with custom deny list, allow list, and whole page redaction list:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/partnership_toolkit_redact_custom_deny_list.csv --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --allow_list_file example_data/test_allow_list_partnership.csv
+
+## Redact an image:
+python cli_redact.py --input_file example_data/example_complaint_letter.jpg
+
+## Anonymise csv file with specific columns:
+python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted
+
+## Anonymise csv file with a different strategy (remove text completely):
+python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy redact
+
+## Anonymise Excel file, remove text completely:
+python cli_redact.py --input_file example_data/combined_case_notes.xlsx --text_columns "Case Note" "Client" --excel_sheets combined_case_notes --anon_strategy redact
+
+## Anonymise a word document:
+python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted
+
+# Redaction with AWS services:
+
+## Use Textract and Comprehend:
+python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend"
+
+# LLM PII identification (entity subset and custom instructions)
+
+## Redact with LLM PII entity subset (NAME, EMAIL_ADDRESS, etc.) and custom instructions:
+python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --llm_redact_entities NAME EMAIL_ADDRESS PHONE_NUMBER ADDRESS CUSTOM --custom_llm_instructions "Do not redact the name of the university."
+
+## Redact with custom LLM instructions only (use default LLM entities from config):
+python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --custom_llm_instructions "Redact all company names with the label COMPANY_NAME."
+
+## Redact specific pages with AWS OCR and signature extraction:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
+
+## Redact with AWS OCR and additional layout extraction options:
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_layout
+
+# Duplicate page detection
+
+## Find duplicate pages in OCR files:
+python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
+
+## Find duplicates in OCR files at the line level:
+python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3
+
+## Find duplicate rows in tabular data:
+python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95
+
+# AWS Textract whole document analysis
+
+## Submit document to Textract for basic text analysis:
+python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+
+## Submit document to Textract for analysis with signature extraction (Job ID will be printed to the console, you need this to retrieve the results):
+python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures
+
+## Retrieve Textract results by job ID (returns a .json file output):
+python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012
+
+## List recent Textract jobs:
+python cli_redact.py --task textract --textract_action list
+
+# Document summarisation
+
+## Summarise a PDF directly with AWS Bedrock:
+python cli_redact.py --task summarise --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --summarisation_inference_method "LLM (AWS Bedrock)"
+
+## Summarise document(s) from OCR output CSV(s) using AWS Bedrock:
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "LLM (AWS Bedrock)"
+
+## Summarise with local LLM and detailed format:
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "Local transformers LLM" --summarisation_format detailed
+
+## Summarise with additional context and instructions (concise format):
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_context "This is a partnership agreement" --summarisation_additional_instructions "Focus on key obligations and termination clauses" --summarisation_format concise
+
+## Summarise multiple OCR CSV files:
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv --summarisation_inference_method "LLM (AWS Bedrock)"
+
+# Combine review PDFs
+
+## Merge redaction comments from multiple '_redactions_for_review' PDFs into one file:
+python cli_redact.py --task combine_review_pdfs --input_file path/to/review1.pdf path/to/review2.pdf --output_dir output/
+
+""",
+ )
+
+ # --- Task Selection ---
+ task_group = parser.add_argument_group("Task Selection")
+ task_group.add_argument(
+ "--task",
+ choices=[
+ "redact",
+ "deduplicate",
+ "textract",
+ "summarise",
+ "combine_review_pdfs",
+ ],
+ default="redact",
+ help="Task to perform: redact (PII redaction/anonymisation), deduplicate (find duplicate content), textract (AWS Textract batch operations), summarise (LLM-based document summarisation from OCR CSV files), or combine_review_pdfs (merge redaction comments from multiple '_redactions_for_review' PDFs into one file).",
+ )
+
+ # --- General Arguments (apply to all file types) ---
+ general_group = parser.add_argument_group("General Options")
+ general_group.add_argument(
+ "--input_file",
+ nargs="+",
+ help="Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.",
+ )
+ general_group.add_argument(
+ "--output_dir", default=OUTPUT_FOLDER, help="Directory for all output files."
+ )
+ general_group.add_argument(
+ "--input_dir", default=INPUT_FOLDER, help="Directory for all input files."
+ )
+ general_group.add_argument(
+ "--language", default=DEFAULT_LANGUAGE, help="Language of the document content."
+ )
+ general_group.add_argument(
+ "--allow_list",
+ default=ALLOW_LIST_PATH,
+ help="Path to a CSV file with words to exclude from redaction.",
+ )
+ general_group.add_argument(
+ "--pii_detector",
+ choices=[LOCAL_PII_OPTION, AWS_PII_OPTION, "None"],
+ default=LOCAL_PII_OPTION,
+ help="Core PII detection method (Local or AWS Comprehend, or None).",
+ )
+ general_group.add_argument(
+ "--username", default=DIRECT_MODE_DEFAULT_USER, help="Username for the session."
+ )
+ general_group.add_argument(
+ "--save_to_user_folders",
+ default=SESSION_OUTPUT_FOLDER,
+ help="Whether to save to user folders or not.",
+ )
+
+ general_group.add_argument(
+ "--local_redact_entities",
+ nargs="+",
+ choices=full_entity_list,
+ default=chosen_redact_entities,
+ help=f"Local redaction entities to use. Default: {chosen_redact_entities}. Full list: {full_entity_list}.",
+ )
+
+ general_group.add_argument(
+ "--aws_redact_entities",
+ nargs="+",
+ choices=full_comprehend_entity_list,
+ default=chosen_comprehend_entities,
+ help=f"AWS redaction entities to use. Default: {chosen_comprehend_entities}. Full list: {full_comprehend_entity_list}.",
+ )
+
+ general_group.add_argument(
+ "--aws_access_key", default=AWS_ACCESS_KEY, help="Your AWS Access Key ID."
+ )
+ general_group.add_argument(
+ "--aws_secret_key", default=AWS_SECRET_KEY, help="Your AWS Secret Access Key."
+ )
+ general_group.add_argument(
+ "--cost_code", default=DEFAULT_COST_CODE, help="Cost code for tracking usage."
+ )
+ general_group.add_argument(
+ "--aws_region", default=AWS_REGION, help="AWS region for cloud services."
+ )
+ general_group.add_argument(
+ "--s3_bucket",
+ default=DOCUMENT_REDACTION_BUCKET,
+ help="S3 bucket name for cloud operations.",
+ )
+ general_group.add_argument(
+ "--save_outputs_to_s3",
+ default=SAVE_OUTPUTS_TO_S3,
+ help="Upload output files (redacted PDFs, anonymized documents, etc.) to S3 after processing.",
+ )
+ general_group.add_argument(
+ "--s3_outputs_folder",
+ default=S3_OUTPUTS_FOLDER,
+ help="S3 folder (key prefix) for saving output files. If left blank, outputs will not be uploaded even if --save_outputs_to_s3 is enabled.",
+ )
+ general_group.add_argument(
+ "--s3_outputs_bucket",
+ default=S3_OUTPUTS_BUCKET,
+ help="S3 bucket name for output files (defaults to --s3_bucket if not specified).",
+ )
+ general_group.add_argument(
+ "--do_initial_clean",
+ default=DO_INITIAL_TABULAR_DATA_CLEAN,
+ help="Perform initial text cleaning for tabular data.",
+ )
+ general_group.add_argument(
+ "--save_logs_to_csv",
+ default=SAVE_LOGS_TO_CSV,
+ help="Save processing logs to CSV files.",
+ )
+ general_group.add_argument(
+ "--save_logs_to_dynamodb",
+ default=SAVE_LOGS_TO_DYNAMODB,
+ help="Save processing logs to DynamoDB.",
+ )
+ general_group.add_argument(
+ "--display_file_names_in_logs",
+ default=DISPLAY_FILE_NAMES_IN_LOGS,
+ help="Include file names in log outputs.",
+ )
+ general_group.add_argument(
+ "--upload_logs_to_s3",
+ default=RUN_AWS_FUNCTIONS,
+ help="Upload log files to S3 after processing.",
+ )
+ general_group.add_argument(
+ "--s3_logs_prefix",
+ default=S3_USAGE_LOGS_FOLDER,
+ help="S3 prefix for usage log files.",
+ )
+ general_group.add_argument(
+ "--feedback_logs_folder",
+ default=FEEDBACK_LOGS_FOLDER,
+ help="Directory for feedback log files.",
+ )
+ general_group.add_argument(
+ "--access_logs_folder",
+ default=ACCESS_LOGS_FOLDER,
+ help="Directory for access log files.",
+ )
+ general_group.add_argument(
+ "--usage_logs_folder",
+ default=USAGE_LOGS_FOLDER,
+ help="Directory for usage log files.",
+ )
+ general_group.add_argument(
+ "--paddle_model_path",
+ default=PADDLE_MODEL_PATH,
+ help="Directory for PaddleOCR model storage.",
+ )
+ general_group.add_argument(
+ "--spacy_model_path",
+ default=SPACY_MODEL_PATH,
+ help="Directory for spaCy model storage.",
+ )
+
+ # --- PDF/Image Redaction Arguments ---
+ pdf_group = parser.add_argument_group(
+ "PDF/Image Redaction Options (.pdf, .png, .jpg)"
+ )
+ pdf_group.add_argument(
+ "--ocr_method",
+ choices=["AWS Textract", "Local OCR", "Local text"],
+ default="Local OCR",
+ help="OCR method for text extraction from images.",
+ )
+ pdf_group.add_argument(
+ "--page_min", type=int, default=0, help="First page to redact."
+ )
+ pdf_group.add_argument(
+ "--page_max", type=int, default=0, help="Last page to redact."
+ )
+ pdf_group.add_argument(
+ "--images_dpi",
+ type=float,
+ default=float(IMAGES_DPI),
+ help="DPI for image processing.",
+ )
+ pdf_group.add_argument(
+ "--chosen_local_ocr_model",
+ choices=LOCAL_OCR_MODEL_OPTIONS,
+ default=DEFAULT_LOCAL_OCR_MODEL,
+ help="Local OCR model to use.",
+ )
+ pdf_group.add_argument(
+ "--preprocess_local_ocr_images",
+ default=PREPROCESS_LOCAL_OCR_IMAGES,
+ help="Preprocess images before OCR.",
+ )
+ pdf_group.add_argument(
+ "--compress_redacted_pdf",
+ default=COMPRESS_REDACTED_PDF,
+ help="Compress the final redacted PDF.",
+ )
+ pdf_group.add_argument(
+ "--return_pdf_end_of_redaction",
+ default=RETURN_REDACTED_PDF,
+ help="Return PDF at end of redaction process.",
+ )
+ pdf_group.add_argument(
+ "--deny_list_file",
+ default=DENY_LIST_PATH,
+ help="Custom words file to recognize for redaction.",
+ )
+ pdf_group.add_argument(
+ "--allow_list_file",
+ default=ALLOW_LIST_PATH,
+ help="Custom words file to recognize for redaction.",
+ )
+ pdf_group.add_argument(
+ "--redact_whole_page_file",
+ default=WHOLE_PAGE_REDACTION_LIST_PATH,
+ help="File for pages to redact completely.",
+ )
+ pdf_group.add_argument(
+ "--handwrite_signature_extraction",
+ nargs="+",
+ default=default_handwrite_signature_checkbox,
+ help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".',
+ )
+ pdf_group.add_argument(
+ "--extract_forms",
+ action="store_true",
+ help="Extract forms during Textract analysis.",
+ )
+ pdf_group.add_argument(
+ "--extract_tables",
+ action="store_true",
+ help="Extract tables during Textract analysis.",
+ )
+ pdf_group.add_argument(
+ "--extract_layout",
+ action="store_true",
+ help="Extract layout during Textract analysis.",
+ )
+ pdf_group.add_argument(
+ "--vlm_model_choice",
+ default=CLOUD_VLM_MODEL_CHOICE,
+ help="VLM model choice for OCR (e.g., 'qwen.qwen3-vl-235b-a22b' for Bedrock, or model name for other providers).",
+ )
+ pdf_group.add_argument(
+ "--inference_server_vlm_model",
+ default=DEFAULT_INFERENCE_SERVER_VLM_MODEL,
+ help="Inference server VLM model name for OCR.",
+ )
+ pdf_group.add_argument(
+ "--inference_server_api_url",
+ default=INFERENCE_SERVER_API_URL,
+ help="Inference server API URL.",
+ )
+ pdf_group.add_argument(
+ "--gemini_api_key",
+ default=GEMINI_API_KEY,
+ help="Google Gemini API key for VLM OCR.",
+ )
+ pdf_group.add_argument(
+ "--azure_openai_api_key",
+ default=AZURE_OPENAI_API_KEY,
+ help="Azure OpenAI API key for VLM OCR.",
+ )
+ pdf_group.add_argument(
+ "--azure_openai_endpoint",
+ default=AZURE_OPENAI_INFERENCE_ENDPOINT,
+ help="Azure OpenAI endpoint URL for VLM OCR.",
+ )
+ pdf_group.add_argument(
+ "--efficient_ocr",
+ action="store_true",
+ default=None,
+ help="Use efficient OCR: try selectable text first per page, run OCR only when needed (saves time/cost). Defaults to EFFICIENT_OCR config.",
+ )
+ pdf_group.add_argument(
+ "--no_efficient_ocr",
+ action="store_false",
+ dest="efficient_ocr",
+ help="Disable efficient OCR (use selected OCR method for all pages).",
+ )
+ pdf_group.add_argument(
+ "--efficient_ocr_min_words",
+ type=int,
+ default=None,
+ metavar="N",
+ help="Minimum words on a page to use text-only route; below this use OCR. Defaults to EFFICIENT_OCR_MIN_WORDS config (e.g. 20).",
+ )
+ pdf_group.add_argument(
+ "--efficient_ocr_min_image_coverage_fraction",
+ type=float,
+ default=None,
+ metavar="F",
+ help="Efficient OCR: min fraction of page area (0-1) for an embedded image to force OCR; 0 disables. Defaults to EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION config (e.g. 0.03).",
+ )
+ pdf_group.add_argument(
+ "--ocr_first_pass_max_workers",
+ type=int,
+ default=None,
+ metavar="N",
+ help="Max threads for OCR first pass (1 = sequential). Defaults to OCR_FIRST_PASS_MAX_WORKERS config (e.g. 3).",
+ )
+ pdf_group.add_argument(
+ "--hybrid_textract_bedrock_vlm",
+ action="store_true",
+ default=None,
+ help="When using AWS Textract, re-run low-confidence lines with Bedrock VLM for higher quality. Defaults to HYBRID_TEXTRACT_BEDROCK_VLM config.",
+ )
+ pdf_group.add_argument(
+ "--no_hybrid_textract_bedrock_vlm",
+ action="store_false",
+ dest="hybrid_textract_bedrock_vlm",
+ help="Disable hybrid Textract + Bedrock VLM (use Textract only).",
+ )
+ pdf_group.add_argument(
+ "--overwrite_existing_ocr_results",
+ action="store_true",
+ default=None,
+ help="Ignore cached OCR JSON files and re-run OCR. Defaults to OVERWRITE_EXISTING_OCR_RESULTS config (e.g. False).",
+ )
+ pdf_group.add_argument(
+ "--no_overwrite_existing_ocr_results",
+ action="store_false",
+ dest="overwrite_existing_ocr_results",
+ help="Use existing OCR results when available (do not overwrite cached JSON).",
+ )
+ pdf_group.add_argument(
+ "--save_page_ocr_visualisations",
+ action="store_true",
+ default=None,
+ help="Save page OCR visualisations (debug bounding boxes). Defaults to SAVE_PAGE_OCR_VISUALISATIONS config.",
+ )
+ pdf_group.add_argument(
+ "--no_save_page_ocr_visualisations",
+ action="store_false",
+ dest="save_page_ocr_visualisations",
+ help="Do not save page OCR visualisations (debug bounding boxes).",
+ )
+
+ # --- LLM PII Detection Arguments ---
+ llm_group = parser.add_argument_group("LLM PII Detection Options")
+ llm_group.add_argument(
+ "--llm_model_choice",
+ default=CLOUD_LLM_PII_MODEL_CHOICE,
+ help="LLM model choice for PII detection. Defaults to CLOUD_LLM_PII_MODEL_CHOICE for Bedrock. "
+ "Note: The actual model used is determined by pii_identification_method - "
+ "CLOUD_LLM_PII_MODEL_CHOICE for Bedrock, INFERENCE_SERVER_LLM_PII_MODEL_CHOICE for inference server, "
+ "LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE for local transformers.",
+ )
+ llm_group.add_argument(
+ "--llm_inference_method",
+ choices=LLM_PII_INFERENCE_METHODS,
+ default=CHOSEN_LLM_PII_INFERENCE_METHOD,
+ help="LLM inference method for PII detection: aws-bedrock, local, inference-server, azure-openai, or gemini.",
+ )
+ llm_group.add_argument(
+ "--inference_server_pii_model",
+ default=DEFAULT_INFERENCE_SERVER_PII_MODEL,
+ help="Inference server PII detection model name.",
+ )
+ llm_group.add_argument(
+ "--llm_temperature",
+ type=float,
+ default=LLM_TEMPERATURE,
+ help="Temperature for LLM PII detection (lower = more deterministic).",
+ )
+ llm_group.add_argument(
+ "--llm_max_tokens",
+ type=int,
+ default=LLM_MAX_NEW_TOKENS,
+ help="Maximum tokens in LLM response for PII detection.",
+ )
+ llm_group.add_argument(
+ "--llm_redact_entities",
+ nargs="+",
+ choices=full_llm_entity_list,
+ default=chosen_llm_entities,
+ help=f"Subset of entities for LLM PII detection (when pii_detector uses an LLM). Default: {chosen_llm_entities}. Full list: {full_llm_entity_list}.",
+ )
+ llm_group.add_argument(
+ "--custom_llm_instructions",
+ default="",
+ help="Custom instructions for LLM-based entity detection (e.g. 'don't redact anything related to Mark Wilson' or 'redact all company names with the label COMPANY_NAME').",
+ )
+
+ # --- Word/Tabular Anonymisation Arguments ---
+ tabular_group = parser.add_argument_group(
+ "Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)"
+ )
+ tabular_group.add_argument(
+ "--anon_strategy",
+ choices=[
+ "redact",
+ "redact completely",
+ "replace_redacted",
+ "entity_type",
+ "encrypt",
+ "hash",
+ "replace with 'REDACTED'",
+ "replace with ",
+ "mask",
+ "fake_first_name",
+ ],
+ default=DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
+ help="The anonymisation strategy to apply.",
+ )
+ tabular_group.add_argument(
+ "--text_columns",
+ nargs="+",
+ default=list(),
+ help="A list of column names to anonymise or deduplicate in tabular data.",
+ )
+ tabular_group.add_argument(
+ "--excel_sheets",
+ nargs="+",
+ default=list(),
+ help="Specific Excel sheet names to process.",
+ )
+ tabular_group.add_argument(
+ "--fuzzy_mistakes",
+ type=int,
+ default=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
+ help="Number of allowed spelling mistakes for fuzzy matching.",
+ )
+ tabular_group.add_argument(
+ "--match_fuzzy_whole_phrase_bool",
+ default=True,
+ help="Match fuzzy whole phrase boolean.",
+ )
+ # --- Duplicate Detection Arguments ---
+ duplicate_group = parser.add_argument_group("Duplicate Detection Options")
+ duplicate_group.add_argument(
+ "--duplicate_type",
+ choices=["pages", "tabular"],
+ default="pages",
+ help="Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).",
+ )
+ duplicate_group.add_argument(
+ "--similarity_threshold",
+ type=float,
+ default=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
+ help="Similarity threshold (0-1) to consider content as duplicates.",
+ )
+ duplicate_group.add_argument(
+ "--min_word_count",
+ type=int,
+ default=DEFAULT_MIN_WORD_COUNT,
+ help="Minimum word count for text to be considered in duplicate analysis.",
+ )
+ duplicate_group.add_argument(
+ "--min_consecutive_pages",
+ type=int,
+ default=DEFAULT_MIN_CONSECUTIVE_PAGES,
+ help="Minimum number of consecutive pages to consider as a match.",
+ )
+ duplicate_group.add_argument(
+ "--greedy_match",
+ default=USE_GREEDY_DUPLICATE_DETECTION,
+ help="Use greedy matching strategy for consecutive pages.",
+ )
+ duplicate_group.add_argument(
+ "--combine_pages",
+ default=DEFAULT_COMBINE_PAGES,
+ help="Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.",
+ )
+ duplicate_group.add_argument(
+ "--remove_duplicate_rows",
+ default=REMOVE_DUPLICATE_ROWS,
+ help="Remove duplicate rows from the output.",
+ )
+
+ # --- Document Summarisation Arguments ---
+ summarisation_group = parser.add_argument_group("Document Summarisation Options")
+ summarisation_group.add_argument(
+ "--summarisation_inference_method",
+ choices=[
+ AWS_LLM_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ INFERENCE_SERVER_PII_OPTION,
+ ],
+ default=AWS_LLM_PII_OPTION,
+ help="LLM inference method for summarisation (same options as GUI).",
+ )
+ summarisation_group.add_argument(
+ "--summarisation_temperature",
+ type=float,
+ default=0.6,
+ help="Temperature for summarisation (0.0-2.0). Lower is more deterministic.",
+ )
+ summarisation_group.add_argument(
+ "--summarisation_max_pages_per_group",
+ type=int,
+ default=30,
+ help="Maximum pages per page-group summary (in addition to context-length limits).",
+ )
+ summarisation_group.add_argument(
+ "--summary_page_group_max_workers",
+ type=int,
+ default=SUMMARY_PAGE_GROUP_MAX_WORKERS,
+ metavar="N",
+ help="Max threads for page-group summarisation (1 = sequential). Defaults to SUMMARY_PAGE_GROUP_MAX_WORKERS config (e.g. 1).",
+ )
+ summarisation_group.add_argument(
+ "--summarisation_api_key",
+ default="",
+ help="API key for summarisation (if required by the chosen LLM).",
+ )
+ summarisation_group.add_argument(
+ "--summarisation_context",
+ default="",
+ help="Additional context for summarisation (e.g. 'This is a consultation response document').",
+ )
+ summarisation_group.add_argument(
+ "--summarisation_format",
+ choices=["concise", "detailed"],
+ default="detailed",
+ help="Summary format: concise (key themes only) or detailed (as much detail as possible).",
+ )
+ summarisation_group.add_argument(
+ "--summarisation_additional_instructions",
+ default="",
+ help="Additional summary instructions (e.g. 'Focus on key decisions and recommendations').",
+ )
+
+ # --- Textract Batch Operations Arguments ---
+ textract_group = parser.add_argument_group("Textract Batch Operations Options")
+ textract_group.add_argument(
+ "--textract_action",
+ choices=["submit", "retrieve", "list"],
+ help="Textract action to perform: submit (submit document for analysis), retrieve (get results by job ID), or list (show recent jobs).",
+ )
+ textract_group.add_argument("--job_id", help="Textract job ID for retrieve action.")
+ textract_group.add_argument(
+ "--extract_signatures",
+ action="store_true",
+ help="Extract signatures during Textract analysis (for submit action).",
+ )
+ textract_group.add_argument(
+ "--textract_bucket",
+ default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
+ help="S3 bucket name for Textract operations (overrides default).",
+ )
+ textract_group.add_argument(
+ "--textract_input_prefix",
+ default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
+ help="S3 prefix for input files in Textract operations.",
+ )
+ textract_group.add_argument(
+ "--textract_output_prefix",
+ default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
+ help="S3 prefix for output files in Textract operations.",
+ )
+ textract_group.add_argument(
+ "--s3_textract_document_logs_subfolder",
+ default=TEXTRACT_JOBS_S3_LOC,
+ help="S3 prefix for logs in Textract operations.",
+ )
+ textract_group.add_argument(
+ "--local_textract_document_logs_subfolder",
+ default=TEXTRACT_JOBS_LOCAL_LOC,
+ help="Local prefix for logs in Textract operations.",
+ )
+ textract_group.add_argument(
+ "--poll_interval",
+ type=int,
+ default=30,
+ help="Polling interval in seconds for Textract job status.",
+ )
+ textract_group.add_argument(
+ "--max_poll_attempts",
+ type=int,
+ default=120,
+ help="Maximum number of polling attempts for Textract job completion.",
+ )
+ # Parse arguments - either from command line or direct mode
+ if direct_mode_args:
+ # Use direct mode arguments
+ args = argparse.Namespace(**direct_mode_args)
+ else:
+ # Parse command line arguments
+ args = parser.parse_args()
+
+ # --- Handle S3 file downloads ---
+ # Download input files from S3 if needed
+ # Note: args.input_file is typically a list (from CLI nargs="+" or from direct mode)
+ # but we also handle pipe-separated strings for compatibility
+ if args.input_file:
+ if isinstance(args.input_file, list):
+ # Handle list of files (may include S3 paths)
+ downloaded_files = []
+ for file_path in args.input_file:
+ downloaded_path = _download_s3_file_if_needed(file_path)
+ downloaded_files.append(downloaded_path)
+ args.input_file = downloaded_files
+ elif isinstance(args.input_file, str):
+ # Handle pipe-separated string (for direct mode compatibility)
+ if "|" in args.input_file:
+ file_list = [f.strip() for f in args.input_file.split("|") if f.strip()]
+ downloaded_files = []
+ for file_path in file_list:
+ downloaded_path = _download_s3_file_if_needed(file_path)
+ downloaded_files.append(downloaded_path)
+ args.input_file = downloaded_files
+ else:
+ # Single file path
+ args.input_file = [_download_s3_file_if_needed(args.input_file)]
+
+ # Download other file arguments from S3 if needed
+ if args.deny_list_file:
+ args.deny_list_file = _download_s3_file_if_needed(
+ args.deny_list_file, default_filename="downloaded_deny_list.csv"
+ )
+ if args.allow_list_file:
+ args.allow_list_file = _download_s3_file_if_needed(
+ args.allow_list_file, default_filename="downloaded_allow_list.csv"
+ )
+ if args.redact_whole_page_file:
+ args.redact_whole_page_file = _download_s3_file_if_needed(
+ args.redact_whole_page_file,
+ default_filename="downloaded_redact_whole_page.csv",
+ )
+
+ # --- Initial Setup ---
+ # Convert string/boolean config values to real booleans
+ # (convert_string_to_boolean handles both "True"/"False" strings and booleans)
+ args.preprocess_local_ocr_images = convert_string_to_boolean(args.preprocess_local_ocr_images)
+ args.greedy_match = convert_string_to_boolean(args.greedy_match)
+ args.combine_pages = convert_string_to_boolean(args.combine_pages)
+ args.remove_duplicate_rows = convert_string_to_boolean(args.remove_duplicate_rows)
+ args.return_pdf_end_of_redaction = convert_string_to_boolean(args.return_pdf_end_of_redaction)
+ args.compress_redacted_pdf = convert_string_to_boolean(args.compress_redacted_pdf)
+ args.do_initial_clean = convert_string_to_boolean(args.do_initial_clean)
+ args.save_logs_to_csv = convert_string_to_boolean(args.save_logs_to_csv)
+ args.save_logs_to_dynamodb = convert_string_to_boolean(args.save_logs_to_dynamodb)
+ args.display_file_names_in_logs = convert_string_to_boolean(args.display_file_names_in_logs)
+ args.match_fuzzy_whole_phrase_bool = convert_string_to_boolean(args.match_fuzzy_whole_phrase_bool)
+ # Convert save_to_user_folders to boolean (handles both string and boolean values)
+ args.save_to_user_folders = convert_string_to_boolean(args.save_to_user_folders)
+ # Convert save_outputs_to_s3 to boolean (handles both string and boolean values)
+ args.save_outputs_to_s3 = convert_string_to_boolean(args.save_outputs_to_s3)
+
+ # Combine extraction options
+ extraction_options = (
+ list(args.handwrite_signature_extraction)
+ if args.handwrite_signature_extraction
+ else []
+ )
+ if args.extract_forms:
+ extraction_options.append("Extract forms")
+ if args.extract_tables:
+ extraction_options.append("Extract tables")
+ if args.extract_layout:
+ extraction_options.append("Extract layout")
+ args.handwrite_signature_extraction = extraction_options
+
+ if args.task in ["redact", "deduplicate", "summarise", "combine_review_pdfs"]:
+ if args.input_file:
+ if isinstance(args.input_file, str):
+ args.input_file = [args.input_file]
+
+ _, file_extension = os.path.splitext(args.input_file[0])
+ file_extension = file_extension.lower()
+ else:
+ raise ValueError(f"Error: --input_file is required for '{args.task}' task.")
+
+ # Initialise usage logger if logging is enabled
+ usage_logger = None
+ if args.save_logs_to_csv or args.save_logs_to_dynamodb:
+ from tools.cli_usage_logger import create_cli_usage_logger
+
+ try:
+ usage_logger = create_cli_usage_logger(logs_folder=args.usage_logs_folder)
+ except Exception as e:
+ print(f"Warning: Could not initialise usage logger: {e}")
+
+ # Get username and folders
+ (
+ session_hash,
+ args.output_dir,
+ _,
+ args.input_dir,
+ args.textract_input_prefix,
+ args.textract_output_prefix,
+ args.s3_textract_document_logs_subfolder,
+ args.local_textract_document_logs_subfolder,
+ ) = get_username_and_folders(
+ username=args.username,
+ output_folder_textbox=args.output_dir,
+ input_folder_textbox=args.input_dir,
+ session_output_folder=args.save_to_user_folders,
+ textract_document_upload_input_folder=args.textract_input_prefix,
+ textract_document_upload_output_folder=args.textract_output_prefix,
+ s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder,
+ local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder,
+ )
+
+ print(
+ f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}."
+ )
+
+ # Build S3 output folder path if S3 uploads are enabled
+ s3_output_folder = ""
+ if args.save_outputs_to_s3 and args.s3_outputs_folder:
+ s3_output_folder = _build_s3_output_folder(
+ s3_outputs_folder=args.s3_outputs_folder,
+ session_hash=session_hash,
+ save_to_user_folders=args.save_to_user_folders,
+ )
+ if s3_output_folder:
+ print(f"S3 output folder: s3://{args.s3_outputs_bucket}/{s3_output_folder}")
+ elif args.save_outputs_to_s3 and not args.s3_outputs_folder:
+ print(
+ "Warning: --save_outputs_to_s3 is enabled but --s3_outputs_folder is not set. Outputs will not be uploaded to S3."
+ )
+
+ # --- Route to the Correct Workflow Based on Task and File Type ---
+
+ # Validate input_file requirement for tasks that need it
+ if (
+ args.task in ["redact", "deduplicate", "summarise", "combine_review_pdfs"]
+ and not args.input_file
+ ):
+ print(f"Error: --input_file is required for '{args.task}' task.")
+ return
+
+ args.prepare_images = args.ocr_method in ["Local OCR", "AWS Textract"]
+
+ from tools.cli_usage_logger import log_redaction_usage
+
+ # Task 1: Redaction/Anonymisation
+ if args.task == "redact":
+
+ # Workflow 1: PDF/Image Redaction
+ if file_extension in [".pdf", ".png", ".jpg", ".jpeg"]:
+ print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
+ start_time = time.time()
+ try:
+ from tools.file_conversion import prepare_image_or_pdf
+ from tools.file_redaction import choose_and_run_redactor
+
+ # Step 1: Prepare the document
+ print("\nStep 1: Preparing document...")
+ (
+ prep_summary,
+ prepared_pdf_paths,
+ image_file_paths,
+ _,
+ _,
+ pdf_doc,
+ image_annotations,
+ _,
+ original_cropboxes,
+ page_sizes,
+ _,
+ _,
+ _,
+ _,
+ _,
+ ) = prepare_image_or_pdf(
+ file_paths=args.input_file,
+ text_extract_method=args.ocr_method,
+ all_line_level_ocr_results_df=pd.DataFrame(),
+ all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
+ first_loop_state=True,
+ prepare_for_review=False,
+ output_folder=args.output_dir,
+ input_folder=args.input_dir,
+ prepare_images=args.prepare_images,
+ page_min=args.page_min,
+ page_max=args.page_max,
+ )
+ print(f"Preparation complete. {prep_summary}")
+
+ # Note: VLM and LLM clients are initialized inside choose_and_run_redactor
+ # based on text_extraction_method and pii_identification_method.
+ # Model choices (vlm_model_choice, llm_model_choice) can be overridden via
+ # environment variables (CLOUD_VLM_MODEL_CHOICE, CLOUD_LLM_PII_MODEL_CHOICE) before running the CLI.
+ # For CLI, we pass inference_server_vlm_model and custom_llm_instructions.
+ # Other LLM parameters (temperature, max_tokens, inference_method) are set via
+ # environment variables or config defaults.
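+ # For example (an illustrative shell snippet; the model IDs shown are taken from example_app_config.env):
+ #   export CLOUD_VLM_MODEL_CHOICE=amazon.nova-pro-v1:0
+ #   export CLOUD_LLM_PII_MODEL_CHOICE=amazon.nova-pro-v1:0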
+
+ # Step 2: Redact the prepared document
+ print("\nStep 2: Running redaction...")
+ (
+ output_summary,
+ output_files,
+ _,
+ _,
+ log_files,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ comprehend_query_number,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ page_sizes,
+ _,
+ _,
+ _,
+ _,
+ total_textract_query_number,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ vlm_model_name,
+ vlm_total_input_tokens,
+ vlm_total_output_tokens,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ _,
+ ) = choose_and_run_redactor(
+ file_paths=args.input_file,
+ prepared_pdf_file_paths=prepared_pdf_paths,
+ pdf_image_file_paths=image_file_paths,
+ chosen_redact_entities=args.local_redact_entities,
+ chosen_redact_comprehend_entities=args.aws_redact_entities,
+ chosen_llm_entities=args.llm_redact_entities,
+ text_extraction_method=args.ocr_method,
+ in_allow_list=args.allow_list_file,
+ in_deny_list=args.deny_list_file,
+ redact_whole_page_list=args.redact_whole_page_file,
+ first_loop_state=True,
+ page_min=args.page_min,
+ page_max=args.page_max,
+ handwrite_signature_checkbox=args.handwrite_signature_extraction,
+ max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
+ match_fuzzy_whole_phrase_bool=args.match_fuzzy_whole_phrase_bool,
+ pymupdf_doc=pdf_doc,
+ annotations_all_pages=image_annotations,
+ page_sizes=page_sizes,
+ document_cropboxes=original_cropboxes,
+ pii_identification_method=args.pii_detector,
+ aws_access_key_textbox=args.aws_access_key,
+ aws_secret_key_textbox=args.aws_secret_key,
+ language=args.language,
+ output_folder=args.output_dir,
+ input_folder=args.input_dir,
+ custom_llm_instructions=args.custom_llm_instructions,
+ inference_server_vlm_model=(
+ args.inference_server_vlm_model
+ if args.inference_server_vlm_model
+ else DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ ),
+ efficient_ocr=getattr(args, "efficient_ocr", EFFICIENT_OCR),
+ efficient_ocr_min_words=(
+ args.efficient_ocr_min_words
+ if getattr(args, "efficient_ocr_min_words", None) is not None
+ else EFFICIENT_OCR_MIN_WORDS
+ ),
+ efficient_ocr_min_image_coverage_fraction=(
+ args.efficient_ocr_min_image_coverage_fraction
+ if getattr(
+ args, "efficient_ocr_min_image_coverage_fraction", None
+ )
+ is not None
+ else EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION
+ ),
+ ocr_first_pass_max_workers=(
+ args.ocr_first_pass_max_workers
+ if getattr(args, "ocr_first_pass_max_workers", None) is not None
+ else OCR_FIRST_PASS_MAX_WORKERS
+ ),
+ hybrid_textract_bedrock_vlm=getattr(
+ args, "hybrid_textract_bedrock_vlm", HYBRID_TEXTRACT_BEDROCK_VLM
+ ),
+ overwrite_existing_ocr_results=getattr(
+ args,
+ "overwrite_existing_ocr_results",
+ OVERWRITE_EXISTING_OCR_RESULTS,
+ ),
+ save_page_ocr_visualisations=(
+ getattr(args, "save_page_ocr_visualisations", None)
+ if getattr(args, "save_page_ocr_visualisations", None)
+ is not None
+ else SAVE_PAGE_OCR_VISUALISATIONS
+ ),
+ # Note: bedrock_runtime, gemini_client, gemini_config and azure_openai_client
+ # are initialised inside choose_and_run_redactor based on the chosen text
+ # extraction and PII identification methods, so they are not passed in here.
+ )
+
+ # Calculate processing time
+ end_time = time.time()
+ processing_time = end_time - start_time
+
+ # Log usage data if logger is available
+ if usage_logger:
+ try:
+ # Extract file name for logging
+ print("Saving logs to CSV")
+ doc_file_name = (
+ os.path.basename(args.input_file[0])
+ if args.display_file_names_in_logs
+ else "document"
+ )
+ data_file_name = "" # Not applicable for PDF/image redaction
+
+ # Determine if this was a Textract API call
+ is_textract_call = args.ocr_method == "AWS Textract"
+
+ # Count pages (approximate from page_sizes if available)
+ total_pages = len(page_sizes) if page_sizes else 1
+
+ # Count API calls (approximate - would need to be tracked in the redaction function)
+ textract_queries = (
+ int(total_textract_query_number) if is_textract_call else 0
+ )
+ comprehend_queries = (
+ int(comprehend_query_number)
+ if args.pii_detector == "AWS Comprehend"
+ else 0
+ )
+
+ # Format handwriting/signature options
+ handwriting_signature = (
+ ", ".join(args.handwrite_signature_extraction)
+ if args.handwrite_signature_extraction
+ else ""
+ )
+
+ log_redaction_usage(
+ logger=usage_logger,
+ session_hash=session_hash,
+ doc_file_name=doc_file_name,
+ data_file_name=data_file_name,
+ time_taken=processing_time,
+ total_pages=total_pages,
+ textract_queries=textract_queries,
+ pii_method=args.pii_detector,
+ comprehend_queries=comprehend_queries,
+ cost_code=args.cost_code,
+ handwriting_signature=handwriting_signature,
+ text_extraction_method=args.ocr_method,
+ is_textract_call=is_textract_call,
+ task=args.task,
+ save_to_dynamodb=args.save_logs_to_dynamodb,
+ save_to_s3=args.upload_logs_to_s3,
+ s3_bucket=args.s3_bucket,
+ s3_key_prefix=args.s3_logs_prefix,
+ vlm_model_name=vlm_model_name,
+ vlm_total_input_tokens=vlm_total_input_tokens,
+ vlm_total_output_tokens=vlm_total_output_tokens,
+ llm_model_name=llm_model_name,
+ llm_total_input_tokens=llm_total_input_tokens,
+ llm_total_output_tokens=llm_total_output_tokens,
+ )
+ except Exception as e:
+ print(f"Warning: Could not log usage data: {e}")
+
+ print("\n--- Redaction Process Complete ---")
+ print(f"Summary: {output_summary}")
+ print(f"Processing time: {processing_time:.2f} seconds")
+ print(f"\nOutput files saved to: {args.output_dir}")
+ print("Generated Files:", sorted(output_files))
+ if log_files:
+ print("Log Files:", sorted(log_files))
+
+ # Upload output files to S3 if enabled
+ if args.save_outputs_to_s3 and s3_output_folder and output_files:
+ print("\n--- Uploading output files to S3 ---")
+ try:
+ export_outputs_to_s3(
+ file_list_state=output_files,
+ s3_output_folder_state_value=s3_output_folder,
+ save_outputs_to_s3_flag=args.save_outputs_to_s3,
+ base_file_state=(
+ args.input_file[0] if args.input_file else None
+ ),
+ s3_bucket=args.s3_outputs_bucket,
+ )
+ except Exception as e:
+ print(f"Warning: Could not upload output files to S3: {e}")
+
+ except Exception as e:
+ print(
+ f"\nAn error occurred during the PDF/Image redaction workflow: {e}"
+ )
+
+ # Workflow 2: Word/Tabular Data Anonymisation
+ elif file_extension in [".docx", ".xlsx", ".xls", ".csv", ".parquet"]:
+ print(
+ "--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---"
+ )
+ start_time = time.time()
+ try:
+ from tools.data_anonymise import anonymise_files_with_open_text
+
+ # Note: anonymise_files_with_open_text initializes LLM clients internally
+ # based on pii_identification_method. LLM model choices and parameters
+ # can be set via environment variables (CLOUD_LLM_PII_MODEL_CHOICE, LLM_TEMPERATURE, etc.)
+ # before running the CLI.
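+ # For example (illustrative only; the temperature value below is an assumption):
+ #   export CLOUD_LLM_PII_MODEL_CHOICE=amazon.nova-pro-v1:0
+ #   export LLM_TEMPERATURE=0.7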
+
+ # Run the anonymisation function directly
+ (
+ output_summary,
+ output_files,
+ _,
+ _,
+ log_files,
+ _,
+ processing_time,
+ comprehend_query_number,
+ _,
+ _,
+ _,
+ ) = anonymise_files_with_open_text(
+ file_paths=args.input_file,
+ in_text="", # Not used for file-based operations
+ anon_strategy=args.anon_strategy,
+ chosen_cols=args.text_columns,
+ chosen_redact_entities=args.local_redact_entities,
+ in_allow_list=args.allow_list_file,
+ in_excel_sheets=args.excel_sheets,
+ first_loop_state=True,
+ output_folder=args.output_dir,
+ in_deny_list=args.deny_list_file,
+ max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
+ pii_identification_method=args.pii_detector,
+ chosen_redact_comprehend_entities=args.aws_redact_entities,
+ aws_access_key_textbox=args.aws_access_key,
+ aws_secret_key_textbox=args.aws_secret_key,
+ language=args.language,
+ do_initial_clean=args.do_initial_clean,
+ )
+
+ # Calculate processing time
+ end_time = time.time()
+ processing_time = end_time - start_time
+
+ # Log usage data if logger is available
+ if usage_logger:
+ try:
+ print("Saving logs to CSV")
+ # Extract file name for logging
+ doc_file_name = "" # Not applicable for tabular data
+ data_file_name = (
+ os.path.basename(args.input_file[0])
+ if args.display_file_names_in_logs
+ else "data_file"
+ )
+
+ # Determine if this was a Textract API call (not applicable for tabular)
+ is_textract_call = False
+
+ # Count pages (not applicable for tabular data)
+ total_pages = 0
+
+ # Count API calls (approximate - would need to be tracked in the anonymisation function)
+ textract_queries = 0 # Not applicable for tabular data
+ comprehend_queries = (
+ comprehend_query_number
+ if args.pii_detector == "AWS Comprehend"
+ else 0
+ )
+
+ # Format handwriting/signature options (not applicable for tabular)
+ handwriting_signature = ""
+
+ log_redaction_usage(
+ logger=usage_logger,
+ session_hash=session_hash,
+ doc_file_name=doc_file_name,
+ data_file_name=data_file_name,
+ time_taken=processing_time,
+ total_pages=total_pages,
+ textract_queries=textract_queries,
+ pii_method=args.pii_detector,
+ comprehend_queries=comprehend_queries,
+ cost_code=args.cost_code,
+ handwriting_signature=handwriting_signature,
+ text_extraction_method="tabular", # Indicate this is tabular processing
+ is_textract_call=is_textract_call,
+ task=args.task,
+ save_to_dynamodb=args.save_logs_to_dynamodb,
+ save_to_s3=args.upload_logs_to_s3,
+ s3_bucket=args.s3_bucket,
+ s3_key_prefix=args.s3_logs_prefix,
+ vlm_model_name="", # TODO: Track from perform_ocr
+ vlm_total_input_tokens=0, # TODO: Track from perform_ocr
+ vlm_total_output_tokens=0, # TODO: Track from perform_ocr
+ llm_model_name="", # TODO: Track from anonymise_script
+ llm_total_input_tokens=0, # TODO: Track from anonymise_script
+ llm_total_output_tokens=0, # TODO: Track from anonymise_script
+ )
+ except Exception as e:
+ print(f"Warning: Could not log usage data: {e}")
+
+ print("\n--- Anonymisation Process Complete ---")
+ print(f"Summary: {output_summary}")
+ print(f"Processing time: {processing_time:.2f} seconds")
+ print(f"\nOutput files saved to: {args.output_dir}")
+ print("Generated Files:", sorted(output_files))
+ if log_files:
+ print("Log Files:", sorted(log_files))
+
+ # Upload output files to S3 if enabled
+ if args.save_outputs_to_s3 and s3_output_folder and output_files:
+ print("\n--- Uploading output files to S3 ---")
+ try:
+ export_outputs_to_s3(
+ file_list_state=output_files,
+ s3_output_folder_state_value=s3_output_folder,
+ save_outputs_to_s3_flag=args.save_outputs_to_s3,
+ base_file_state=(
+ args.input_file[0] if args.input_file else None
+ ),
+ s3_bucket=args.s3_outputs_bucket,
+ )
+ except Exception as e:
+ print(f"Warning: Could not upload output files to S3: {e}")
+
+ except Exception as e:
+ print(
+ f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}"
+ )
+
+ else:
+ print(f"Error: Unsupported file type '{file_extension}' for redaction.")
+ print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
+ print(
+ "Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet"
+ )
+
+ # Task 2: Duplicate Detection
+ elif args.task == "deduplicate":
+ print("--- Starting Duplicate Detection Workflow... ---")
+ try:
+ from tools.find_duplicate_pages import run_duplicate_analysis
+
+ if args.duplicate_type == "pages":
+ # Page duplicate detection
+ if file_extension == ".csv":
+ print(
+ "--- Detected OCR CSV file. Starting Page Duplicate Detection... ---"
+ )
+
+ start_time = time.time()
+
+ if args.combine_pages is True:
+ print("Combining pages...")
+ else:
+ print("Using line-level duplicate detection...")
+
+ # Load the CSV file as a list for the duplicate analysis function
+ (
+ results_df,
+ output_paths,
+ full_data_by_file,
+ processing_time,
+ task_textbox,
+ _,
+ _,
+ _,
+ ) = run_duplicate_analysis(
+ files=args.input_file,
+ threshold=args.similarity_threshold,
+ min_words=args.min_word_count,
+ min_consecutive=args.min_consecutive_pages,
+ greedy_match=args.greedy_match,
+ combine_pages=args.combine_pages,
+ output_folder=args.output_dir,
+ all_page_line_level_ocr_results_df_base=pd.DataFrame(),
+ ocr_df_paths_list=[],
+ )
+
+ end_time = time.time()
+ processing_time = end_time - start_time
+
+ print("\n--- Page Duplicate Detection Complete ---")
+ print(f"Found {len(results_df)} duplicate matches")
+ print(f"\nOutput files saved to: {args.output_dir}")
+ if output_paths:
+ print("Generated Files:", sorted(output_paths))
+
+ # Upload output files to S3 if enabled
+ if args.save_outputs_to_s3 and s3_output_folder and output_paths:
+ print("\n--- Uploading output files to S3 ---")
+ try:
+ export_outputs_to_s3(
+ file_list_state=output_paths,
+ s3_output_folder_state_value=s3_output_folder,
+ save_outputs_to_s3_flag=args.save_outputs_to_s3,
+ base_file_state=(
+ args.input_file[0] if args.input_file else None
+ ),
+ s3_bucket=args.s3_outputs_bucket,
+ )
+ except Exception as e:
+ print(f"Warning: Could not upload output files to S3: {e}")
+
+ # Log usage for page deduplication (match app: doc name or "document", data blank)
+ if usage_logger:
+ try:
+ print("Saving logs to CSV")
+ doc_file_name = (
+ os.path.basename(args.input_file[0])
+ if args.display_file_names_in_logs and args.input_file
+ else "document"
+ )
+ data_file_name = "" # Not applicable for page dedup
+ log_redaction_usage(
+ logger=usage_logger,
+ session_hash=session_hash,
+ doc_file_name=doc_file_name,
+ data_file_name=data_file_name,
+ time_taken=processing_time,
+ total_pages=0,
+ textract_queries=0,
+ comprehend_queries=0,
+ pii_method=args.pii_detector,
+ cost_code=args.cost_code,
+ handwriting_signature="",
+ text_extraction_method=args.ocr_method,
+ is_textract_call=False,
+ task=args.task,
+ save_to_dynamodb=args.save_logs_to_dynamodb,
+ save_to_s3=args.upload_logs_to_s3,
+ s3_bucket=args.s3_bucket,
+ s3_key_prefix=args.s3_logs_prefix,
+ vlm_model_name="",
+ vlm_total_input_tokens=0,
+ vlm_total_output_tokens=0,
+ llm_model_name="",
+ llm_total_input_tokens=0,
+ llm_total_output_tokens=0,
+ )
+ except Exception as e:
+ print(f"Warning: Could not log usage data: {e}")
+
+ else:
+ print(
+ "Error: Page duplicate detection requires CSV files with OCR data."
+ )
+ print("Please provide a CSV file containing OCR output data.")
+
+ # Log usage data if logger is available
+ if usage_logger:
+ try:
+ # Extract file name for logging
+ print("Saving logs to CSV")
+ doc_file_name = (
+ os.path.basename(args.input_file[0])
+ if args.display_file_names_in_logs
+ else "document"
+ )
+ data_file_name = ""  # Not applicable for page duplicate detection
+
+ # No processing ran for this unsupported input, so page, query and
+ # timing counts are logged as zero
+ is_textract_call = False
+ total_pages = 0
+ textract_queries = 0
+ comprehend_queries = 0
+ handwriting_signature = ""
+
+ log_redaction_usage(
+ logger=usage_logger,
+ session_hash=session_hash,
+ doc_file_name=doc_file_name,
+ data_file_name=data_file_name,
+ time_taken=0,  # no processing was performed for this input
+ total_pages=total_pages,
+ textract_queries=textract_queries,
+ pii_method=args.pii_detector,
+ comprehend_queries=comprehend_queries,
+ cost_code=args.cost_code,
+ handwriting_signature=handwriting_signature,
+ text_extraction_method=args.ocr_method,
+ is_textract_call=is_textract_call,
+ task=args.task,
+ save_to_dynamodb=args.save_logs_to_dynamodb,
+ save_to_s3=args.upload_logs_to_s3,
+ s3_bucket=args.s3_bucket,
+ s3_key_prefix=args.s3_logs_prefix,
+ vlm_model_name="", # Not applicable for duplicate detection
+ vlm_total_input_tokens=0,
+ vlm_total_output_tokens=0,
+ llm_model_name="", # Not applicable for duplicate detection
+ llm_total_input_tokens=0,
+ llm_total_output_tokens=0,
+ )
+ except Exception as e:
+ print(f"Warning: Could not log usage data: {e}")
+
+ elif args.duplicate_type == "tabular":
+ # Tabular duplicate detection
+ from tools.find_duplicate_tabular import run_tabular_duplicate_detection
+
+ if file_extension in [".csv", ".xlsx", ".xls", ".parquet"]:
+ print(
+ "--- Detected tabular file. Starting Tabular Duplicate Detection... ---"
+ )
+
+ start_time = time.time()
+
+ (
+ results_df,
+ output_paths,
+ full_data_by_file,
+ processing_time,
+ task_textbox,
+ ) = run_tabular_duplicate_detection(
+ files=args.input_file,
+ threshold=args.similarity_threshold,
+ min_words=args.min_word_count,
+ text_columns=args.text_columns,
+ output_folder=args.output_dir,
+ do_initial_clean_dup=args.do_initial_clean,
+ in_excel_tabular_sheets=args.excel_sheets,
+ remove_duplicate_rows=args.remove_duplicate_rows,
+ )
+
+ end_time = time.time()
+ processing_time = end_time - start_time
+
+ # Log usage data if logger is available
+ if usage_logger:
+ try:
+ # Extract file name for logging
+ print("Saving logs to CSV")
+ doc_file_name = "" # Tabular dedup: no doc (match app)
+ data_file_name = (
+ os.path.basename(args.input_file[0])
+ if args.display_file_names_in_logs and args.input_file
+ else "data_file"
+ )
+
+ is_textract_call = False
+ total_pages = 0 # Tabular dedup: no page count (match app)
+ textract_queries = 0
+ comprehend_queries = 0
+ handwriting_signature = ""
+
+ log_redaction_usage(
+ logger=usage_logger,
+ session_hash=session_hash,
+ doc_file_name=doc_file_name,
+ data_file_name=data_file_name,
+ time_taken=processing_time,
+ total_pages=total_pages,
+ textract_queries=textract_queries,
+ pii_method=args.pii_detector,
+ comprehend_queries=comprehend_queries,
+ cost_code=args.cost_code,
+ handwriting_signature=handwriting_signature,
+ text_extraction_method=args.ocr_method,
+ is_textract_call=is_textract_call,
+ task=args.task,
+ save_to_dynamodb=args.save_logs_to_dynamodb,
+ save_to_s3=args.upload_logs_to_s3,
+ s3_bucket=args.s3_bucket,
+ s3_key_prefix=args.s3_logs_prefix,
+ vlm_model_name="", # Not applicable for duplicate detection
+ vlm_total_input_tokens=0,
+ vlm_total_output_tokens=0,
+ llm_model_name="", # Not applicable for duplicate detection
+ llm_total_input_tokens=0,
+ llm_total_output_tokens=0,
+ )
+ except Exception as e:
+ print(f"Warning: Could not log usage data: {e}")
+
+ print("\n--- Tabular Duplicate Detection Complete ---")
+ print(f"Found {len(results_df)} duplicate matches")
+ print(f"\nOutput files saved to: {args.output_dir}")
+ if output_paths:
+ print("Generated Files:", sorted(output_paths))
+
+ # Upload output files to S3 if enabled
+ if args.save_outputs_to_s3 and s3_output_folder and output_paths:
+ print("\n--- Uploading output files to S3 ---")
+ try:
+ export_outputs_to_s3(
+ file_list_state=output_paths,
+ s3_output_folder_state_value=s3_output_folder,
+ save_outputs_to_s3_flag=args.save_outputs_to_s3,
+ base_file_state=(
+ args.input_file[0] if args.input_file else None
+ ),
+ s3_bucket=args.s3_outputs_bucket,
+ )
+ except Exception as e:
+ print(f"Warning: Could not upload output files to S3: {e}")
+
+ else:
+ print(
+ "Error: Tabular duplicate detection requires CSV, Excel, or Parquet files."
+ )
+ print("Supported types: .csv, .xlsx, .xls, .parquet")
+ else:
+ print(f"Error: Invalid duplicate type '{args.duplicate_type}'.")
+ print("Valid options: 'pages' or 'tabular'")
+
+ except Exception as e:
+ print(f"\nAn error occurred during the duplicate detection workflow: {e}")
+
+ # Task 3: Textract Batch Operations
+ elif args.task == "textract":
+ print("--- Starting Textract Batch Operations Workflow... ---")
+
+ if not args.textract_action:
+ print("Error: --textract_action is required for textract task.")
+ print("Valid options: 'submit', 'retrieve', or 'list'")
+ return
+
+ try:
+ if args.textract_action == "submit":
+ from tools.textract_batch_call import (
+ analyse_document_with_textract_api,
+ load_in_textract_job_details,
+ )
+
+ # Submit document to Textract for analysis
+ if not args.input_file:
+ print("Error: --input_file is required for submit action.")
+ return
+
+ print(f"--- Submitting document to Textract: {args.input_file} ---")
+
+ start_time = time.time()
+
+ # Load existing job details
+ job_df = load_in_textract_job_details(
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
+ )
+
+ # Determine signature extraction options
+ signature_options = (
+ ["Extract handwriting", "Extract signatures"]
+ if args.extract_signatures
+ else ["Extract handwriting"]
+ )
+
+ # Use configured bucket or override
+ textract_bucket = args.textract_bucket if args.textract_bucket else ""
+
+ # Submit the job
+ (
+ result_message,
+ job_id,
+ job_type,
+ successful_job_number,
+ is_textract_call,
+ total_pages,
+ task_textbox,
+ ) = analyse_document_with_textract_api(
+ local_pdf_path=args.input_file,
+ s3_input_prefix=args.textract_input_prefix,
+ s3_output_prefix=args.textract_output_prefix,
+ job_df=job_df,
+ s3_bucket_name=textract_bucket,
+ general_s3_bucket_name=args.s3_bucket,
+ local_output_dir=args.output_dir,
+ handwrite_signature_checkbox=signature_options,
+ aws_region=args.aws_region,
+ )
+
+ end_time = time.time()
+ processing_time = end_time - start_time
+
+ print("\n--- Textract Job Submitted Successfully ---")
+ print(f"Job ID: {job_id}")
+ print(f"Job Type: {job_type}")
+ print(f"Message: {result_message}")
+ print(f"Results will be available in: {args.output_dir}")
+
+ # Log usage data if logger is available
+ if usage_logger:
+ try:
+ # Extract file name for logging
+ print("Saving logs to CSV")
+ doc_file_name = (
+ os.path.basename(args.input_file[0])
+ if args.display_file_names_in_logs
+ else "document"
+ )
+ data_file_name = ""
+
+ # Determine if this was a Textract API call
+ is_textract_call = True
+
+ # Count API calls (approximate - would need to be tracked in the redaction function)
+ textract_queries = total_pages
+ comprehend_queries = 0
+
+ # Format handwriting/signature options
+ handwriting_signature = ""
+
+ log_redaction_usage(
+ logger=usage_logger,
+ session_hash=session_hash,
+ doc_file_name=doc_file_name,
+ data_file_name=data_file_name,
+ time_taken=processing_time,
+ total_pages=total_pages,
+ textract_queries=textract_queries,
+ pii_method=args.pii_detector,
+ comprehend_queries=comprehend_queries,
+ cost_code=args.cost_code,
+ handwriting_signature=handwriting_signature,
+ text_extraction_method=args.ocr_method,
+ is_textract_call=is_textract_call,
+ task=args.task,
+ save_to_dynamodb=args.save_logs_to_dynamodb,
+ save_to_s3=args.upload_logs_to_s3,
+ s3_bucket=args.s3_bucket,
+ s3_key_prefix=args.s3_logs_prefix,
+ vlm_model_name="", # Not applicable for Textract submit
+ vlm_total_input_tokens=0,
+ vlm_total_output_tokens=0,
+ llm_model_name="", # Not applicable for Textract submit
+ llm_total_input_tokens=0,
+ llm_total_output_tokens=0,
+ )
+ except Exception as e:
+ print(f"Warning: Could not log usage data: {e}")
+
+ elif args.textract_action == "retrieve":
+ print(f"--- Retrieving Textract results for Job ID: {args.job_id} ---")
+
+ from tools.textract_batch_call import (
+ load_in_textract_job_details,
+ poll_whole_document_textract_analysis_progress_and_download,
+ )
+
+ # Retrieve results by job ID
+ if not args.job_id:
+ print("Error: --job_id is required for retrieve action.")
+ return
+
+ # Load existing job details to get job type
+ print("Loading existing job details...")
+ job_df = load_in_textract_job_details(
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
+ )
+
+ # Find job type from the dataframe
+ job_type = "document_text_detection" # default
+ if not job_df.empty and "job_id" in job_df.columns:
+ matching_jobs = job_df.loc[job_df["job_id"] == args.job_id]
+ if not matching_jobs.empty and "job_type" in matching_jobs.columns:
+ job_type = matching_jobs.iloc[0]["job_type"]
+
+ # Use configured bucket or override
+ textract_bucket = args.textract_bucket if args.textract_bucket else ""
+
+ # Poll for completion and download results
+ print("Polling for completion and downloading results...")
+ downloaded_file_path, job_status, updated_job_df, output_filename = (
+ poll_whole_document_textract_analysis_progress_and_download(
+ job_id=args.job_id,
+ job_type_dropdown=job_type,
+ s3_output_prefix=args.textract_output_prefix,
+ pdf_filename="", # Will be determined from job details
+ job_df=job_df,
+ s3_bucket_name=textract_bucket,
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
+ local_output_dir=args.output_dir,
+ poll_interval_seconds=args.poll_interval,
+ max_polling_attempts=args.max_poll_attempts,
+ )
+ )
+
+ print("\n--- Textract Results Retrieved Successfully ---")
+ print(f"Job Status: {job_status}")
+ print(f"Downloaded File: {downloaded_file_path}")
+ # print(f"Output Filename: {output_filename}")
+
+ elif args.textract_action == "list":
+ from tools.textract_batch_call import load_in_textract_job_details
+
+ # List recent Textract jobs
+ print("--- Listing Recent Textract Jobs ---")
+
+ job_df = load_in_textract_job_details(
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
+ )
+
+ if job_df.empty:
+ print("No recent Textract jobs found.")
+ else:
+ print(f"\nFound {len(job_df)} recent Textract jobs:")
+ print("-" * 80)
+ for _, job in job_df.iterrows():
+ print(f"Job ID: {job.get('job_id', 'N/A')}")
+ print(f"File: {job.get('file_name', 'N/A')}")
+ print(f"Type: {job.get('job_type', 'N/A')}")
+ print(f"Signatures: {job.get('signature_extraction', 'N/A')}")
+ print(f"Date: {job.get('job_date_time', 'N/A')}")
+ print("-" * 80)
+
+ else:
+ print(f"Error: Invalid textract_action '{args.textract_action}'.")
+ print("Valid options: 'submit', 'retrieve', or 'list'")
+
+ except Exception as e:
+ print(f"\nAn error occurred during the Textract workflow: {e}")
+
+ elif args.task == "summarise":
+ print("--- Document Summarisation ---")
+ try:
+ from tools.cli_usage_logger import log_redaction_usage
+ from tools.file_conversion import is_pdf
+ from tools.summaries import (
+ concise_summary_format_prompt,
+ detailed_summary_format_prompt,
+ load_csv_files_to_dataframe,
+ summarise_document_wrapper,
+ )
+
+ # Map format choice to prompt string (same as GUI)
+ format_map = {
+ "concise": concise_summary_format_prompt,
+ "detailed": detailed_summary_format_prompt,
+ }
+ summarise_format_radio = format_map.get(
+ args.summarisation_format, detailed_summary_format_prompt
+ )
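+ # e.g. passing "concise" (assumed to arrive via a --summarisation_format CLI flag) selects the
+ # concise prompt; any unrecognised value falls back to the detailed prompt via .get()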
+
+ # Normalise input to list of paths
+ input_paths = (
+ [args.input_file]
+ if isinstance(args.input_file, str)
+ else list(args.input_file or [])
+ )
+ input_paths = [p for p in input_paths if p and str(p).strip()]
+
+ # If any input is a PDF, extract text first then summarise (same as app.py)
+ summarise_from_pdf = any(is_pdf(p) for p in input_paths)
+ if summarise_from_pdf:
+ pdf_path = next((p for p in input_paths if is_pdf(p)), None)
+ if not pdf_path:
+ print("Error: No PDF path found in input files.")
+ return
+ print(
+ f"Detected PDF input. Extracting text with '{args.ocr_method}' then summarising..."
+ )
+ from tools.file_conversion import prepare_image_or_pdf
+ from tools.file_redaction import choose_and_run_redactor
+
+ prepare_images = args.ocr_method in ["Local OCR", "AWS Textract"]
+ (
+ _prep_summary,
+ prepared_pdf_paths,
+ image_file_paths,
+ _,
+ _,
+ pdf_doc,
+ image_annotations,
+ _,
+ original_cropboxes,
+ page_sizes,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ ) = prepare_image_or_pdf(
+ file_paths=[pdf_path],
+ text_extract_method=args.ocr_method,
+ all_line_level_ocr_results_df=pd.DataFrame(),
+ all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
+ first_loop_state=True,
+ prepare_for_review=False,
+ output_folder=args.output_dir,
+ input_folder=args.input_dir,
+ prepare_images=prepare_images,
+ page_min=args.page_min,
+ page_max=args.page_max,
+ )
+ print(f" {_prep_summary}")
+
+ (
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ ocr_df,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ ) = choose_and_run_redactor(
+ file_paths=[pdf_path],
+ prepared_pdf_file_paths=prepared_pdf_paths,
+ pdf_image_file_paths=image_file_paths,
+ chosen_redact_entities=args.local_redact_entities or [],
+ chosen_redact_comprehend_entities=args.aws_redact_entities or [],
+ chosen_llm_entities=args.llm_redact_entities or [],
+ text_extraction_method=args.ocr_method,
+ in_allow_list=args.allow_list_file,
+ in_deny_list=args.deny_list_file,
+ redact_whole_page_list=args.redact_whole_page_file,
+ first_loop_state=True,
+ page_min=args.page_min,
+ page_max=args.page_max,
+ handwrite_signature_checkbox=args.handwrite_signature_extraction
+ or [],
+ max_fuzzy_spelling_mistakes_num=getattr(
+ args, "fuzzy_mistakes", DEFAULT_FUZZY_SPELLING_MISTAKES_NUM
+ ),
+ match_fuzzy_whole_phrase_bool=getattr(
+ args, "match_fuzzy_whole_phrase_bool", True
+ ),
+ pymupdf_doc=pdf_doc,
+ annotations_all_pages=image_annotations,
+ page_sizes=page_sizes,
+ document_cropboxes=original_cropboxes,
+ pii_identification_method=args.pii_detector or "Local",
+ aws_access_key_textbox=args.aws_access_key or "",
+ aws_secret_key_textbox=args.aws_secret_key or "",
+ language=args.language,
+ output_folder=args.output_dir,
+ input_folder=args.input_dir,
+ custom_llm_instructions=args.custom_llm_instructions or "",
+ inference_server_vlm_model=(
+ getattr(args, "inference_server_vlm_model", None)
+ or DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ ),
+ efficient_ocr=getattr(args, "efficient_ocr", EFFICIENT_OCR),
+ efficient_ocr_min_words=(
+ getattr(args, "efficient_ocr_min_words", None)
+ or EFFICIENT_OCR_MIN_WORDS
+ ),
+ efficient_ocr_min_image_coverage_fraction=(
+ getattr(args, "efficient_ocr_min_image_coverage_fraction", None)
+ if getattr(
+ args, "efficient_ocr_min_image_coverage_fraction", None
+ )
+ is not None
+ else EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION
+ ),
+ ocr_first_pass_max_workers=(
+ getattr(args, "ocr_first_pass_max_workers", None)
+ or OCR_FIRST_PASS_MAX_WORKERS
+ ),
+ hybrid_textract_bedrock_vlm=getattr(
+ args, "hybrid_textract_bedrock_vlm", HYBRID_TEXTRACT_BEDROCK_VLM
+ ),
+ overwrite_existing_ocr_results=getattr(
+ args,
+ "overwrite_existing_ocr_results",
+ OVERWRITE_EXISTING_OCR_RESULTS,
+ ),
+ save_page_ocr_visualisations=(
+ getattr(args, "save_page_ocr_visualisations", None)
+ if getattr(args, "save_page_ocr_visualisations", None)
+ is not None
+ else SAVE_PAGE_OCR_VISUALISATIONS
+ ),
+ text_extraction_only=True,
+ )
+
+ if ocr_df is None or (
+ isinstance(ocr_df, pd.DataFrame) and ocr_df.empty
+ ):
+ print("Error: No OCR text extracted from PDF. Cannot summarise.")
+ return
+
+ # Derive file_name from PDF path (same as app.py _file_name_from_pdf_path)
+ basename = os.path.basename(pdf_path)
+ file_name = os.path.splitext(basename)[0][:20]
+ invalid_chars = '<>:"/\\|?*'
+ for char in invalid_chars:
+ file_name = file_name.replace(char, "_")
+ file_name = file_name if file_name else "document"
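+ # e.g. "My Report: Final?.pdf" -> file_name "My Report_ Final_" (first 20 chars, invalid characters replaced with "_")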
+ else:
+ # CSV path: load OCR CSV file(s)
+ ocr_df = load_csv_files_to_dataframe(input_paths)
+ if ocr_df is None or ocr_df.empty:
+ print(
+ "Error: No valid OCR data (page, line, text columns) in input file(s)."
+ )
+ return
+
+ first_path = input_paths[0] if input_paths else ""
+ if first_path:
+ basename = os.path.basename(first_path)
+ file_name = os.path.splitext(basename)[0][:20]
+ invalid_chars = '<>:"/\\|?*'
+ for char in invalid_chars:
+ file_name = file_name.replace(char, "_")
+ file_name = file_name if file_name else "document"
+ else:
+ file_name = "document"
+
+ (
+ output_files,
+ status_message,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ summary_display_text,
+ elapsed_seconds,
+ ) = summarise_document_wrapper(
+ ocr_df,
+ args.output_dir,
+ args.summarisation_inference_method,
+ args.summarisation_api_key or "",
+ args.summarisation_temperature,
+ file_name,
+ args.summarisation_context or "",
+ args.aws_access_key or "",
+ args.aws_secret_key or "",
+ "",
+ AZURE_OPENAI_INFERENCE_ENDPOINT or "",
+ summarise_format_radio,
+ args.summarisation_additional_instructions or "",
+ args.summarisation_max_pages_per_group,
+ None,
+ )
+
+ processing_time = elapsed_seconds
+
+ print(f"\n{status_message}")
+ if output_files:
+ print("Output files:")
+ for p in output_files:
+ print(f" {p}")
+ if summary_display_text:
+ print("\n--- Summary ---")
+ print(
+ summary_display_text[:2000]
+ + ("..." if len(summary_display_text) > 2000 else "")
+ )
+
+ # Usage logging (same fields as GUI summarisation success callback)
+ if usage_logger:
+ try:
+ first_input = input_paths[0] if input_paths else ""
+ doc_file_name = (
+ os.path.basename(first_input)
+ if args.display_file_names_in_logs and first_input
+ else "document"
+ )
+ data_file_name = ""
+ total_pages = (
+ int(ocr_df["page"].max())
+ if "page" in ocr_df.columns and not ocr_df.empty
+ else 0
+ )
+
+ log_redaction_usage(
+ logger=usage_logger,
+ session_hash=session_hash,
+ doc_file_name=doc_file_name,
+ data_file_name=data_file_name,
+ time_taken=processing_time,
+ total_pages=total_pages,
+ textract_queries=0,
+ pii_method=args.summarisation_inference_method,
+ comprehend_queries=0,
+ cost_code=args.cost_code,
+ handwriting_signature="",
+ text_extraction_method="",
+ is_textract_call=False,
+ task="summarisation",
+ save_to_dynamodb=args.save_logs_to_dynamodb,
+ save_to_s3=args.upload_logs_to_s3,
+ s3_bucket=args.s3_bucket,
+ s3_key_prefix=args.s3_logs_prefix,
+ vlm_model_name="",
+ vlm_total_input_tokens=0,
+ vlm_total_output_tokens=0,
+ llm_model_name=llm_model_name or "",
+ llm_total_input_tokens=llm_total_input_tokens or 0,
+ llm_total_output_tokens=llm_total_output_tokens or 0,
+ )
+ except Exception as e:
+ print(f"Warning: Could not log usage data: {e}")
+
+ except Exception as e:
+ print(f"\nAn error occurred during summarisation: {e}")
+ import traceback
+
+ traceback.print_exc()
+
+ elif args.task == "combine_review_pdfs":
+ print("--- Combine review PDFs ---")
+ try:
+ from tools.file_conversion import combine_review_pdf_files
+
+ paths = (
+ [args.input_file]
+ if isinstance(args.input_file, str)
+ else list(args.input_file)
+ )
+ if len(paths) < 2:
+ print("Error: combine_review_pdfs requires at least 2 input PDF files.")
+ return
+ out_dir = args.output_dir
+ os.makedirs(out_dir, exist_ok=True)
+ result = combine_review_pdf_files(paths, output_folder=out_dir)
+ if result:
+ print(f"Combined PDF saved to: {result[0]}")
+ else:
+ print("No output produced (empty file list or no valid paths).")
+ except ValueError as e:
+ print(f"Error: {e}")
+ except Exception as e:
+ print(f"\nAn error occurred while combining review PDFs: {e}")
+ import traceback
+
+ traceback.print_exc()
+
+ else:
+ print(f"Error: Invalid task '{args.task}'.")
+ print(
+ "Valid options: 'redact', 'deduplicate', 'textract', 'summarise', or 'combine_review_pdfs'"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docker-compose_llama.yml b/docker-compose_llama.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ded1427d1e29459df0fdf75cb4e550abe68355cd
--- /dev/null
+++ b/docker-compose_llama.yml
@@ -0,0 +1,211 @@
+# Pick which GGUF model runs by setting COMPOSE_PROFILES in .env (or pass --profile):
+# COMPOSE_PROFILES=35b -> qwen35-35b_q4_gguf
+# COMPOSE_PROFILES=27b -> qwen35-27b_q4_gguf
+# The app always talks to http://llama-inference:8080 (shared network alias on both model services).
+# Each model service uses its own llama.cpp cache volume so mmproj-F16.gguf (same filename per repo)
+# is never shared between 35B and 27B downloads.
+# Example CLI commands:
+# docker compose -f docker-compose_llama.yml --profile 35b up -d
+# docker compose -f docker-compose_llama.yml --profile 27b up -d
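+# Example .env entry for model selection (only the profile is needed here):
+#   COMPOSE_PROFILES=27b
+# Once a model service reports healthy you can check it from the host via the mapped ports
+# (8001 for the 35B service, 8000 for the 27B service), e.g.:
+#   curl -fsS http://localhost:8000/v1/models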
+services:
+ # Qwen 3.5 35B model setup below requires 24GB of VRAM with n-cpu-moe set to 0. For lower-VRAM systems, n-cpu-moe ~ 40 could work with 12GB of VRAM, and ~ 20 with 16GB.
+ qwen35-35b_q4_gguf:
+ profiles: ["35b"]
+ image: ghcr.io/ggml-org/llama.cpp:server-cuda12
+ command:
+ - -hf
+ - unsloth/Qwen3.5-35B-A3B-GGUF
+ - --hf-file
+ - Qwen3.5-35B-A3B-UD-IQ4_NL.gguf
+ - --mmproj-url
+ - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
+ - --n-gpu-layers
+ - "999"
+ - --ctx-size
+ - "32768"
+ - --fit
+ - "off"
+ - --temp
+ - "0.7"
+ - --top-k
+ - "20"
+ - --top-p
+ - "0.8"
+ - --min-p
+ - "0.0"
+ - --frequency-penalty
+ - "1"
+ - --presence-penalty
+ - "1"
+ - --host
+ - "0.0.0.0"
+ - --port
+ - "8080"
+ - --no-warmup
+ - --seed
+ - "42"
+ - --n-cpu-moe
+ - "0" # Increase this value to fit within your available VRAM
+ ports:
+ - "8001:8080"
+ volumes:
+ - ./models:/models
+ - hf-llama-cache-qwen35-35b:/root/.cache/llama.cpp
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: all
+ capabilities: [gpu]
+ healthcheck:
+ test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
+ interval: 30s
+ timeout: 15s
+ retries: 8
+ start_period: 1200s
+ networks:
+ redaction-net-llama:
+ aliases:
+ - llama-inference
+
+ # Qwen 3.5 27B model setup below requires 24GB of VRAM to run.
+ qwen35-27b_q4_gguf:
+ profiles: ["27b"]
+ image: ghcr.io/ggml-org/llama.cpp:server-cuda12
+ command:
+ - -hf
+ - unsloth/Qwen3.5-27B-GGUF
+ - --hf-file
+ - Qwen3.5-27B-UD-Q4_K_XL.gguf
+ - --mmproj-url
+ - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F16.gguf
+ - --n-gpu-layers
+ - "999"
+ - --ctx-size
+ - "32768"
+ - --fit
+ - "off"
+ - --temp
+ - "0.7"
+ - --top-k
+ - "20"
+ - --top-p
+ - "0.8"
+ - --min-p
+ - "0.0"
+ - --frequency-penalty
+ - "1"
+ - --presence-penalty
+ - "1"
+ - --host
+ - "0.0.0.0"
+ - --port
+ - "8080"
+ - --no-warmup
+ - --seed
+ - "42"
+ ports:
+ - "8000:8080"
+ volumes:
+ - ./models:/models
+ - hf-llama-cache-qwen35-27b:/root/.cache/llama.cpp
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: all
+ capabilities: [gpu]
+ healthcheck:
+ test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
+ interval: 30s
+ timeout: 15s
+ retries: 8
+ start_period: 1200s
+ networks:
+ redaction-net-llama:
+ aliases:
+ - llama-inference
+
+ redaction-app-llama:
+ profiles: ["35b", "27b"]
+ image: redaction-app-main
+ build:
+ context: . # Look in the current folder
+ dockerfile: Dockerfile # Use this file
+ target: gradio # Use the 'gradio' stage from your Dockerfile
+ args: # Pass your build-time variables here!
+ - TORCH_GPU_ENABLED=False
+ - INSTALL_VLM=False
+ - PADDLE_GPU_ENABLED=True
+ - INSTALL_PADDLEOCR=True
+ shm_size: '8gb'
+ depends_on:
+ qwen35-35b_q4_gguf:
+ condition: service_healthy
+ required: false
+ qwen35-27b_q4_gguf:
+ condition: service_healthy
+ required: false
+ environment:
+ - FLAGS_fraction_of_gpu_memory_to_use=0.05
+ - RUN_FASTAPI=True
+ - APP_MODE=fastapi
+ - SHOW_PADDLE_MODEL_OPTIONS=True
+ - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
+ - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
+ - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
+ - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
+ - SHOW_HYBRID_MODELS=True
+ - SHOW_DIFFICULT_OCR_EXAMPLES=True
+ - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
+ - SHOW_SUMMARISATION=True
+ - SHOW_AWS_API_KEYS=True
+ - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
+ - DEFAULT_LOCAL_OCR_MODEL=paddle
+ - DEFAULT_PII_DETECTION_MODEL=Local
+ - INFERENCE_SERVER_API_URL=http://llama-inference:8080
+ - DEFAULT_INFERENCE_SERVER_VLM_MODEL=""
+ - DEFAULT_INFERENCE_SERVER_PII_MODEL=""
+ - CUSTOM_VLM_BACKEND=inference_vlm
+ - MAX_WORKERS=12
+ - TESSERACT_MAX_WORKERS=8
+ - PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors
+ - LOAD_PADDLE_AT_STARTUP=False
+ - EFFICIENT_OCR=True
+ - SHOW_CUSTOM_VLM_ENTITIES=True
+ - SESSION_OUTPUT_FOLDER=True
+ - SAVE_PAGE_OCR_VISUALISATIONS=False
+ - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
+ - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
+ - PREPROCESS_LOCAL_OCR_IMAGES=False
+ - INFERENCE_SERVER_DISABLE_THINKING=True
+ - MAX_NEW_TOKENS=16384
+ - SAVE_EXAMPLE_HYBRID_IMAGES=False
+ - SAVE_VLM_INPUT_IMAGES=False
+ - VLM_MAX_DPI=200.0
+ - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
+ - REPORT_VLM_OUTPUTS_TO_GUI=True
+ - REPORT_LLM_OUTPUTS_TO_GUI=True
+ - ADD_VLM_BOUNDING_BOX_RULES=False
+
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: all
+ capabilities: [gpu]
+ ports:
+ - "7861:7860"
+ networks:
+ - redaction-net-llama
+
+networks:
+ redaction-net-llama:
+ driver: bridge
+
+volumes:
+ hf-llama-cache-qwen35-35b:
+ hf-llama-cache-qwen35-27b:
\ No newline at end of file
diff --git a/docker-compose_vllm.yml b/docker-compose_vllm.yml
new file mode 100644
index 0000000000000000000000000000000000000000..77d7cd35c266a52efc99afba59fc3d3a9084ed28
--- /dev/null
+++ b/docker-compose_vllm.yml
@@ -0,0 +1,163 @@
+# Pick which vLLM stack runs via COMPOSE_PROFILES (or --profile). Set VLLM_OPENAI_MODEL in .env
+# to match the served model (required for 27b; 9b defaults below if omitted):
+# COMPOSE_PROFILES=vllm-9b -> QuantTrio/Qwen3.5-9B-AWQ (default VLLM_OPENAI_MODEL)
+# COMPOSE_PROFILES=vllm-27b -> set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
+# App uses http://vllm-inference:8000 (shared network alias on both vLLM services).
+# Example CLI commands:
+# docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d
+# docker compose -f docker-compose_vllm.yml --profile vllm-27b up -d
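+# Example .env entries (the model name should match the profile you start; the 9B value is the default):
+#   COMPOSE_PROFILES=vllm-27b
+#   VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
+# From the host, the 9B server is published on port 8000 and the 27B server on port 8001, e.g.:
+#   curl -fsS http://localhost:8001/v1/models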
+services:
+ vllm-server-qwen35-9b:
+ profiles: ["vllm-9b"]
+ image: vllm/vllm-openai:latest
+ shm_size: '8gb'
+ command: |
+ --model QuantTrio/Qwen3.5-9B-AWQ
+ --gpu-memory-utilization 0.7
+ --tensor-parallel-size 1
+ --max-num-seqs 1
+ --reasoning-parser qwen3
+ --max-model-len 32768
+ --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
+ --max-num-batched-tokens 2048
+
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: all
+ capabilities: [gpu]
+ healthcheck:
+ test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
+ interval: 30s
+ timeout: 15s
+ retries: 8
+ start_period: 1200s
+ ports:
+ - "8000:8000"
+ volumes:
+ - hf-model-cache:/root/.cache/huggingface
+ networks:
+ redaction-net-vllm:
+ aliases:
+ - vllm-inference
+
+ vllm-server-qwen35-27b:
+ profiles: ["vllm-27b"]
+ image: vllm/vllm-openai:latest
+ shm_size: '16gb'
+ command: |
+ --model QuantTrio/Qwen3.5-27B-AWQ
+ --gpu-memory-utilization 0.94
+ --tensor-parallel-size 1
+ --max-num-seqs 2
+ --reasoning-parser qwen3
+ --max-model-len 16384
+ --max-num-batched-tokens 4096
+ --enforce-eager
+ --kv-cache-dtype fp8
+ --enable-chunked-prefill
+ --enable-prefix-caching
+
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: all
+ capabilities: [gpu]
+ healthcheck:
+ test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
+ interval: 30s
+ timeout: 15s
+ retries: 8
+ start_period: 1200s
+ ports:
+ - "8001:8000"
+ volumes:
+ - hf-model-cache:/root/.cache/huggingface
+ networks:
+ redaction-net-vllm:
+ aliases:
+ - vllm-inference
+
+ redaction-app-vllm:
+ profiles: ["vllm-9b", "vllm-27b"]
+ image: redaction-app-main
+ build:
+ context: . # Look in the current folder
+ dockerfile: Dockerfile # Use this file
+ target: gradio # Use the 'gradio' stage from your Dockerfile
+ args: # Pass your build-time variables here!
+ - TORCH_GPU_ENABLED=False
+ - INSTALL_VLM=False
+ - PADDLE_GPU_ENABLED=True
+ - INSTALL_PADDLEOCR=True
+ shm_size: '8gb'
+ depends_on:
+ vllm-server-qwen35-9b:
+ condition: service_healthy
+ required: false
+ vllm-server-qwen35-27b:
+ condition: service_healthy
+ required: false
+ environment:
+ - FLAGS_fraction_of_gpu_memory_to_use=0.05
+ - RUN_FASTAPI=True
+ - APP_MODE=fastapi
+ - SHOW_PADDLE_MODEL_OPTIONS=True
+ - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
+ - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
+ - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
+ - SHOW_HYBRID_MODELS=True
+ - SHOW_DIFFICULT_OCR_EXAMPLES=True
+ - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
+ - SHOW_SUMMARISATION=True
+ - SHOW_AWS_API_KEYS=True
+ - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
+ - DEFAULT_LOCAL_OCR_MODEL=paddle
+ - DEFAULT_PII_DETECTION_MODEL=Local
+ - CUSTOM_VLM_BACKEND=inference_vlm
+ - MAX_WORKERS=12
+ - TESSERACT_MAX_WORKERS=8
+ - PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors
+ - LOAD_PADDLE_AT_STARTUP=False
+ - INFERENCE_SERVER_API_URL=http://vllm-inference:8000
+ - DEFAULT_INFERENCE_SERVER_VLM_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
+ - DEFAULT_INFERENCE_SERVER_PII_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
+ - EFFICIENT_OCR=True
+ - SHOW_CUSTOM_VLM_ENTITIES=True
+ - SESSION_OUTPUT_FOLDER=True
+ - SAVE_PAGE_OCR_VISUALISATIONS=False
+ - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
+ - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
+ - PREPROCESS_LOCAL_OCR_IMAGES=False
+ - INFERENCE_SERVER_DISABLE_THINKING=True
+ - MAX_NEW_TOKENS=16384
+ - SAVE_EXAMPLE_HYBRID_IMAGES=False
+ - SAVE_VLM_INPUT_IMAGES=False
+ - VLM_MAX_DPI=200.0
+ - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
+ - REPORT_VLM_OUTPUTS_TO_GUI=True
+ - REPORT_LLM_OUTPUTS_TO_GUI=True
+ - ADD_VLM_BOUNDING_BOX_RULES=False
+
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: all
+ capabilities: [gpu]
+ ports:
+ - "7860:7860"
+ networks:
+ - redaction-net-vllm
+
+networks:
+ redaction-net-vllm:
+ driver: bridge
+
+volumes:
+ hf-model-cache:
\ No newline at end of file
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3caaddfc82fee1c45674a10b8d47a605c0b8d3ed
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,52 @@
+#!/bin/sh
+
+# Exit immediately if a command exits with a non-zero status.
+set -e
+
+echo "Starting in APP_MODE: $APP_MODE"
+
+# --- Ensure application directories are writable by the current user ---
+# This is important when Docker volumes are bind-mounted from the host and
+# the host directory may be owned by root (uid 0), which would prevent the
+# non-root container user (uid 1000) from writing output/input files.
+for dir in \
+ "${GRADIO_OUTPUT_FOLDER:-/home/user/app/output}" \
+ "${GRADIO_INPUT_FOLDER:-/home/user/app/input}" \
+ "${GRADIO_TEMP_DIR:-/tmp/gradio_tmp}" \
+ "${ACCESS_LOGS_FOLDER:-/home/user/app/logs}" \
+ "${USAGE_LOGS_FOLDER:-/home/user/app/usage}" \
+ "${FEEDBACK_LOGS_FOLDER:-/home/user/app/feedback}" \
+ "${CONFIG_FOLDER:-/home/user/app/config}"; do
+ mkdir -p "$dir" 2>/dev/null || true
+ if [ ! -w "$dir" ]; then
+ echo "WARNING: Directory $dir is not writable by current user (uid=$(id -u)). File I/O will fail." >&2
+ fi
+done
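+# If these folders are bind-mounted from the host, you may need to create them with a
+# compatible owner beforehand (uid 1000 matches the non-root container user mentioned above;
+# adjust the paths to wherever you mount them), e.g.:
+#   mkdir -p output input logs usage feedback config
+#   chown -R 1000:1000 output input logs usage feedback config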
+
+# --- Start the app based on mode ---
+
+if [ "$APP_MODE" = "lambda" ]; then
+ echo "Starting in Lambda mode..."
+ # The CMD from Dockerfile will be passed as "$@"
+ exec python -m awslambdaric "$@"
+else
+ echo "Starting in Gradio/FastAPI mode..."
+
+ if [ "$RUN_FASTAPI" = "True" ]; then
+ echo "Starting in FastAPI mode..."
+
+ GRADIO_SERVER_NAME=${GRADIO_SERVER_NAME:-0.0.0.0}
+ GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-7860}
+
+ # Start uvicorn server.
+ echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
+ exec uvicorn app:app \
+ --host "$GRADIO_SERVER_NAME" \
+ --port "$GRADIO_SERVER_PORT" \
+ --proxy-headers \
+ --forwarded-allow-ips "*"
+ else
+ echo "Starting in Gradio mode..."
+ exec python app.py
+ fi
+fi
\ No newline at end of file
diff --git a/example_app_config.env b/example_app_config.env
new file mode 100644
index 0000000000000000000000000000000000000000..3d142490aba91c3cd7d2f7dfefc1da338ba64fc8
--- /dev/null
+++ b/example_app_config.env
@@ -0,0 +1,129 @@
+# Rename this file to app_config.env and place it in the config/ folder (i.e. it will be located at app_base_folder/config/app_config.env).
+# The app will then automatically load these variables at startup.
+# See tools/config.py for all the possible config variables you can set, or src/app_settings.qmd for descriptions.
+# Below are some suggested config variables to start with.
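+# For example, from the app base folder (paths assumed):
+#   mkdir -p config && cp example_app_config.env config/app_config.env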
+
+# General app run options
+TESSERACT_FOLDER=tesseract/ # Only needed if Tesseract is in a custom folder rather than on PATH
+POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/ # Only needed if Poppler is in a custom folder rather than on PATH
+
+GRADIO_SERVER_NAME=127.0.0.1
+GRADIO_SERVER_PORT=7860
+
+USER_GUIDE_URL=
+RUN_FASTAPI=False
+FAVICON_PATH=favicon.png
+INTRO_TEXT=intros/short_intro.txt
+
+# GUI options
+SHOW_QUICKSTART=False
+SHOW_SUMMARISATION=True
+SHOW_EXAMPLES=True
+SHOW_DIFFICULT_OCR_EXAMPLES=True
+SHOW_LANGUAGE_SELECTION=True
+SHOW_COSTS=True
+SHOW_LOCAL_OCR_MODEL_OPTIONS=True
+SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
+SHOW_PII_IDENTIFICATION_OPTIONS=True
+SHOW_LOCAL_PII_DETECTION_OPTIONS=True
+SHOW_OCR_GUI_OPTIONS=True
+EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT=True
+
+# Model / redaction process options
+DEFAULT_LOCAL_OCR_MODEL=tesseract
+PREPROCESS_LOCAL_OCR_IMAGES=False # Whether to apply corrections to input images before processing. Will slow down redaction processes
+MAX_WORKERS=4 # How many workers should be working in parallel to run various text extraction/redaction tasks. Adjust depending on how many CPUs your computer has
+
+EFFICIENT_OCR=True
+OVERWRITE_EXISTING_OCR_RESULTS=True
+INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
+
+# Redaction box appearance
+CUSTOM_BOX_COLOUR=(128, 128, 128)
+USE_GUI_BOX_COLOURS_FOR_OUTPUTS=False
+
+# Image save options
+SAVE_PAGE_OCR_VISUALISATIONS=True
+SAVE_PREPROCESS_IMAGES=True
+
+# Saving and logging variables
+SAVE_LOGS_TO_CSV=True
+SESSION_OUTPUT_FOLDER=True # Save outputs into user session folders
+DISPLAY_FILE_NAMES_IN_LOGS=False
+
+# PaddleOCR
+SHOW_PADDLE_MODEL_OPTIONS=False
+LOAD_PADDLE_AT_STARTUP=False
+PADDLE_MAX_WORKERS=4 # Number of simultaneous workers for Paddle OCR tasks. Generally advised to keep at 1, but may work with 2 or more depending on your system.
+
+# GUI show VLM/LLM models
+SHOW_HYBRID_MODELS=False
+SHOW_CUSTOM_VLM_ENTITIES=False
+SHOW_VLM_MODEL_OPTIONS=True
+SHOW_INFERENCE_SERVER_PII_OPTIONS=False
+SHOW_INFERENCE_SERVER_VLM_OPTIONS=False
+SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS=False
+
+# VLM using Transformers options
+SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL=Qwen3.5-9B
+QUANTISE_VLM_MODELS=False
+USE_TRANSFORMERS_VLM_MODEL_AS_LLM=True
+LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE=None
+QUANTISE_TRANSFORMERS_LLM_MODELS=False
+LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START=False
+LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True
+
+# VLM using inference server options (vLLM / Llama.cpp server)
+INFERENCE_SERVER_API_URL=http://192.168.0.220:8080
+USE_LLAMA_SWAP=True
+INFERENCE_SERVER_LLM_PII_MODEL_CHOICE=qwen_3_5_27b
+
+# General VLM / LLM options
+VLM_DISABLE_QWEN3_5_THINKING=True
+LLM_MAX_NEW_TOKENS=8192
+CUSTOM_VLM_BACKEND=bedrock_vlm # Which model type to use for face / signature detection. Choose from "transformers_vlm", "inference_vlm", "bedrock_vlm"
+
+# AWS related variables
+RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions. You can remove all the environment variables in the following section if you don't want to use them
+AWS_REGION=example-region
+DOCUMENT_REDACTION_BUCKET=example-bucket
+
+SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
+SHOW_AWS_PII_DETECTION_OPTIONS=True
+
+SHOW_AWS_EXAMPLES=True
+RUN_ALL_EXAMPLES_THROUGH_AWS=True
+
+SAVE_LOGS_TO_DYNAMODB=True
+ACCESS_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-access-log
+USAGE_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-usage
+FEEDBACK_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-feedback
+
+# AWS Textract options
+SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True
+LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True
+TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output
+INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION=False
+INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION=False
+INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION=False
+INCLUDE_FACE_IDENTIFICATION_TEXTRACT_OPTION=False # Needs a VLM option available to work
+
+# AWS VLM / LLM options
+SHOW_BEDROCK_VLM_MODELS=False
+SHOW_AWS_BEDROCK_LLM_MODELS=False
+HYBRID_TEXTRACT_BEDROCK_VLM=False
+
+CLOUD_LLM_PII_MODEL_CHOICE=amazon.nova-pro-v1:0
+CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE=anthropic.claude-sonnet-4-6 # alternative: amazon.nova-pro-v1:0
+CLOUD_VLM_MODEL_CHOICE=amazon.nova-pro-v1:0 # other options: anthropic.claude-sonnet-4-6, qwen.qwen3-vl-235b-a22b
+CLOUD_SUMMARISATION_MODEL_CHOICE=amazon.nova-lite-v1:0
+
+# Cost code related variables
+SHOW_COSTS=True
+GET_COST_CODES=True
+COST_CODES_PATH=config/cost_codes.csv
+ENFORCE_COST_CODES=True
+DEFAULT_COST_CODE=example_cost_code
+
+# S3 cost codes
+S3_COST_CODES_PATH=cost_codes.csv
\ No newline at end of file
diff --git a/example_data/Bold minimalist professional cover letter.docx b/example_data/Bold minimalist professional cover letter.docx
new file mode 100644
index 0000000000000000000000000000000000000000..4c4034fa1573bc9cb74a81c794ab07ece5a845ed
--- /dev/null
+++ b/example_data/Bold minimalist professional cover letter.docx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c8551ac157f350b2093e5d8c89f68474f613350074201cff6d52d5ed5ec28ff
+size 23992
diff --git a/example_data/Difficult handwritten note.jpg b/example_data/Difficult handwritten note.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..feea6ee7684647bc4e92871eab95f94313abebc2
--- /dev/null
+++ b/example_data/Difficult handwritten note.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28896bfa4c4d6ef48222a285c02529dc8967d15d799df5c4b4cf0f62224e7b6c
+size 85066
diff --git a/example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf b/example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..dba4330578bc1aa12645cbe067cda24f064ef6b4
--- /dev/null
+++ b/example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:caf00ca5cb06b8019804d1a7eaeceec772607969e8cad6c34d1d583876345b90
+size 116763
diff --git a/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv b/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv
new file mode 100644
index 0000000000000000000000000000000000000000..c1b4e157fed3af2a77963fd8ca74a7661b89a8d5
--- /dev/null
+++ b/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv
@@ -0,0 +1,295 @@
+,coordinates,filename,languages,last_modified,page_number,parent_id,category,id,title_name,text
+0,"{'points': ((643.4645, 98.23889999999994), (643.4645, 154.23889999999994), (1034.5125, 154.23889999999994), (1034.5125, 98.23889999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,55a930e9cc12e77ae25572779a20e2aa,,Lambeth 2030
+1,"{'points': ((651.9684, 250.84349999999995), (651.9684, 270.84349999999995), (730.3884, 270.84349999999995), (730.3884, 250.84349999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,b5d8fff05f4737267c3039cf7e636bfe,,Contents
+2,"{'points': ((651.9684, 305.8767), (651.9684, 315.9167), (705.6777999999999, 315.9167), (705.6777999999999, 305.8767)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,37cf32b0d24b740a3de9ba62b02ceda7,,Forewords
+3,"{'points': ((651.9684, 338.87669999999997), (651.9684, 348.9167), (712.8874, 348.9167), (712.8874, 338.87669999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,3c520bdfd69fbad61fc7840d3d51daf4,,Introduction
+4,"{'points': ((651.9684, 371.87669999999997), (651.9684, 381.9167), (753.6668, 381.9167), (753.6668, 371.87669999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,08cc8e49bfad3409a2c2729c6e676593,,State of the Borough
+5,"{'points': ((651.9684, 404.87669999999997), (651.9684, 414.9167), (786.6367, 414.9167), (786.6367, 404.87669999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,81bacd6f5d62d0991959eb636396f817,,Our Previous Borough Plan
+6,"{'points': ((651.9684, 437.87669999999997), (651.9684, 447.9167), (826.4361, 447.9167), (826.4361, 437.87669999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,c558e1950261e4ddccdbd2c777c662a9,,Our Shared Vision for Lambeth 2030
+7,"{'points': ((651.9684, 470.87669999999997), (651.9684, 480.9167), (809.1764999999999, 480.9167), (809.1764999999999, 470.87669999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,425b3f7079730a082e1cefeeb91eb45f,,Our Ambitions for Lambeth 2030
+8,"{'points': ((651.9684, 503.87669999999997), (651.9684, 513.9167), (954.0142999999999, 513.9167), (954.0142999999999, 503.87669999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,b68eb349dfb6e5c42d7576def21828b8,,The Lambeth Golden Thread – A Borough of Equity and Justice
+9,"{'points': ((651.9684, 536.8767), (651.9684, 546.9167), (961.0138999999999, 546.9167), (961.0138999999999, 536.8767)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,43b67348c58f51ce182479d5f4ab6081,,Ambition 1 – Making Lambeth Neighbourhoods Fit for the Future
+10,"{'points': ((651.9684, 569.8767), (651.9684, 579.9167), (981.0337999999999, 579.9167), (981.0337999999999, 569.8767)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,495976e3b4b0dfe0f41d6272dcc350b8,,Ambition 2 – Making Lambeth One of the Safest Boroughs in London
+11,"{'points': ((651.9684, 602.8767), (651.9684, 612.9167), (941.6750999999999, 612.9167), (941.6750999999999, 602.8767)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,7e24b934746423093e573cfe643e10eb,,Ambition 3 – Making Lambeth A Place We Can All Call Home
+12,"{'points': ((651.9684, 635.8767), (651.9684, 645.9167), (794.2067, 645.9167), (794.2067, 635.8767)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,2,6da364a5e9c746cca157b91a7108cf8b,Title,ef9385282ded9840eaf57668d4ef9097,,Our Lambeth 2030 Outcomes
+13,"{'points': ((56.6929, 152.23889999999994), (56.6929, 208.23889999999994), (341.9009, 208.23889999999994), (341.9009, 152.23889999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,b15e2c2fc379e436cac392688c908d48,Title,486e0ae8e16412c294b1c99aa70b465e,,Forewords
+14,"{'points': ((56.6929, 251.05949999999996), (56.6929, 329.05949999999996), (274.94290000000007, 329.05949999999996), (274.94290000000007, 251.05949999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,1399eece3fa4b458a81182788a854539,Forewords,Lambeth has long been the home of inspirational creativity and audacious innovation.
+15,"{'points': ((56.6929, 341.27549999999997), (56.6929, 435.27549999999997), (285.48289999999986, 435.27549999999997), (285.48289999999986, 341.27549999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,da7f83cf4fbc17f220fe374856f65926,Forewords,"From William Blake to Olive Morris, artists and activists have pushed the boundaries of what is possible and changed our borough for the better. We are a place of energy and ambition, a destination for those who wish to make a difference. And we have long been a place of sanctuary, welcoming communities from around the globe who have come to make Lambeth their home."
+16,"{'points': ((56.6929, 449.27549999999997), (56.6929, 531.2755), (286.80089999999996, 531.2755), (286.80089999999996, 449.27549999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,b655a9a47b3e5887cdfcae839a238533,Forewords,"We also recognise that we are not an equal borough. We have faced exceptionally challenging times - the devastating cuts to public services, austerity Britain, Brexit, the pandemic and the ongoing cost of living crisis. The impacts are not felt equally and have exacerbated the chronic stresses of poverty, racism and inequality that affect so many in our community."
+17,"{'points': ((56.6929, 545.2755), (56.6929, 651.2755), (291.23089999999996, 651.2755), (291.23089999999996, 545.2755)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,6b0d30f774017abd5e0b55ee27020442,Forewords,"We saw during the coronavirus pandemic the remarkable collective response of our borough - when the Council, businesses, voluntary and community organisations and residents came together as one and carried us through the toughest of times. Our partnership working and genuine collaboration is our core strength. And what is unique about Lambeth is our diversity which forms the bedrock of that collective power."
+18,"{'points': ((306.1417, 251.56359999999995), (306.1417, 309.56359999999995), (524.5497, 309.56359999999995), (524.5497, 251.56359999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,0a3f423d4f848c27f7cfd10d63b8aa28,Forewords,"It is this open heart and pioneering spirit, along with our geographical connectivity, our vibrant and imaginative business community and passionate voluntary sector that places Lambeth in an unparalleled position in London."
+19,"{'points': ((306.1417, 323.56359999999995), (306.1417, 405.56359999999995), (535.2506999999999, 405.56359999999995), (535.2506999999999, 323.56359999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,392b955a7da993dc684bbd83d00cf046,Forewords,"But we cannot take our residents for granted. What came through in the hundreds of conversations, meetings, workshops and roundtables we have held in developing Our Future, Our Lambeth, is that whilst our communities are generous and tough, whilst they possess incomparable levels of humanity and resilience their strength is not boundless."
+20,"{'points': ((306.1417, 419.56359999999995), (306.1417, 489.56359999999995), (540.0517, 489.56359999999995), (540.0517, 419.56359999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,cad462c234e4a7dd4e0af90b1b63788d,Forewords,"It is incumbent upon us all to make the bold decisions now, because the choices we make today will define the Lambeth we create for the next generation. It is those challenges, both the ones we are grappling with presently and the ones just around the corner, that Our Future, Our Lambeth seeks to address."
+21,"{'points': ((306.1417, 503.56359999999995), (306.1417, 573.5636), (540.4596999999999, 573.5636), (540.4596999999999, 503.56359999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,c0e22839a188bd5fb0f81d0f1d280d49,Forewords,"We have a wonderful opportunity to transform and reshape our neighbourhoods and equip our communities to overcome future obstacles and enable us all to thrive. So, the Lambeth that we see in 2030 is one that is healthier, safer and sustainable, and is active in tearing down deep-rooted inequalities."
+22,"{'points': ((306.1417, 587.5636), (306.1417, 633.5636), (532.1416999999999, 633.5636), (532.1416999999999, 587.5636)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,d25a3f4fc63c4f374546a808326e2106,Forewords,"Our Future, Our Lambeth is the beginning of us taking that bold action, being brave in the face of an uncertain future, and, together, creating a more just and equitable Lambeth for us all."
+23,"{'points': ((651.9684, 251.05949999999996), (651.9684, 309.05949999999996), (871.0086, 309.05949999999996), (871.0086, 251.05949999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,fd20eb0c33004a93313609653b1d672e,Forewords,I am truly honoured and proud to share our vision for Lambeth by 2030.
+24,"{'points': ((651.9684, 321.27549999999997), (651.9684, 427.27549999999997), (882.9273999999999, 427.27549999999997), (882.9273999999999, 321.27549999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,ec21d47ced96d35ce9df4e856760866d,Forewords,"This is the product of a series of fruitful conversations about the borough – what makes Lambeth unique, what we want it to look and feel like by 2030, and what matters most to all of us who live, work, and visit the borough. That means that whilst the Council has held the pen on the Borough Plan, it really does belong to us all – residents, institutions, businesses, the voluntary and community sector – everyone who has a stake in Lambeth."
+25,"{'points': ((651.9684, 441.27549999999997), (651.9684, 523.2755), (882.2073999999999, 523.2755), (882.2073999999999, 441.27549999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,1b1fb7b5405ad8b7ad013465d30eb705,Forewords,"As Chief Executive of Lambeth Council, I am absolutely committed to improving the lives of every Lambeth resident – and I am determined not to leave anyone behind. Lambeth faces distinct challenges, both now and in the future – and we know the impacts of these challenges are felt differently across our diverse neighbourhoods and communities."
+26,"{'points': ((651.9684, 537.2755), (651.9684, 583.2755), (878.1673999999999, 583.2755), (878.1673999999999, 537.2755)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,5bd1dc873b12e90b0760aa6b66800c55,Forewords,"Collectively, we have to rise to these challenges and be courageous to overcome them – not being afraid to do things differently to deliver greater impact for ourselves, our friends, families and neighbours."
+27,"{'points': ((651.9684, 597.2755), (651.9684, 655.2755), (886.8563999999999, 655.2755), (886.8563999999999, 597.2755)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,919c830c6d9d3c9109a6cc1fc1a19d6a,Forewords,"Lambeth 2030 is a vision for the best borough we can be by 2030. That is a borough with social and climate justice at its heart. A borough that is safer, fit for the future, and which everyone can have the opportunity to call home."
+28,"{'points': ((901.4172, 251.56359999999995), (901.4172, 393.56359999999995), (1133.5752, 393.56359999999995), (1133.5752, 251.56359999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,5f96640253ef9a8de42acce6e20681ae,Forewords,"We know we face major challenges when it comes to making these ambitions a reality, not least the entrenched inequities that persist across Lambeth, despite good progress made to change this. That is why we are tying all our ambitions together with a determination to be a borough of equity and justice – one that is fairer for our Black, Asian and Multi- Ethnic residents, our LGBTQ+ residents, our disabled residents, for women and girls, our faith communities and those with lower socio-economic status. We will be relentless in our pursuit of more equitable outcomes in all that we do together for Lambeth."
+29,"{'points': ((901.4172, 407.56359999999995), (901.4172, 501.56359999999995), (1132.4261999999999, 501.56359999999995), (1132.4261999999999, 407.56359999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,b8ba008b94db858377083dca8da96e58,Forewords,"Lambeth 2030 is a plan for everyone – it will bind us to work together, through cross-sector collaborations and brave conversations, to realise our 3 bold ambitions. We have special ingredients in Lambeth – including world leading organisations, a vibrant voluntary and community sector and passionate residents – which by working in partnership, can make a real difference."
+30,"{'points': ((901.4172, 515.5636), (901.4172, 597.5636), (1127.3971999999999, 597.5636), (1127.3971999999999, 515.5636)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,1385d218d4a1f5560213f14ea0856899,Forewords,I want to thank you – our residents and partners – for your involvement in shaping Lambeth 2030. I am continually struck by the pride people have for their local community and for Lambeth and share your passion and drive to be one of the best boroughs in London. Lambeth 2030 is the first step towards our future.
+31,"{'points': ((901.4172, 611.6036), (901.4172, 621.6036), (1007.1751999999999, 621.6036), (1007.1751999999999, 611.6036)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,492c9c5973b2e693042b73d2bc98834f,Forewords,Let’s do this together.
+32,"{'points': ((56.6929, 665.2755), (56.6929, 783.2755), (286.1908999999999, 783.2755), (286.1908999999999, 665.2755)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,486e0ae8e16412c294b1c99aa70b465e,NarrativeText,56abbea97c4711dc75e7aa586a68d31c,Forewords,"We are the home of Windrush. We are home to London’s largest LGBTQ+ community. We are home to the largest Portuguese-speaking community in London and increasingly are welcoming more of the Latin American community who are making Lambeth their home. It is these foundations, being a place of sanctuary and possessing a deeply welcoming, collective, community spirit, an aspiring borough thirsty to achieve, which continues to see us through the challenges that are placed before us."
+33,"{'points': ((306.1417, 741.1768999999999), (306.1417, 763.1069), (425.62170000000003, 763.1069), (425.62170000000003, 741.1768999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,b15e2c2fc379e436cac392688c908d48,Title,0f3f533e0d58153d9bb0f4ca0aa66d16,,Councillor Claire Holland Leader of Lambeth Council
+34,"{'points': ((901.4172, 732.4289), (901.4172, 754.3589), (1051.0772, 754.3589), (1051.0772, 732.4289)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,3,b15e2c2fc379e436cac392688c908d48,Title,918da5e6897f332ae5688d21d7296986,,Bayo Dosunmu Chief Executive – Lambeth Council
+35,"{'points': ((298.8109, 114.90250000000003), (298.8109, 248.90250000000003), (1077.8827999999999, 248.90250000000003), (1077.8827999999999, 114.90250000000003)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,947286cfd073ce6809861b53e34d596a,Title,95b019e5e9b85e58cdc25a74626d904a,,Introduction
+36,"{'points': ((651.9684, 268.33939999999996), (651.9684, 330.33939999999996), (1112.3948, 330.33939999999996), (1112.3948, 268.33939999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,95b019e5e9b85e58cdc25a74626d904a,NarrativeText,ac3189ea918d735950463b0dac1e9128,Introduction,"Lambeth - a borough of diversity, connectivity, full of excitement and opportunity. We have long been home to radicals and reformers, entrepreneurs and innovators - people who work together to help change the lives of others and their own."
+37,"{'points': ((651.9684, 351.4274), (651.9684, 445.4274), (887.1883999999999, 445.4274), (887.1883999999999, 351.4274)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,95b019e5e9b85e58cdc25a74626d904a,NarrativeText,aff79956bfc10fcfb74485e73a1d60ff,Introduction,"Lambeth is a global destination, with strong institutions that help shape a unique cultural offer – from Waterloo and South Bank, to the vibrance of Brixton and local highstreets of Streatham and West Norwood, Lambeth has something for everyone. We are a place of sanctuary, and for hundreds of years, we have welcomed new communities who have left a lasting imprint on our borough."
+38,"{'points': ((651.9684, 459.4274), (651.9684, 529.4274), (885.9384, 529.4274), (885.9384, 459.4274)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,95b019e5e9b85e58cdc25a74626d904a,NarrativeText,0add352283c17d63896621df09a3070e,Introduction,"We know it is this exceptional history and the contribution and kindness of the people of Lambeth that makes it so special. It is weaved throughout every neighbourhood and community and is why so many of us continue to visit, work in the borough, and have made Lambeth the place they call home."
+39,"{'points': ((901.4172, 351.4274), (901.4172, 397.4274), (1124.2852, 397.4274), (1124.2852, 351.4274)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,95b019e5e9b85e58cdc25a74626d904a,NarrativeText,b77edc594f5ab0fa91d179cff2b5ac10,Introduction,"To address the challenges that lie ahead, we’ve developed our collective roadmap to 2030 – “Our Future, Our Lambeth” – a Borough Plan that unites us all."
+40,"{'points': ((901.4172, 411.4274), (901.4172, 517.4274), (1136.1242, 517.4274), (1136.1242, 411.4274)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,95b019e5e9b85e58cdc25a74626d904a,NarrativeText,dac4bc409c7e12328697f4b9f6d9935e,Introduction,"To design this Borough Plan, we invited everyone who lives, visits, and works in Lambeth to have their say. Founded on what we were told matters to you, this Borough Plan builds on the strengths that exist in the borough and in our communities, affirms our collective vision and ambitions and outlines how we will take forward our shared priorities, with a longer-term look to 2030 so that we can deliver sustainable change."
+41,"{'points': ((651.9684, 543.4274), (651.9684, 625.4274), (885.2764, 625.4274), (885.2764, 543.4274)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,95b019e5e9b85e58cdc25a74626d904a,NarrativeText,5cb3dfdf6f2c0492303ab5b5fe6300ce,Introduction,"Recognising and reflecting this impact is important to us as we look to the future of Lambeth. We know that to be the best borough we can possibly be, we need to harness and nourish our assets, resources, and community energy, so that everyone in Lambeth can belong, can thrive, and so that nobody is left behind."
+42,"{'points': ((651.9684, 639.4274), (651.9684, 745.4274), (878.6773999999999, 745.4274), (878.6773999999999, 639.4274)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,95b019e5e9b85e58cdc25a74626d904a,NarrativeText,929c92316e68d578356eedac9414ebfa,Introduction,"Doing this will not be easy and cannot be done alone. Despite all our strengths and our passion, we know a longer-term, nuanced and joined-up approach is needed to continue to tackle the economic, social and environmental challenges facing our residents, our businesses, our partners. We will continue to be ambitious – and have the courage and willingness to do things differently, in partnership, to deliver for our residents."
+43,"{'points': ((901.4172, 531.4274), (901.4172, 685.4274), (1132.2172, 685.4274), (1132.2172, 531.4274)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,95b019e5e9b85e58cdc25a74626d904a,NarrativeText,ef12f0cea88b9b8ef3dfb591fb99ff1a,Introduction,"With bold political and civic leadership and strengthened partnerships with key institutions and local organisations, now is the time to future-proof Lambeth and work more closely and effectively together to deliver better outcomes for the people of Lambeth. This Borough Plan will not have all the answers to the challenges we face but it is our commitment to everyone in Lambeth that we will strive to get the basics right, and that we will harness the abundance of local expertise, energy and passion in our design and decision-making so that everybody in the borough is empowered to create Lambeth 2030."
+44,"{'points': ((901.4172, 699.9033999999999), (901.4172, 711.9034), (1131.42, 711.9034), (1131.42, 699.9033999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,4,947286cfd073ce6809861b53e34d596a,Title,9a33f56363461acfbc2af0b92876ac4b,,This is Our Future; This is Our Lambeth.
+45,"{'points': ((902.9826999999999, 52.72120000000007), (902.9826999999999, 101.49519999999995), (935.9826999999999, 101.49519999999995), (935.9826999999999, 52.72120000000007)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,f039d64c6097fb763cda8316e354c341,Title,414032508531553dfac8e0b8b89f87f9,,Y T
+46,"{'points': ((56.6929, 98.23889999999994), (56.6929, 208.23889999999994), (539.7488999999999, 208.23889999999994), (539.7488999999999, 98.23889999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,f039d64c6097fb763cda8316e354c341,Title,d9da449be88e5c8f7ea2b55534ae6a2c,,Lambeth 2030 Vision Statement
+47,"{'points': ((56.6929, 267.11519999999996), (56.6929, 371.11519999999996), (407.5674, 371.11519999999996), (407.5674, 267.11519999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,d9da449be88e5c8f7ea2b55534ae6a2c,NarrativeText,edabe3684c7827360d08312283f6caf1,Lambeth 2030 Vision Statement,Lambeth – a borough with social and climate justice at its heart.
+48,"{'points': ((625.1307, 258.66650000000004), (625.1307, 283.66650000000004), (854.5532000000001, 283.66650000000004), (854.5532000000001, 258.66650000000004)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,f039d64c6097fb763cda8316e354c341,Title,37d72d8b6b605707d8ede84491a09df3,,S U S T A I N A B L E
+49,"{'points': ((902.9826999999999, 108.09519999999998), (902.9826999999999, 117.83019999999999), (935.9826999999999, 117.83019999999999), (935.9826999999999, 108.09519999999998)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,f039d64c6097fb763cda8316e354c341,Title,d004b1ff682b81502c67a47af51f6dc4,,I
+50,"{'points': ((902.9826999999999, 124.43020000000001), (902.9826999999999, 358.3012), (935.9826999999999, 358.3012), (935.9826999999999, 124.43020000000001)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,f039d64c6097fb763cda8316e354c341,Title,3a86f747e415c69e09ab2ac26cca6693,,N U T R O P P O
+51,"{'points': ((56.6929, 387.8764), (56.6929, 563.8764), (518.5568999999999, 563.8764), (518.5568999999999, 387.8764)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,3a86f747e415c69e09ab2ac26cca6693,NarrativeText,c2518dfccc2645da64ce2ad786aeb2c2,N U T R O P P O,"By harnessing the power and pride of our people and partnerships, we will proactively tackle inequalities so that children and young people can have the best start in life and so everyone can feel safe and thrive in a place of opportunity."
+52,"{'points': ((832.1166999999999, 567.5014), (832.1166999999999, 622.058), (1001.0684, 622.058), (1001.0684, 567.5014)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,f039d64c6097fb763cda8316e354c341,Title,9958a638d5d4877409964d06ed383f54,,Y SAFER T
+53,"{'points': ((832.1166999999999, 623.7080000000001), (832.1166999999999, 633.443), (865.1166999999999, 633.443), (865.1166999999999, 623.7080000000001)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,f039d64c6097fb763cda8316e354c341,Title,7f8ff14b7c921dca88b00f305ee24d05,,I
+54,"{'points': ((832.1166999999999, 635.093), (832.1166999999999, 802.2379999999999), (865.1166999999999, 802.2379999999999), (865.1166999999999, 635.093)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,f039d64c6097fb763cda8316e354c341,Title,ba72d825e9763542c9e2c6faa977b5e9,,N U M M O C
+55,"{'points': ((1129.6716000000001, 408.99129999999997), (1129.6716000000001, 540.6162999999999), (1156.6716000000001, 540.6162999999999), (1156.6716000000001, 408.99129999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,5,2b8b4fcc3a93c0ad3c1bf3ad291aa535,Title,a69216c578d87019b4b951483f8529f1,,H E A L T H Y
+56,"{'points': ((909.2125, 66.7077999999999), (909.2125, 100.7077999999999), (1024.7113, 100.7077999999999), (1024.7113, 66.7077999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,69f82064b95142d2809e1022612907d5,NarrativeText,cf915494fed928add3b0a9b4e021f05e,,22.2% of pupils eligible for and claiming free school meals
+57,"{'points': ((56.6929, 98.23889999999994), (56.6929, 208.23889999999994), (386.58889999999997, 208.23889999999994), (386.58889999999997, 98.23889999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,69f82064b95142d2809e1022612907d5,Title,3c50098d46a8d3b6092f80593611566a,,State of the Borough
+58,"{'points': ((425.9055, 108.74349999999993), (425.9055, 142.74349999999993), (540.0138, 142.74349999999993), (540.0138, 108.74349999999993)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,3c50098d46a8d3b6092f80593611566a,NarrativeText,b1feb02ad0e3f70acdf88ceff7072c51,State of the Borough,"Lambeth is an inner south London borough with 317,600 residents"
+59,"{'points': ((425.9055, 175.92459999999994), (425.9055, 197.92459999999994), (532.9254999999999, 197.92459999999994), (532.9254999999999, 175.92459999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,69f82064b95142d2809e1022612907d5,Title,acc38fc0d1d0f4b4cfde3f8c8d397adb,,9th largest population in London
+60,"{'points': ((647.244, 139.06819999999993), (647.244, 233.06819999999993), (772.564, 233.06819999999993), (772.564, 139.06819999999993)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,2af3bd319981e4d22ccd62da18d82348,9th largest population in London,"Lambeth has a high concentration of people between ages 20 and 40 making it a comparatively young borough, but we are seeing a decrease in children in the borough over time"
+61,"{'points': ((931.8897, 128.2197), (931.8897, 150.2197), (1007.8091, 150.2197), (1007.8091, 128.2197)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,79ed3fe98ecc0349b2da3cf9937fc14e,9th largest population in London,17.8% of pupils identify as SEN
+62,"{'points': ((919.1338, 183.40069999999992), (919.1338, 217.40069999999992), (1107.1237999999998, 217.40069999999992), (1107.1237999999998, 183.40069999999992)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,d557329ed1f8f0132cf4bd0bd7f72ea0,9th largest population in London,"In Lambeth there are 63,200 children (up to 18), of which 43% live in poverty, after housing costs"
+63,"{'points': ((971.5747, 253.41649999999993), (971.5747, 299.4164999999999), (1120.1137, 299.4164999999999), (1120.1137, 253.41649999999993)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,4239e38dfe513d9cee716f45a93c2d00,9th largest population in London,"63% of our children and young people are Black, Asian or Multi-Ethnic compared with 21% nationally"
+64,"{'points': ((86.2677, 416.7217), (86.2677, 438.7217), (251.1077, 438.7217), (251.1077, 416.7217)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,1661e21c6473b6e0c7eaa59935399402,9th largest population in London,"At 22,200 Lambeth has the largest LGBTQ+ population in London"
+65,"{'points': ((361.1338, 323.5031), (361.1338, 345.5031), (514.9928, 345.5031), (514.9928, 323.5031)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,e7919a490822466a6086fef0605d33f7,9th largest population in London,Lambeth’s population is diverse and multicultural
+66,"{'points': ((664.7952, 391.8683), (664.7952, 425.8683), (820.5332, 425.8683), (820.5332, 391.8683)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,03e02e55f797b9ba9121a60747141b1e,9th largest population in London,Schools: 83 schools in Lambeth are rated good and outstanding
+67,"{'points': ((933.8504, 409.1722), (933.8504, 455.1722), (1109.5804, 455.1722), (1109.5804, 409.1722)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,82afa749f35b8a1a7621148b4510334e,9th largest population in London,"In 2022, 82% of employed residents are paid at least the London Living Wage – with the 2022 annual median gross weekly pay £749.40"
+68,"{'points': ((664.7952, 464.7187), (664.7952, 474.7187), (826.0952000000001, 474.7187), (826.0952000000001, 464.7187)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,bbc612cc5859242491a2d8225ecc3e67,9th largest population in London,(that’s 93.3% of Lambeth schools)
+69,"{'points': ((71.4803, 554.6745), (71.4803, 588.6745), (168.72030000000004, 588.6745), (168.72030000000004, 554.6745)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,b291ea693d4554ddf68d861edb16e086,9th largest population in London,"The (mean) average house price in Lambeth is £689,009"
+70,"{'points': ((289.2756, 524.0989999999999), (289.2756, 570.0989999999999), (382.97479999999996, 570.0989999999999), (382.97479999999996, 524.0989999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,5973ec8671c0c31f6eeb172d2193a46d,9th largest population in London,54.9% of Lambeth residents have a religion and 37.5% have no religion
+71,"{'points': ((660.7086, 546.3122000000001), (660.7086, 580.3122000000001), (761.4471, 580.3122000000001), (761.4471, 546.3122000000001)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,b070fb1e70b7bb155b22f9b5aeff7c71,9th largest population in London,38.6% of Lambeth residents were born outside of the UK
+72,"{'points': ((933.8504, 494.0226), (933.8504, 516.0226), (1110.2204000000002, 516.0226), (1110.2204000000002, 494.0226)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,acc38fc0d1d0f4b4cfde3f8c8d397adb,NarrativeText,26eb14a20018dbcb95be11b1158eac46,9th largest population in London,"In January 2023, 11,950 (4.9%) of the population are on universal credit"
+73,"{'points': ((71.4803, 624.6902), (71.4803, 646.6902), (132.7903, 646.6902), (132.7903, 624.6902)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,69f82064b95142d2809e1022612907d5,Title,9871689619e6ef50b697e4e39f60dbe0,,12th highest in London
+74,"{'points': ((647.0078, 614.7926), (647.0078, 636.7926), (756.6666, 636.7926), (756.6666, 614.7926)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,9871689619e6ef50b697e4e39f60dbe0,NarrativeText,5c09ff3780ff85eb0b62e4af7d4da294,12th highest in London,There are 130 number of languages spoken.
+75,"{'points': ((1011.0944, 625.7768), (1011.0944, 647.7768), (1114.0334, 647.7768), (1114.0334, 625.7768)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,9871689619e6ef50b697e4e39f60dbe0,NarrativeText,2698e228d77d8198567e4a484ac5d9ab,12th highest in London,36.4% of all waste is reused or recycled
+76,"{'points': ((398.8346, 659.5091), (398.8346, 705.5091), (533.7968, 705.5091), (533.7968, 659.5091)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,9871689619e6ef50b697e4e39f60dbe0,NarrativeText,20bb25d9e727c1bcb7fd172b39543a71,12th highest in London,Life expectancy in Lambeth is low compared to London at 78.6 years for males and 83.2 years for females
+77,"{'points': ((814.6299, 680.1106), (814.6299, 726.1106), (962.9399000000001, 726.1106), (962.9399000000001, 680.1106)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,9871689619e6ef50b697e4e39f60dbe0,NarrativeText,024dc0883ced3b9e8de0a9ba4c0d6d15,12th highest in London,17% of the population indicate that their day-to-day activities are limited to some extent by health problems or a disability
+78,"{'points': ((1011.0944, 680.9579), (1011.0944, 690.9579), (1116.0944, 690.9579), (1116.0944, 680.9579)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,69f82064b95142d2809e1022612907d5,Title,322195343ffda925324dfee0ef455696,,12th lowest in London
+79,"{'points': ((150.2362, 750.7374), (150.2362, 760.7374), (308.9452, 760.7374), (308.9452, 750.7374)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,322195343ffda925324dfee0ef455696,NarrativeText,cfb41226537acf7f773afc443b717979,12th lowest in London,17.3% of Lambeth is green space
+80,"{'points': ((398.8346, 738.6902), (398.8346, 784.6902), (529.9346, 784.6902), (529.9346, 738.6902)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,322195343ffda925324dfee0ef455696,NarrativeText,de0568eab4e32fe7a65b24a2197a7735,12th lowest in London,(Lambeth males have the 2nd lowest in London and Lambeth females have the 5th lowest in London)
+81,"{'points': ((814.6299, 764.961), (814.6299, 774.961), (919.6299, 774.961), (919.6299, 764.961)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,69f82064b95142d2809e1022612907d5,Title,a2f2d97887839660665cda66f66c7e82,,11th lowest in London
+82,"{'points': ((150.2362, 791.0839), (150.2362, 801.0839), (252.4562, 801.0839), (252.4562, 791.0839)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,6,,Footer,ecd77a4383614b6f19b7f410a201816d,,5th lowest in London
+83,"{'points': ((56.6929, 98.23889999999994), (56.6929, 208.23889999999994), (434.2449, 208.23889999999994), (434.2449, 98.23889999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,dbabedbf3276fe708dc73b00ddb8a9be,Title,927b317b2b9b9bca850d9e67e40e6488,,Our Previous Borough Plan
+84,"{'points': ((56.6929, 250.98749999999995), (56.6929, 268.98749999999995), (317.15110000000016, 268.98749999999995), (317.15110000000016, 250.98749999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,dbabedbf3276fe708dc73b00ddb8a9be,Title,c1ec8f1102b5d6cfba7ca5cd9095f0c0,,About the Borough Plan 2018–22
+85,"{'points': ((56.6929, 292.0227), (56.6929, 314.0227), (263.4279, 314.0227), (263.4279, 292.0227)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,NarrativeText,796f7c074d483f421c297573c1d2524f,About the Borough Plan 2018–22,Our previous Borough Plan was formed around five pillars:
+86,"{'points': ((56.6929, 327.95259999999996), (56.6929, 337.9926), (261.41049999999996, 337.9926), (261.41049999999996, 327.95259999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,ListItem,42f3f76f59968a9eab0a5fb4620f6ea9,About the Borough Plan 2018–22,1. Enable sustainable growth and development
+87,"{'points': ((56.6929, 345.6219), (56.6929, 355.6619), (199.5505, 355.6619), (199.5505, 345.6219)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,ListItem,3e8ed12b7dd15d2dae6341abe4f417a0,About the Borough Plan 2018–22,2. Increase community resilience
+88,"{'points': ((56.6929, 363.29119999999995), (56.6929, 385.29119999999995), (268.48949999999996, 385.29119999999995), (268.48949999999996, 363.29119999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,ListItem,3dcec8d72190a2839b0b4e9744bee164,About the Borough Plan 2018–22,3. Promote care and independence by reforming services
+89,"{'points': ((306.1417, 291.95269999999994), (306.1417, 397.9527), (539.3397, 397.9527), (539.3397, 291.95269999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,NarrativeText,3ac8406f4da305989c0a3d1462433f0d,About the Borough Plan 2018–22,"The global covid-19 pandemic required unpredictable action and unparalleled levels of partnership working to protect the most vulnerable and support businesses and jobs. The pandemic brought our local government, public health team, local NHS and the VCS sector closer together to deliver comprehensive support and care – and we should be collectively proud that our efforts stand Lambeth in good stead as we continue to emerge from the crisis."
+90,"{'points': ((56.6929, 392.96049999999997), (56.6929, 414.96049999999997), (284.4104999999999, 414.96049999999997), (284.4104999999999, 392.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,ListItem,4cc99b782192f3d192e363a7d7887508,About the Borough Plan 2018–22,"4. Make Lambeth a place where people want to live, work and invest"
+91,"{'points': ((56.6929, 422.6298), (56.6929, 546.2991), (289.12289999999996, 546.2991), (289.12289999999996, 422.6298)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,ListItem,a0039dd07a1b7b7f193a650b286dad81,About the Borough Plan 2018–22,"5. A further fifth pillar was consulted on and agreed in 2020 articulating our vision in terms of EDI: be passionate about equality, strengthening diversity and delivering inclusion. Each pillar was underpinned by a total of 20 goals to enable the delivery of the ambitions. The Council’s administration over the four years of the Borough Plan set itself four guiding principles that underpinned decision-making, policy implementation, prioritisation and allocation of expenditure and delivery of services."
+92,"{'points': ((56.6929, 560.3690999999999), (56.6929, 570.3690999999999), (109.37790000000001, 570.3690999999999), (109.37790000000001, 560.3690999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,NarrativeText,1305adfa0b6b080fa88afece59c421bd,About the Borough Plan 2018–22,These were:
+93,"{'points': ((56.6929, 584.2991), (56.6929, 594.3391), (136.9515, 594.3391), (136.9515, 584.2991)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,ListItem,0f76a18ab570337a0e59ebf1ccd5968d,About the Borough Plan 2018–22,1. Value for money
+94,"{'points': ((306.1417, 411.9527), (306.1417, 469.9527), (541.2587, 469.9527), (541.2587, 411.9527)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,NarrativeText,8139525cf2d691d25926abb9336c8988,About the Borough Plan 2018–22,"The murders of George Floyd and Sarah Everard were appalling crimes that caused deep repercussions in our borough, inflicting trauma on our communities that needs to be healed and calls to action for institutions that need to be heeded."
+95,"{'points': ((306.1417, 483.9527), (306.1417, 613.9526), (539.1916999999999, 613.9526), (539.1916999999999, 483.9527)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,NarrativeText,f7d356c135ad1ffdfe0f06ebaa049180,About the Borough Plan 2018–22,"The Council sought to work with communities on the changes that were needed to make Lambeth safer and more equal. Through pioneering work on diversity in the public realm which engaged younger people and residents in a debate on the Lambeth of today, and through the publication and launch of our preventing violence among women and girls, we have placed Lambeth in a leadership position to make a real and lasting difference and to reduce the scourge of attacks on women and girls, calling on men to change their own behaviours."
+96,"{'points': ((56.6929, 601.9684), (56.6929, 612.0083999999999), (145.2715, 612.0083999999999), (145.2715, 601.9684)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,ListItem,d688e8199bca7cdae010b4d6d51fab67,About the Borough Plan 2018–22,2. Tackling inequality
+97,"{'points': ((56.6929, 619.6377), (56.6929, 629.6777), (125.47049999999999, 629.6777), (125.47049999999999, 619.6377)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,ListItem,39a820ba94781bf575b2c9d6fd15fc6d,About the Borough Plan 2018–22,3. Transparency
+98,"{'points': ((56.6929, 637.307), (56.6929, 647.347), (124.9015, 647.347), (124.9015, 637.307)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,ListItem,ac2fdba553e4725d1ae6d4f8e0e11689,About the Borough Plan 2018–22,4. Collaboration
+99,"{'points': ((56.6929, 666.9763), (56.6929, 712.9763), (273.60089999999997, 712.9763), (273.60089999999997, 666.9763)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,NarrativeText,b0d9c553449ab0019583b550a122d26c,About the Borough Plan 2018–22,"It is important to acknowledge the unprecedented and significant global events that occurred during this time, as we collectively sought to deliver on these goals."
+100,"{'points': ((306.1417, 627.9926), (306.1417, 721.9926), (538.3397, 721.9926), (538.3397, 627.9926)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,7,c1ec8f1102b5d6cfba7ca5cd9095f0c0,NarrativeText,962b1e0843324fcac8b1fe21c7170d43,About the Borough Plan 2018–22,"Against the backdrop of these local and global challenges, we began the process to design this new “Our Future, Our Lambeth” Borough Plan. In doing so, we reflected on what the Council and its borough partners had achieved over the last four years, what we haven’t got right and what we must build on and must remain central to our ambitions as we look forward to 2030."
+101,"{'points': ((56.6929, 98.23889999999994), (56.6929, 208.23889999999994), (548.9889000000002, 208.23889999999994), (548.9889000000002, 98.23889999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,d080b49c0c3cb34fbf8eb1c6873e1864,Title,9c8f673d178c68975d7f97d92d32061f,,Our Shared Vision: Lambeth 2030
+102,"{'points': ((649.1338, 70.33939999999996), (649.1338, 125.33939999999996), (869.748, 125.33939999999996), (869.748, 70.33939999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,d080b49c0c3cb34fbf8eb1c6873e1864,Title,719cd6ea76b3fc61744d2f14b2522636,,Our Borough Plan Engagement
+103,"{'points': ((56.6929, 251.05949999999996), (56.6929, 269.05949999999996), (162.3709, 269.05949999999996), (162.3709, 251.05949999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,d080b49c0c3cb34fbf8eb1c6873e1864,Title,aa47134e0039c8b474b666d08dc89c48,,The process
+104,"{'points': ((306.1417, 251.05949999999996), (306.1417, 269.05949999999996), (406.14970000000005, 269.05949999999996), (406.14970000000005, 251.05949999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,d080b49c0c3cb34fbf8eb1c6873e1864,Title,5b11037a04404195c2a50cc35bc037b3,,The results
+105,"{'points': ((56.6929, 281.27549999999997), (56.6929, 339.27549999999997), (273.8108999999999, 339.27549999999997), (273.8108999999999, 281.27549999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,5b11037a04404195c2a50cc35bc037b3,NarrativeText,c5873d6dbb29bddb3f20be2f956fe549,The results,Our residents are local experts and are the people who know Lambeth best. Building the future of Lambeth will take all of us working together and we wanted to use this process as the start of our collective effort to shape the future of Lambeth.
+106,"{'points': ((56.6929, 353.27549999999997), (56.6929, 447.27549999999997), (290.7468999999998, 447.27549999999997), (290.7468999999998, 353.27549999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,5b11037a04404195c2a50cc35bc037b3,NarrativeText,fb796a6f77bec0826ab2067fbb2860ea,The results,"Everyone who lives, visits and works in the borough was invited to share what makes Lambeth unique and why it is important to them, what they would like the future Lambeth to look and feel like and the challenges they are facing now and anticipate on the road to 2030. Crucially, we asked what ideas for change people had also so that this Borough Plan and its ambitions can make a real difference to people’s lives."
+107,"{'points': ((56.6929, 461.27549999999997), (56.6929, 639.2755), (283.9808999999999, 639.2755), (283.9808999999999, 461.27549999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,5b11037a04404195c2a50cc35bc037b3,NarrativeText,79dc50411e89550203d7c5422ab9bfed,The results,"Building on the results of the Child Friendly Lambeth consultation, and data and learnings from the Citizens’ Assembly on the Climate Crisis and the Health and Wellbeing Strategy, Citizens’ Assembly and the Health and Wellbeing Strategy, across 2022 we held a series of open invitation workshops attended by residents and local voluntary and community organisations, ran several focussed discussions with charities and local organisations to better understand the perspectives of different resident groups and to ensure we were capturing a representative voice of Lambeth’s residents, we held weeks of on-street conversations in community and public spaces, and we also ran an open Lambeth 2030 survey."
+108,"{'points': ((56.6929, 653.3154999999999), (56.6929, 723.3155), (289.4899, 723.3155), (289.4899, 653.3154999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,5b11037a04404195c2a50cc35bc037b3,NarrativeText,c381e211f3d7e80e82e49df4788ed81f,The results,"‘Our Future, Our Lambeth’ is a product of these conversations. It has been shaped by our residents, local organisations and partners and is a result of your time, expertise, and passion – and it represents the beginning of our journey to Lambeth 2030 together."
+109,"{'points': ((306.1417, 281.27549999999997), (306.1417, 375.27549999999997), (540.8017, 375.27549999999997), (540.8017, 281.27549999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,5b11037a04404195c2a50cc35bc037b3,NarrativeText,a98ce3ad99812fe1b450dc30267a12f7,The results,"We know that the people of Lambeth are proud of its rich history and legacy of activism, and that our ability to be different and lead the way must be celebrated and not forgotten. Underpinning this is the pride in the diverse cultural offer that the people of Lambeth bring to one another, making it a place where people feel welcome, and our renowned institutions, venues and green spaces feel like home."
+110,"{'points': ((306.1417, 389.27549999999997), (306.1417, 459.27549999999997), (540.6316999999999, 459.27549999999997), (540.6316999999999, 389.27549999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,5b11037a04404195c2a50cc35bc037b3,NarrativeText,612e9cabfe13286e7ee6fd0393b8a9ee,The results,"Across Lambeth’s communities this pride and spirit has culminated in people coming together in inspiring, resilient partnerships, and there continues to be a strong and determined willingness to stand up to the challenges facing us in the here and now to improve and fulfil the lives of others throughout the borough."
+111,"{'points': ((306.1417, 473.3155), (306.1417, 531.3154999999999), (515.3717000000003, 531.3154999999999), (515.3717000000003, 473.3155)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,5b11037a04404195c2a50cc35bc037b3,NarrativeText,422b3b94862e7a6abaa7e2a4681cb3da,The results,"It is this connectedness to one another, our neighbours, our spaces and the borough that people have told us they want to be nurtured and grown as we look forwards to the future of Lambeth."
+112,"{'points': ((306.1417, 545.2755), (306.1417, 687.2755), (541.0906999999997, 687.2755), (541.0906999999997, 545.2755)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,5b11037a04404195c2a50cc35bc037b3,NarrativeText,36b7e4cb214c2ef1f661b32456e1d4a1,The results,"As well as retaining this uniqueness and enabling people to have a stronger stake in their borough and its future, it is widely understood that the stark pressures of inequality and injustice and their distinct impacts are felt differently across our communities, with some feeling like they no longer have a place in Lambeth. These challenges, compounded by central government funding cuts to local services despite a rise in demand and need, mean that we need to be bold and innovative to create a borough that works for everyone, and that we need to do this through working together, listening to those who know best."
+113,"{'points': ((661.181, 313.1653), (661.181, 375.2213), (804.2881, 375.2213), (804.2881, 313.1653)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,d080b49c0c3cb34fbf8eb1c6873e1864,Title,edd13c42d34616cbf0887537835be90d,,4 roundtable events with over 50 Lambeth organisations and councillors
+114,"{'points': ((722.5511, 448.373), (722.5511, 510.31699999999995), (871.5811000000001, 510.31699999999995), (871.5811000000001, 448.373)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,edd13c42d34616cbf0887537835be90d,NarrativeText,534b92ed4410d8b5c1e3250faab10648,4 roundtable events with over 50 Lambeth organisations and councillors,"9 workshops open to people who live, work and visit Lambeth - attended by over"
+115,"{'points': ((731.889, 512.317), (731.889, 526.317), (801.917, 526.317), (801.917, 512.317)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,d080b49c0c3cb34fbf8eb1c6873e1864,Title,fc9f23cc12f2eb230c9908898cd9fa36,,150 people
+116,"{'points': ((959.244, 327.3478), (959.244, 373.4038), (1106.2133000000001, 373.4038), (1106.2133000000001, 327.3478)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,d080b49c0c3cb34fbf8eb1c6873e1864,Title,c32aa38f68c4bd57435140eddaa73ac5,,Borough Plan Design Week with over 200 Lambeth Council staff
+117,"{'points': ((935.0787, 462.9851), (935.0787, 556.9291), (1092.9427, 556.9291), (1092.9427, 462.9851)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,c32aa38f68c4bd57435140eddaa73ac5,NarrativeText,7d6bd3a9017009888eaeb8f29470aa1f,Borough Plan Design Week with over 200 Lambeth Council staff,7 focussed workshops with local Lambeth organisations and their services-users - attended by over 80 people
+118,"{'points': ((877.0394, 697.0796), (877.0394, 775.0236), (1039.6214, 775.0236), (1039.6214, 697.0796)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,8,c32aa38f68c4bd57435140eddaa73ac5,NarrativeText,5b63d1651f69d9190d8e6a84c3e0d9b9,Borough Plan Design Week with over 200 Lambeth Council staff,"2 weeks of market research across public spaces in Lambeth, In community spaces complet- asking people their vision ing the Lambeth 2030 survey for Lambeth in 2030"
+119,"{'points': ((651.9684, 185.24689999999998), (651.9684, 219.24689999999998), (1056.939, 219.24689999999998), (1056.939, 185.24689999999998)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,e4d0ee24451a6b41cab62175bc88fbad,Title,22178f67d8bd4ff68b9d791a395b78fb,,"Our Future, Our Lambeth"
+120,"{'points': ((651.9684, 266.6309), (651.9684, 288.6309), (871.7263999999999, 288.6309), (871.7263999999999, 266.6309)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,e4d0ee24451a6b41cab62175bc88fbad,Title,1294961463a517bf74ee1f319dc880d7,,Our Vision Statement
+121,"{'points': ((651.9684, 303.9321), (651.9684, 421.9321), (1133.2667999999999, 421.9321), (1133.2667999999999, 303.9321)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,1294961463a517bf74ee1f319dc880d7,NarrativeText,7c7cf1754b0983e21d9b69c392028f15,Our Vision Statement,"Lambeth – a borough with social and climate justice at its heart. By harnessing the power and pride of our people and partnerships, we will proactively tackle inequalities so that children and young people can have the best start in life and so everyone can feel safe and thrive in a place of opportunity."
+122,"{'points': ((56.6927, 467.90989999999994), (56.6927, 485.90989999999994), (143.6867, 485.90989999999994), (143.6867, 467.90989999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,e4d0ee24451a6b41cab62175bc88fbad,Title,41e1762402642666e8999cf2deed62a2,,The vision
+123,"{'points': ((56.6927, 498.12589999999994), (56.6927, 544.1259), (284.1516999999999, 544.1259), (284.1516999999999, 498.12589999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,41e1762402642666e8999cf2deed62a2,NarrativeText,178b87a301fbc8f2b0642232ee4dd316,The vision,"Through listening and building on what we already know, we’ve created a vision for the future of Lambeth that’s rooted in what people want. This is a vision that belongs to everyone."
+124,"{'points': ((56.6927, 558.1259), (56.6927, 604.1259), (283.21169999999995, 604.1259), (283.21169999999995, 558.1259)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,41e1762402642666e8999cf2deed62a2,NarrativeText,1732703329ffb7968ac3684915a06799,The vision,"Achieving this future vision of Lambeth comes down to all of us. We are all connected, and we all have a stake in Lambeth to make it the best place to live, work and visit in the UK."
+125,"{'points': ((56.6927, 618.1259), (56.6927, 688.1259), (291.5706999999999, 688.1259), (291.5706999999999, 618.1259)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,41e1762402642666e8999cf2deed62a2,NarrativeText,3d63d118d4df5967ff4a47e4639a7449,The vision,"From our conversations we know people agree with a group of core priorities and ambitions for the future of Lambeth. They are ready to come together and bring this vision to life, and there is also strong support in the shift towards taking a longer-term view, so that we are ready for the unforeseen challenges of the future."
+126,"{'points': ((306.1415, 467.90989999999994), (306.1415, 505.90989999999994), (469.07750000000004, 505.90989999999994), (469.07750000000004, 467.90989999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,e4d0ee24451a6b41cab62175bc88fbad,Title,6dd2bf18c9e5bcc95fafe5113d53bf67,,Our Commitments for Lambeth
+127,"{'points': ((306.1415, 518.1259), (306.1415, 576.1259), (536.1814999999999, 576.1259), (536.1814999999999, 518.1259)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,6dd2bf18c9e5bcc95fafe5113d53bf67,NarrativeText,09fd6c6d0d1a4f72123b00821627d358,Our Commitments for Lambeth,To deliver this vision requires individual and collective commitment and action. This means the Council and Lambeth’s communities and organisations coming together and standing as one to transform the ways we work.
+128,"{'points': ((306.1415, 590.1259), (306.1415, 696.1259), (538.2314999999999, 696.1259), (538.2314999999999, 590.1259)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,6dd2bf18c9e5bcc95fafe5113d53bf67,NarrativeText,e5b8a3c2f7319ed2d6e40b13628f3666,Our Commitments for Lambeth,"By listening to our communities, understanding their experiences, and aligning our priorities with theirs, we can build confidence between Lambeth’s institutions, businesses, community groups and organisations, and residents, and we can ensure that the changes we make, and the partnerships we form, are all contributing to improving the lives of those who live, work, learn and visit our borough. -"
+129,"{'points': ((651.9684, 441.60139999999996), (651.9684, 479.60139999999996), (814.9044, 479.60139999999996), (814.9044, 441.60139999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,e4d0ee24451a6b41cab62175bc88fbad,Title,b7630bf5483ae7de3e4c255bba67dad3,,Our Commitments for Lambeth
+130,"{'points': ((651.9684, 491.8574), (651.9684, 540.652), (875.915, 540.652), (875.915, 491.8574)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,b7630bf5483ae7de3e4c255bba67dad3,ListItem,1004efbb64d0107eaee9d14bb648da67,Our Commitments for Lambeth,1. We get the basics right and deliver great public services that fit what people need - We will take a one borough approach to deliver our services consistently and well
+131,"{'points': ((651.9684, 554.0306), (651.9684, 576.0306), (879.047, 576.0306), (879.047, 554.0306)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,b7630bf5483ae7de3e4c255bba67dad3,ListItem,5347e793e9bbbf00f9dc2cf996b3dc26,Our Commitments for Lambeth,2. People have a say and stake in the decisions that matter
+132,"{'points': ((651.9684, 554.0306), (651.9684, 576.0306), (879.047, 576.0306), (879.047, 554.0306)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,b7630bf5483ae7de3e4c255bba67dad3,ListItem,00f854932994a7aac90c5bb1ca7f76c3,Our Commitments for Lambeth,2. People have a say and stake in the decisions that matter
+133,"{'points': ((656.2204, 580.8253), (656.2204, 602.8253), (866.8770000000001, 602.8253), (866.8770000000001, 580.8253)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,b7630bf5483ae7de3e4c255bba67dad3,ListItem,38c01b1dd61f9a649e766a067db21ae7,Our Commitments for Lambeth,We will be a listening and open borough that recognises and values our community voices
+134,"{'points': ((651.9684, 616.2039), (651.9684, 638.2039), (875.894, 638.2039), (875.894, 616.2039)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,b7630bf5483ae7de3e4c255bba67dad3,ListItem,6d75e99c18534091fcd72f5c8078a67f,Our Commitments for Lambeth,"3. We work together in partnership, to harness what makes Lambeth special"
+135,"{'points': ((656.2204, 642.9984999999999), (656.2204, 664.9984999999999), (884.0328999999999, 664.9984999999999), (884.0328999999999, 642.9984999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,b7630bf5483ae7de3e4c255bba67dad3,ListItem,badf051f232860031164fe4bb2a7cf84,Our Commitments for Lambeth,We will collaborate with our people and partners to innovate and implement together
+136,"{'points': ((651.9684, 678.3770999999999), (651.9684, 700.3770999999999), (882.5638, 700.3770999999999), (882.5638, 678.3770999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,b7630bf5483ae7de3e4c255bba67dad3,ListItem,3d859a5a8e6183d4a914cad6c620dd8e,Our Commitments for Lambeth,"4. We are accessible, transparent and we stand up to challenges"
+137,"{'points': ((901.4172, 441.60139999999996), (901.4172, 459.60139999999996), (1069.7351999999998, 459.60139999999996), (1069.7351999999998, 441.60139999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,e4d0ee24451a6b41cab62175bc88fbad,Title,fc0e3b82e9e1d5c6ac8b771a04a1cd90,,Our 2030 Ambitions
+138,"{'points': ((901.4172, 471.81739999999996), (901.4172, 493.81739999999996), (1089.2351999999998, 493.81739999999996), (1089.2351999999998, 471.81739999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,e4d0ee24451a6b41cab62175bc88fbad,Title,8ad7b99318b61416c76120ef53cea553,,The Golden Thread - A Borough of Equity and Justice
+139,"{'points': ((901.4172, 507.81739999999996), (901.4172, 529.8173999999999), (1070.7735, 529.8173999999999), (1070.7735, 507.81739999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,8ad7b99318b61416c76120ef53cea553,ListItem,185e97b9c6227d448c1f5d1b6eef1cbe,The Golden Thread - A Borough of Equity and Justice,1. Making Lambeth Neighbourhoods Fit for the Future
+140,"{'points': ((901.4172, 543.156), (901.4172, 553.1959999999999), (1074.1135, 553.1959999999999), (1074.1135, 543.156)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,8ad7b99318b61416c76120ef53cea553,ListItem,f988d4206948852167ca88192d403344,The Golden Thread - A Borough of Equity and Justice,2. Making Lambeth One of the Safest
+141,"{'points': ((901.4172, 578.4946), (901.4172, 600.4946), (1081.6733, 600.4946), (1081.6733, 578.4946)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,8ad7b99318b61416c76120ef53cea553,ListItem,1282871864bc35c1d3aacc9ab94ab754,The Golden Thread - A Borough of Equity and Justice,3. Making Lambeth A Place We Can All Call Home
+142,"{'points': ((656.2204, 705.1717), (656.2204, 739.1717), (882.1949999999999, 739.1717), (882.1949999999999, 705.1717)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,9,8ad7b99318b61416c76120ef53cea553,ListItem,1437d750d820437974717fcd32efdb8d,The Golden Thread - A Borough of Equity and Justice,"We will focus on what our residents want and be honest about what we can and can’t do, whilst being courageous to take bold action"
+143,"{'points': ((649.1338, 60.99069999999995), (649.1338, 162.7806999999999), (825.5737999999999, 162.7806999999999), (825.5737999999999, 60.99069999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,acc6b20e4815b3142583703efeebbb6f,Title,fface9a9a230d37dfa46a79c22da6d9b,,1 Making Lambeth Neighbourhoods Fit for the Future
+144,"{'points': ((56.6929, 208.16829999999993), (56.6929, 240.16829999999993), (196.5969, 240.16829999999993), (196.5969, 208.16829999999993)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,acc6b20e4815b3142583703efeebbb6f,Title,072169a315b804ab16f1e5cf24534b83,,ambitions
+145,"{'points': ((56.6929, 267.9954), (56.6929, 417.99539999999996), (541.1809000000002, 417.99539999999996), (541.1809000000002, 267.9954)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,072169a315b804ab16f1e5cf24534b83,NarrativeText,ba2edbdf1c9201e17cb47dd00838954c,ambitions,"This is a significant moment in time for us all. As we continue to adapt to the post-covid landscape, we face the harsh realities of the cost-of-living crisis within the context of sustained uncertainty of the future of public sector finance. London’s housing crisis continues to threaten our diverse communities and we know that the very real challenges and impacts of the climate emergency are rapidly changing how we live."
+146,"{'points': ((946.2992, 310.9119999999999), (946.2992, 436.702), (1128.8533, 436.702), (1128.8533, 310.9119999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,acc6b20e4815b3142583703efeebbb6f,Title,0a4b8f9ef2aff1a3f1ec5d98f0c1047f,,2 Making Lambeth One of The Safest Boroughs in London
+147,"{'points': ((56.6929, 440.9606), (56.6929, 522.9606), (289.73089999999985, 522.9606), (289.73089999999985, 440.9606)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,0a4b8f9ef2aff1a3f1ec5d98f0c1047f,NarrativeText,da66d836a4ef98f052ac64bd80511212,2 Making Lambeth One of The Safest Boroughs in London,"As we look towards the future – one that allows each of us to thrive – we must take a focussed approach and positive action to build a stronger borough that delivers for everyone. We have identified three ambitions for Lambeth, around which we will harness the great energy and spirit of our residents and partners."
+148,"{'points': ((56.6929, 536.9606), (56.6929, 594.9606), (289.3418999999999, 594.9606), (289.3418999999999, 536.9606)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,0a4b8f9ef2aff1a3f1ec5d98f0c1047f,NarrativeText,fd67cfeafe8347b1ea31b9e687627df6,2 Making Lambeth One of The Safest Boroughs in London,"Inequality is at the heart of the challenges we face, and we are determined to tackle these head-on. To support us to do exactly that, we have developed the Lambeth Golden Thread – Equity and Justice – to run through the centre of all our ambitions for the future."
+149,"{'points': ((56.6929, 608.9606), (56.6929, 654.9606), (289.7118999999999, 654.9606), (289.7118999999999, 608.9606)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,0a4b8f9ef2aff1a3f1ec5d98f0c1047f,NarrativeText,ec10d615be5e09fef66cfe62c4b8f7f6,2 Making Lambeth One of The Safest Boroughs in London,"Our ambitions are bold – and they are intentionally tied together by a relentless commitment to tackle inequality at the root cause, focusing on what matters most to our residents."
+150,"{'points': ((306.1417, 440.9606), (306.1417, 498.9606), (518.6206999999999, 498.9606), (518.6206999999999, 440.9606)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,0a4b8f9ef2aff1a3f1ec5d98f0c1047f,NarrativeText,144bc3aed75f4ab2c0b37639a4822a81,2 Making Lambeth One of The Safest Boroughs in London,"We cannot do this alone. Lambeth belongs to all of us, and we all have a role to play in solving the persistent, deep-rooted challenges we face to improve the quality of life of everyone who calls Lambeth home."
+151,"{'points': ((306.1417, 512.9606), (306.1417, 546.9606), (528.6206999999999, 546.9606), (528.6206999999999, 512.9606)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,0a4b8f9ef2aff1a3f1ec5d98f0c1047f,NarrativeText,6d98ef93beacb0aa68096645d5df12f1,2 Making Lambeth One of The Safest Boroughs in London,"It is going to take unrelenting, radical effort to make the impact required to make Lambeth the place we want it to be."
+152,"{'points': ((306.1417, 564.6726), (306.1417, 670.6726), (530.8695, 670.6726), (530.8695, 564.6726)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,0a4b8f9ef2aff1a3f1ec5d98f0c1047f,NarrativeText,cf8f4bb8ad1c58b94505e19fb875dd25,2 Making Lambeth One of The Safest Boroughs in London,This plan is not just a blueprint for the future – it is a collective call to action owned by all of us who play a role in Lambeth.
+153,"{'points': ((649.1338, 600.2821), (649.1338, 702.0721), (825.5737999999999, 702.0721), (825.5737999999999, 600.2821)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,10,acc6b20e4815b3142583703efeebbb6f,Title,7ceb1632cb63365589ce4590d35ef91b,,3 Making Lambeth A Place We Can All Call Home
+154,"{'points': ((56.6929, 98.23889999999994), (56.6929, 208.23889999999994), (449.69530000000003, 208.23889999999994), (449.69530000000003, 98.23889999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,9044e7e9d00cd65d2e7fd9ab591bc067,Title,3a71efdd8685eeb2f4df18f4e1132b6b,,The Lambeth Golden Thread
+155,"{'points': ((56.6929, 267.8512999999999), (56.6929, 287.8512999999999), (421.0129000000001, 287.8512999999999), (421.0129000000001, 267.8512999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,3a71efdd8685eeb2f4df18f4e1132b6b,NarrativeText,f3ac75b434a216960c486a296b1a2324,The Lambeth Golden Thread,Lambeth – a borough of equity and justice.
+156,"{'points': ((56.6929, 299.92729999999995), (56.6929, 339.92729999999995), (490.8484, 339.92729999999995), (490.8484, 299.92729999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,3a71efdd8685eeb2f4df18f4e1132b6b,NarrativeText,e5820ca1bf64fc018c7f5a3e530ff169,The Lambeth Golden Thread,"By 2030, Lambeth will be a fair and equitable borough, which seeks and delivers justice through all that we do."
+157,"{'points': ((56.6929, 354.38849999999996), (56.6929, 496.38849999999996), (286.8328999999999, 496.38849999999996), (286.8328999999999, 354.38849999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,3a71efdd8685eeb2f4df18f4e1132b6b,NarrativeText,d1104f6b532647108c73e3a9206a0266,The Lambeth Golden Thread,"Lambeth is one of the most diverse boroughs in the country. Our history has been uniquely shaped by radicals, activists and changemakers – creating a welcoming borough with social justice at its core. We are the home of the Windrush Generation and have the largest LGBTQ+ community in London. We have a large Caribbean and African community and growing Spanish, Portuguese and South American communities across Lambeth, and deeply rooted faith communities across the borough. We also have a thriving community sector, advocating for the rights of women, disabled residents, older people and many more."
+158,"{'points': ((56.6929, 510.38849999999996), (56.6929, 592.3885), (285.71289999999976, 592.3885), (285.71289999999976, 510.38849999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,3a71efdd8685eeb2f4df18f4e1132b6b,NarrativeText,e8968ac652b461dde1447855dfd034db,The Lambeth Golden Thread,"We are the home of pioneers. Our communities and our diversity are our greatest strength, with vast amounts of talent and potential to harness. There is an energy in Lambeth that is unmatched elsewhere – and we want to use this to catalyse greater change and make Lambeth a place where everyone can live safe, healthy, and thriving lives."
+159,"{'points': ((306.1417, 354.38849999999996), (306.1417, 376.38849999999996), (534.6316999999998, 376.38849999999996), (534.6316999999998, 354.38849999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,3a71efdd8685eeb2f4df18f4e1132b6b,NarrativeText,f42719339773382056d84179bf1947ec,The Lambeth Golden Thread,"everyone is able to have a good quality of life, but also to fight for the justice they deserve."
+160,"{'points': ((306.1417, 390.38849999999996), (306.1417, 604.3885), (539.6716999999998, 604.3885), (539.6716999999998, 390.38849999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,3a71efdd8685eeb2f4df18f4e1132b6b,NarrativeText,8de78e04db25d806a8ced0c6f5f1e936,The Lambeth Golden Thread,"We do not see equity as a separate ambition; instead, it is the golden thread that ties together all that we do in Lambeth. It is the engine of what will drive us forward to achieve our ambitions for 2030. We will develop a new framework for how we advance equality, diversity, and inclusion in Lambeth, both in the community and across our workforce. This will be locally and culturally relevant – and importantly, will be developed with our residents so that we deliver what matters most to people with protected characteristics and our diverse communities. As the first London borough to recognise care experience as a protected characteristic alongside our other additional protected characteristics – language, health and socio-economic status – we will continue to look for ways to go beyond our duty. And, we will embed equity and justice in all that we do as a borough, the Council will develop equity improvement priorities for each ambition, which will be published and reported on annually."
+161,"{'points': ((721.0918, 541.4307), (721.0918, 567.4307), (829.9272118, 567.4307), (829.9272118, 541.4307)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,9044e7e9d00cd65d2e7fd9ab591bc067,Title,848f842acb84466403717dcb649cf00d,,Activism
+162,"{'points': ((56.6929, 606.3885), (56.6929, 688.3885), (287.3228999999999, 688.3885), (287.3228999999999, 606.3885)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,848f842acb84466403717dcb649cf00d,NarrativeText,9ece160bf885911d5a006917e227f01e,Activism,"Looking at what we have achieved so far as a borough, we have made progress around equality, diversity, and inclusion. But we are not complacent – there is still much further to go. Our commitment is to all our communities, to work together to tackle inequality at the root cause and maintain a continuous conversation on what matters most and the areas of focus over the coming years."
+163,"{'points': ((56.6929, 702.3885), (56.6929, 808.3885), (290.11289999999985, 808.3885), (290.11289999999985, 702.3885)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,848f842acb84466403717dcb649cf00d,NarrativeText,4c443520d8d6bce1d268bf3e1efa9b0a,Activism,"This is the moment we shift the dial to have greater impact. We will facilitate a targeted, partnership approach to tackling inequality – with a focus on delivering equity for our diverse communities. We will continue to celebrate the richness of diversity across Lambeth – but do more to honour our heritage, ensuring this is felt within the way we deliver services, as well as seen in the fabric and structures of our borough. And we will stand with our communities in Lambeth, not only to ensure that"
+164,"{'points': ((306.1417, 618.3885), (306.1417, 736.3885), (538.9217000000001, 736.3885), (538.9217000000001, 618.3885)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,848f842acb84466403717dcb649cf00d,NarrativeText,dd77c8911ea386bae7d7a1a08fe75356,Activism,"We will work with the tenacity and respect that our communities deserve to make Lambeth a fairer and more equitable borough – but this will be a collaborative effort. To support this, we will develop a refreshed Lambeth United Equity and Inclusion Partnership, which will provide the fuel we need to achieve our goals. This will bring together stakeholders from across the borough – residents, businesses, institutions, and public services – to design and deliver equity missions across race, sexuality, gender, disability and faith."
+165,"{'points': ((306.1417, 750.3885), (306.1417, 796.3885), (536.3427, 796.3885), (536.3427, 750.3885)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,11,848f842acb84466403717dcb649cf00d,NarrativeText,1c1c6d4d55808a65853176c06e63c23a,Activism,"It is a bold ambition, but we are carried by the spirit of Lambeth shaped by those who have come before us. Together, we will create a more equitable and just future for Lambeth."
+166,"{'points': ((306.1417, 119.49860000000001), (306.1417, 223.4986), (1083.4917000000007, 223.4986), (1083.4917000000007, 119.49860000000001)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,ef8cd646a262f35339fbd95ec6320731,Title,e5430c5d191951710b94477cb94e0b9f,,1 - Making Lambeth Neighbourhoods Fit for the Future
+167,"{'points': ((651.9685, 267.9954), (651.9685, 307.9954), (1133.4468999999997, 307.9954), (1133.4468999999997, 267.9954)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,e5430c5d191951710b94477cb94e0b9f,NarrativeText,1cfc8b712df96d81b5718bf29584d069,1 - Making Lambeth Neighbourhoods Fit for the Future,"By 2030, Lambeth will be a clean, vibrant and climate resilient borough where people can lead healthier, happier lives."
+168,"{'points': ((651.9685, 330.96049999999997), (651.9685, 460.96049999999997), (879.0585, 460.96049999999997), (879.0585, 330.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,e5430c5d191951710b94477cb94e0b9f,NarrativeText,4eff3f99194a575a9958716d298e3e24,1 - Making Lambeth Neighbourhoods Fit for the Future,"The on-going and lasting impacts of the climate emergency, the cost-of-living crisis and the covid-19 pandemic mean that we must radically change the way we think and act in Lambeth. From continued lobbying for environmental and fairer legislative changes and investment at national level, to adapting the way we travel, design buildings and public spaces, and make local, healthy and more sustainable choices, we all have a role to play in improving the health, wellbeing and environment of others in Lambeth now and for future generations."
+169,"{'points': ((651.9685, 474.96049999999997), (651.9685, 580.9604999999999), (884.5184999999997, 580.9604999999999), (884.5184999999997, 474.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,e5430c5d191951710b94477cb94e0b9f,NarrativeText,a492fc079c6efd9cbe242d156ee94cdf,1 - Making Lambeth Neighbourhoods Fit for the Future,"The people of Lambeth are proud of their local area and have already taken steps to tackle climate change, but there is a desire and urgency within our communities for further decisive, collaborative action. Residents share that their health, carbon footprint and their streets matter to them, and they want to commit to improving our shared environment, ensuring that everyone knows how they can contribute, however small."
+170,"{'points': ((651.9685, 596.7445), (651.9685, 614.7445), (805.1845, 614.7445), (805.1845, 596.7445)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,ef8cd646a262f35339fbd95ec6320731,Title,82ef92cbb73525079b7dc6d507f1a8d4,,Climate resilience
+171,"{'points': ((901.4174, 330.96049999999997), (901.4174, 424.96049999999997), (1134.7697, 424.96049999999997), (1134.7697, 330.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,82ef92cbb73525079b7dc6d507f1a8d4,NarrativeText,72627277d86f5398741eb7de66d151d7,Climate resilience,"learning from this was key to the development of Lambeth’s first climate partnership, overseeing the implementation of the Lambeth Climate Action Plan. These are momentous actions that will help guide us to achieving our collective goal of a more sustainable and just future – ensuring that everything we do will make Lambeth a more sustainable, climate resilient borough for everyone."
+172,"{'points': ((901.4174, 438.96049999999997), (901.4174, 628.9604999999999), (1133.5964, 628.9604999999999), (1133.5964, 438.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,82ef92cbb73525079b7dc6d507f1a8d4,NarrativeText,e541aa73f82e0736d3698b12d035056c,Climate resilience,"Backed by our residents to act now, we are committing to tackling the climate and ecological emergency together. We will reduce greenhouse gas emissions from all sources we control or influence and build our resilience to the impacts of climate change through sustainable development and technologies. This will require a range of interventions and adaptations including improving flood prevention, more tree cover, sustainable urban drainage solutions and efficiency of water use. Based on the Citizens’ Assembly recommendations we also know the need to retrofit at scale to significantly improve energy efficiency of our buildings and focus on decarbonising our transport network to lower emissions, and we will continue to lobby the government tirelessly for funding for a national retrofit programme."
+173,"{'points': ((651.9685, 626.9604999999999), (651.9685, 660.9604999999999), (869.8764999999999, 660.9604999999999), (869.8764999999999, 626.9604999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,82ef92cbb73525079b7dc6d507f1a8d4,NarrativeText,57e4a8d9c699afa219249a3d68c08203,Climate resilience,"We face a global climate emergency, and we know that inaction or insufficient responses will have consequences of an irreversible nature."
+174,"{'points': ((651.9685, 674.9604999999999), (651.9685, 744.9605), (875.5784999999998, 744.9605), (875.5784999999998, 674.9604999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,82ef92cbb73525079b7dc6d507f1a8d4,NarrativeText,d008f497b69e15a4f5d7c57412fc2626,Climate resilience,"In January 2019, Lambeth became the first London Borough to declare a climate and ecological emergency and commit to becoming carbon neutral across our council buildings and operations by 2030. The Council reached out to residents through the Citizens’ Assembly on the climate crisis and"
+175,"{'points': ((901.4174, 642.9604999999999), (901.4174, 748.9605), (1131.2204, 748.9605), (1131.2204, 642.9604999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,12,82ef92cbb73525079b7dc6d507f1a8d4,NarrativeText,3e2eac4500b1f44f4d6282fddb5018d9,Climate resilience,"Our hyperconnected inner London, highly urbanised location requires us to continue to be led by design that reduces traffic and enables people to walk, cycle and use public transport to experience the borough safely and accessibly. This means coming together and rethinking our transport systems to be inclusive, enabling healthier, more affordable and sustainable ways to get around the borough - including improving our existing network, electric car clubs, e-scooters"
+176,"{'points': ((346.5354, 95.12560000000008), (346.5354, 135.12560000000008), (534.3822, 135.12560000000008), (534.3822, 95.12560000000008)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,c3918f2458a497b7d911ffe6210a0105,,"“The value of green spaces - let’s use and protect them to share, to connect, and to make us well”"
+177,"{'points': ((320.3113, 162.97320000000002), (320.3113, 202.97320000000002), (543.5353000000001, 202.97320000000002), (543.5353000000001, 162.97320000000002)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,5a1a262810ba30285e66bc61ad8b3719,,"“People want to be healthy - increase access to healthy food and places such as leisure, parks, and green spaces”"
+178,"{'points': ((651.9685, 95.12560000000008), (651.9685, 149.12560000000008), (872.9353000000001, 149.12560000000008), (872.9353000000001, 95.12560000000008)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,c53741319c89a4fc96434bb298f1293e,,“We need carbon-neutral streets and communities. Plant trees and add more green grass so we have cleaner air and prettier spaces”
+179,"{'points': ((737.7166, 186.30669999999998), (737.7166, 198.30669999999998), (888.2206, 198.30669999999998), (888.2206, 186.30669999999998)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,b5849b10dd60b027e493267ae9edbcf8,,“We all have a part to play”
+180,"{'points': ((901.4173, 126.07050000000004), (901.4173, 166.07050000000004), (1111.5121, 166.07050000000004), (1111.5121, 126.07050000000004)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,7d83b150acbab28e9af8521043c12fcb,,"“Better, greener transport means less pollution, less traffic, more walking, cycling and more trees”"
+181,"{'points': ((56.6929, 268.5713999999999), (56.6929, 446.5714), (288.93789999999984, 446.5714), (288.93789999999984, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,c946de9a3548f7a7219fbf3d47d70bd7,,"and cargo bike hire. We must build better and safer transport choices for people and reduce the demand and reliance on motor vehicle trips. Our collective aspiration for our streets also extends beyond the way we choose to travel. It is about reimagining the space on our streets, increasing Lambeth’s biodiversity and creating more people friendly initiatives. This has been set out in Lambeth’s trailblazing Kerbside Strategy on how we can reclaim kerbside space to make way for the largest community parklet programme in the capital. This future-thinking approach means that we can all benefit from the provision of more green space for people to meet and socialise, community gardens and outdoor seating as well as reduced traffic and noise pollution."
+182,"{'points': ((651.9684, 268.5713999999999), (651.9684, 362.5714), (878.8083999999999, 362.5714), (878.8083999999999, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,5793985e7e7ee6d38355932768adf93e,,"accessible, inclusive and biodiverse green space for the people of Lambeth and for community use. It is about us all advancing on the positive changes that came about by the pandemic, widening the use and enjoyment of natural resources by all of Lambeth’s diverse communities, actively getting involved in tackling climate change and feeling connected to our natural environment."
+183,"{'points': ((901.4172, 268.5713999999999), (901.4172, 362.5714), (1127.9651999999999, 362.5714), (1127.9651999999999, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,04ac79b8e4290f4d92f26e385c80a4aa,,"prioritised as outlined in the Climate Action Plan. This means increasing the re-use of materials, expanding food waste composting, increasing the number of materials that can be recycled and supporting all residents to better understand and access recycling options. And, as well as recycling more of our waste, we must commit to reducing our usage of plastic and single use packaging."
+184,"{'points': ((901.4172, 376.5714), (901.4172, 470.5714), (1132.3872, 470.5714), (1132.3872, 376.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,34f24083b1e08a2913d8eaf704be23f2,,"Alongside this, we will continue to focus our attention on keeping our streets and open spaces clean from litter, fly-tipping and toxic pollution. From public campaigns discouraging littering and illegal dumping of rubbish to making it easy to report, and increased cleaning measures, we are committed to positively improving our environment – making our streets attractive and welcoming."
+185,"{'points': ((306.1417, 619.6792), (306.1417, 773.6792), (540.5506999999999, 773.6792), (540.5506999999999, 619.6792)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,NarrativeText,eabdf7342bd220fd296a2172805c8b56,,"To maximise our emission lowering initiatives, we will celebrate and enhance our green spaces. Green infrastructure, just like traditional forms of infrastructure, provides essential support to every living being on the planet. Trees, shrubs, and plants absorb carbon dioxide and pollutant gases, purifying the air we breathe and provides habitats for birds, insects and other species and cools surrounding areas, offering relief from hotter temperatures. Green spaces also provide sanctuary and open space for every one of us who lives and visits the borough. Communities already make great use of Lambeth’s green spaces and share an urgency to ensure that there is more"
+186,"{'points': ((651.9684, 607.4578), (651.9684, 645.4578), (829.3044000000001, 645.4578), (829.3044000000001, 607.4578)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,d408769ea98c2376a072dbb0f2c64c0e,Title,b11d6ab9d2d4f7be9371b3af78ac3e9f,,Our streets and neighbourhoods
+187,"{'points': ((651.9684, 657.6738), (651.9684, 727.6738), (878.8963999999999, 727.6738), (878.8963999999999, 657.6738)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,b11d6ab9d2d4f7be9371b3af78ac3e9f,NarrativeText,b8a5467a300717bcb9c834779bd97023,Our streets and neighbourhoods,"Our local environment and streets have a major impact on the livelihood of our communities. Recycling and reducing waste, litter and air pollution are priorities for residents across the borough, affecting their day-to-day lives and we all have a role to play in resolving these."
+188,"{'points': ((651.9684, 741.6738), (651.9684, 775.6738), (879.2563999999999, 775.6738), (879.2563999999999, 741.6738)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,b11d6ab9d2d4f7be9371b3af78ac3e9f,NarrativeText,dec27c5537a26c2c305da07bb6b63c0d,Our streets and neighbourhoods,"Lambeth is now one of the top councils for recycling in the country, but we want to be a zero-waste borough with reducing, reusing, and recycling"
+189,"{'points': ((901.4172, 713.6738), (901.4172, 783.6738), (1131.8162, 783.6738), (1131.8162, 713.6738)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,13,b11d6ab9d2d4f7be9371b3af78ac3e9f,NarrativeText,0418de955107197d2910fcf125326b67,Our streets and neighbourhoods,A key part of this ambition will be to improve public and active travel provision and shared vehicle access to reduce car dependency and improve air quality whilst making sure local resident and business needs are being met. We face a major public health issue relating to poor air quality that is shortening lives and
+190,"{'points': ((56.6929, 268.5713999999999), (56.6929, 338.5714), (286.7498999999999, 338.5714), (286.7498999999999, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,14,f9ae303e787832c8ef9bb5e6bb8e9637,NarrativeText,d443f7254106671a0caba8e1ad499cae,,"impacting lifelong health – this needs to change now. Whilst we have an array of programmes underway to clean up Lambeth’s air, the work we do with our partners and local communities will need to go further and act more quickly in order to achieve our objectives."
+191,"{'points': ((306.1417, 268.5713999999999), (306.1417, 338.5714), (541.1656999999998, 338.5714), (541.1656999999998, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,14,f9ae303e787832c8ef9bb5e6bb8e9637,NarrativeText,a9ea967f1fb64dd9d982990b2170035d,,"the forefront of Lambeth’s partnerships and strategies; as highlighted in the Health and Wellbeing Strategy, so it is a place where all people can experience good health and wellbeing and where healthy life expectancy is improved for those groups within the population whose outcomes are the poorest."
+192,"{'points': ((56.6929, 354.3554), (56.6929, 372.3554), (226.6849, 372.3554), (226.6849, 354.3554)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,14,f9ae303e787832c8ef9bb5e6bb8e9637,Title,09972d70893f54e198befaa14da4d519,,Healthy active lives
+193,"{'points': ((56.6929, 384.5714), (56.6929, 430.5714), (282.48189999999994, 430.5714), (282.48189999999994, 384.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,14,09972d70893f54e198befaa14da4d519,NarrativeText,bfbfdb810753062d853e3f08e659e8e8,Healthy active lives,Good health and wellbeing are fundamental to us leading full and rewarding lives and it is our ambition that Lambeth is a place where people are able and supported to have this.
+194,"{'points': ((56.6929, 444.5714), (56.6929, 526.5714), (288.58089999999993, 526.5714), (288.58089999999993, 444.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,14,09972d70893f54e198befaa14da4d519,NarrativeText,dba5d50b9dfb63aeb5710ad8a2e62f24,Healthy active lives,"However, we know across Lambeth the benefits of good physical, mental and social wellbeing are not being felt by everyone. Healthy life expectancy unfairly differs in different areas of the borough, with the poorest communities and those from Black, Asian and Multi-Ethnic backgrounds having the worst outcomes across a wide range of health measures."
+195,"{'points': ((56.6929, 540.5714), (56.6929, 766.5714), (289.36789999999974, 766.5714), (289.36789999999974, 540.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,14,09972d70893f54e198befaa14da4d519,NarrativeText,db7ee2a0fb5a6563fd428ad3c5744cab,Healthy active lives,"Through our Lambeth Together Care Partnership, bringing together the NHS, local authority, the voluntary sector and others, we are focussed on improving health and wellbeing in Lambeth and reducing inequalities for people in Lambeth through an integrated health and care system. We must continue to work collectively and draw on our shared intelligence about the borough, listen to local people to understand the needs of their communities and build our understanding of what really works to tackle health inequalities. Lambeth is working together in partnership, with action already underway, connecting us together and enabling us to address these challenges and better understand the impact of wider determinants of health such as housing, the economy, employment and the environment on local inequalities, with a clear focus on prevention. It is through this continued cross-organisational working and civic involvement that we can put health and wellbeing at"
+196,"{'points': ((306.1417, 352.5714), (306.1417, 458.5714), (531.2117, 458.5714), (531.2117, 352.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,14,09972d70893f54e198befaa14da4d519,NarrativeText,569c4dc882cb0d6671a872d661134b3b,Healthy active lives,"Central to our collective approach is ensuring that every Lambeth resident has the best start in life. Through our local partnerships, we will develop safe and secure places for young people and children to socialise and develop their personal skills, through supporting positive emotional health and wellbeing including helping our most disadvantaged young people with access to sports facilities and training programmes."
+197,"{'points': ((306.1417, 472.5714), (306.1417, 734.5714), (541.3816999999999, 734.5714), (541.3816999999999, 472.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,14,09972d70893f54e198befaa14da4d519,NarrativeText,ad348dec3489f0666616e1e797cf1798,Healthy active lives,"Alongside our vision that all young people have the best start is that we enable our residents of all ages to live fulfilling and rewarding lives. Our residents must have access to high-quality health and leisure services and by working with our health partners, businesses and by recognising the value of community groups in supporting better health, we will focus on the prevention of long-term conditions and support those at risk of physical and/or mental health issues to access the right early help and resources. This requires a whole systems approach and a focus to support our most vulnerable by developing inclusive and innovative programmes and sports partnerships across health, leisure and a range of activities. As part of this, we will create environments that promote active travel, physical activity and healthy choices. From increasing spaces for people to connect around their health, as well as their areas and communities to improving the availability and consumption of healthy and affordable food in Lambeth, there are remarkable solutions happening across the borough to help people flourish."
+198,"{'points': ((317.4803, 119.49860000000001), (317.4803, 223.4986), (1067.6253000000002, 223.4986), (1067.6253000000002, 119.49860000000001)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,15,dc1d84e04f411032d3fd8356901aea9b,Title,fe71714028d75ff985940d92532e82f4,,2 - Making Lambeth One of The Safest Boroughs in London
+199,"{'points': ((651.9685, 267.9954), (651.9685, 307.9954), (1132.1545000000003, 307.9954), (1132.1545000000003, 267.9954)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,15,fe71714028d75ff985940d92532e82f4,NarrativeText,d4db6aff40e8d84a6233c563411dbf1d,2 - Making Lambeth One of The Safest Boroughs in London,"By 2030, Lambeth will be one of the safest boroughs in London, where everyone feels safe and secure – in all places."
+200,"{'points': ((651.9685, 330.96049999999997), (651.9685, 388.96049999999997), (886.8454999999999, 388.96049999999997), (886.8454999999999, 330.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,15,fe71714028d75ff985940d92532e82f4,NarrativeText,c3bd49fe1435b9883fbd202fa7942728,2 - Making Lambeth One of The Safest Boroughs in London,"Everyone has the right to be safe from harm, violence, and crime. Making our neighbourhoods safer for everyone is a primary concern for communities across the borough, and our ambition is to make Lambeth one of the safest boroughs in London by 2030."
+201,"{'points': ((651.9685, 402.96049999999997), (651.9685, 604.9604999999999), (884.4364999999998, 604.9604999999999), (884.4364999999998, 402.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,15,fe71714028d75ff985940d92532e82f4,NarrativeText,a0ad81d8494aaf2a48bcf8b522aa5140,2 - Making Lambeth One of The Safest Boroughs in London,"In Lambeth, we know the reality of being and feeling unsafe can affect us all but that it is felt differently across the borough. Many of our young people suffer from the devastating consequences of crime and violence that impacts them, their families, and their local communities. Across the country women and girls experience feeling unsafe and restricted in public and in private spaces and suffer unacceptable abuse, losing their lives to male violence. We know those in our LGBTQ+ communities have felt threatened and unsafe in their local neighbourhoods and that sometimes our residents don’t feel as safe as they would like on our streets and estates. We also know that domestic abuse and sexual violence can affect anyone, regardless of their age, background or gender identity, through different forms including emotional, psychological and controlling behaviour."
+202,"{'points': ((901.4174, 330.96049999999997), (901.4174, 424.96049999999997), (1131.4864, 424.96049999999997), (1131.4864, 330.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,15,fe71714028d75ff985940d92532e82f4,NarrativeText,2caf6d671ab390ab6f8fc10d48f9367a,2 - Making Lambeth One of The Safest Boroughs in London,"people and those with special educational needs and/or disabilities to get the support they need. As we enter a digital first society, we must focus on making sure we can all be safe from online harm and exploitation. Levels of road casualties in the borough are also deeply concerning, and we need to work together to tackle the threat posed by motor vehicles to people walking and cycling in Lambeth."
+203,"{'points': ((901.4174, 438.96049999999997), (901.4174, 592.9604999999999), (1135.1664, 592.9604999999999), (1135.1664, 438.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,15,fe71714028d75ff985940d92532e82f4,NarrativeText,44299ad21257cd7843b26c6e14b54ea4,2 - Making Lambeth One of The Safest Boroughs in London,"To do this, we will focus on tackling the root causes of crime and violence – structural inequality and discrimination – as well as the disproportionate outcomes. We will continue to remain responsive and adaptive to the needs of all our communities, and we will work harder to understand what matters to our residents across the borough so that our collective interventions can be more proactive and focussed. We will also create a safer public realm by ensuring that, through the Safer Business Partnership, licensed premises and other business in the borough are equipped to support our ambition to make Lambeth one of the safest boroughs in London."
+204,"{'points': ((651.9685, 618.9604999999999), (651.9685, 688.9604999999999), (887.1684999999999, 688.9604999999999), (887.1684999999999, 618.9604999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,15,fe71714028d75ff985940d92532e82f4,NarrativeText,3f3d5592b1f0cb45c0c599c977991445,2 - Making Lambeth One of The Safest Boroughs in London,"As well as people experiencing this now, Lambeth’s communities also carry the pain and trauma of historic instances of violence and crime. This must end. Our ambition is challenging, but we are determined to deliver the change that is necessary to achieve this goal."
+205,"{'points': ((651.9685, 702.9604999999999), (651.9685, 784.9605), (877.2165, 784.9605), (877.2165, 702.9604999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,15,fe71714028d75ff985940d92532e82f4,NarrativeText,affefa0f8de7f5af0a974b7124c303bc,2 - Making Lambeth One of The Safest Boroughs in London,"This does not only mean reducing crime, but also reducing the fear of crime by working collectively across Lambeth to keep all our residents safe and secure – in homes and schools, colleges, on streets and public spaces, as well as on public transport. Importantly, this includes safeguarding our residents with vulnerabilities including children and young"
+206,"{'points': ((901.4174, 606.9604999999999), (901.4174, 760.9605), (1136.6424, 760.9605), (1136.6424, 606.9604999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,15,fe71714028d75ff985940d92532e82f4,NarrativeText,07bea0e4d1e866ac2e6e022f58fd6498,2 - Making Lambeth One of The Safest Boroughs in London,"As a borough, we must strive to ensure that we identify the needs of the family early and provide interventions which reduce risk so children and young people can grow up safely in their families and communities and get the right help at the right time. Our children deserve to grow up in families where they are protected from the impact of domestic abuse, and we must ensure all children’s practitioners across the borough have the skills and training to identify abuse and intervene to reduce risks for children and we will do this in collaboration with our partners across the borough to ensure families get the right help at the right time so they can thrive and succeed."
+207,"{'points': ((306.1417, 100.90719999999999), (306.1417, 126.90719999999999), (515.5657, 126.90719999999999), (515.5657, 100.90719999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,4575520b4b863e8e8a1b35ad42287981,NarrativeText,9f542e06a1320810992304398c23af83,,“Greater safety for women and girls – information and services”
+208,"{'points': ((650.8511, 100.90719999999999), (650.8511, 140.9072), (791.7899, 140.9072), (791.7899, 100.90719999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,4575520b4b863e8e8a1b35ad42287981,NarrativeText,80668a6d9dac72b865bcca2be822c9ce,,“We need to reduce hate crimes so people can safely be themselves”
+209,"{'points': ((372.9921, 157.93960000000004), (372.9921, 197.93960000000004), (548.6221, 197.93960000000004), (548.6221, 157.93960000000004)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,4575520b4b863e8e8a1b35ad42287981,NarrativeText,d4aeef68dfb7a7659c3d02c79bf8bcc7,,“We need more spaces and neighbourhoods where people feel comfortable and safe”
+210,"{'points': ((943.7008, 143.80759999999998), (943.7008, 169.80759999999998), (1094.1928, 169.80759999999998), (1094.1928, 143.80759999999998)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,4575520b4b863e8e8a1b35ad42287981,Title,ad139aa31495cbd15cd80b3633b92c29,,“Improve night-time safety across the borough”
+211,"{'points': ((56.6929, 268.5713999999999), (56.6929, 362.5714), (285.6518999999999, 362.5714), (285.6518999999999, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,ad139aa31495cbd15cd80b3633b92c29,NarrativeText,adf14529edd0c5976d3359459e05282a,“Improve night-time safety across the borough”,"In Lambeth, we are taking a long-term, Public Health approach to making Lambeth one of the safest boroughs in London. This means we will intervene early and focus on prevention so that we reduce the vulnerability to either experiencing or committing acts of violence. We will also be trauma informed, recognising the generational impact this has had across families and communities in Lambeth."
+212,"{'points': ((56.6929, 376.5714), (56.6929, 554.5714), (290.99989999999997, 554.5714), (290.99989999999997, 376.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,ad139aa31495cbd15cd80b3633b92c29,NarrativeText,723764e1f99b1e71a166046c1eef5b24,“Improve night-time safety across the borough”,"This requires a collective effort. Our approach will be collaborative, driven by the Safer Lambeth Partnership – Lambeth’s primary vehicle for reducing and preventing crime. To deliver and embed a long- term partnership approach, the Safer Lambeth Partnership brings together the Council, Police, Fire and Rescue, Probation and Health to shape a multi- agency effort to tackle crime. Working in partnership we will ensure we safeguard and promote the welfare of children and adults at risk. We will also continue to work with schools, colleges, local employers, charities, faith-based organisations and, crucially, our residents and community groups, who are the fabric of our fantastic borough, to help make Lambeth a safer place for everyone."
+213,"{'points': ((56.6929, 570.3553999999999), (56.6929, 608.3554), (215.30710000000002, 608.3554), (215.30710000000002, 570.3553999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,ad139aa31495cbd15cd80b3633b92c29,NarrativeText,9db47fbc0de636e08e9995178edf920f,“Improve night-time safety across the borough”,Violence affecting young people
+214,"{'points': ((56.6929, 620.5714), (56.6929, 702.5714), (290.6608999999999, 702.5714), (290.6608999999999, 620.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,ad139aa31495cbd15cd80b3633b92c29,NarrativeText,5eabe8eae6a475a4d2c31a9341f62425,“Improve night-time safety across the borough”,"Making Lambeth one of the safest boroughs in London is about every individual and community that lives, works, and visits the borough. This means tackling the violence affecting young people with an anti-racist and equity-based ethos, so that children, teenagers, and young adults are safe at home, school and in public spaces."
+215,"{'points': ((56.6929, 716.5714), (56.6929, 774.5714), (281.39189999999985, 774.5714), (281.39189999999985, 716.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,ad139aa31495cbd15cd80b3633b92c29,NarrativeText,836a9f2facb8392866f9d26b169c11a2,“Improve night-time safety across the borough”,"We cannot allow violence, the fear of harm or the longstanding and deep-rooted social and economic challenges to continue to hinder the conditions young people need to thrive. As a borough we will work collaboratively to stop the exploitation of our"
+216,"{'points': ((306.1417, 268.5713999999999), (306.1417, 302.5713999999999), (526.5496999999999, 302.5713999999999), (526.5496999999999, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,ad139aa31495cbd15cd80b3633b92c29,NarrativeText,8f62c7d73dc56a40c7294e2b6fa5cf2c,“Improve night-time safety across the borough”,"children and young people, and create inclusive, nurturing learning environments both in school and community settings."
+217,"{'points': ((306.1417, 316.5713999999999), (306.1417, 434.5714), (541.1596999999999, 434.5714), (541.1596999999999, 316.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,ad139aa31495cbd15cd80b3633b92c29,NarrativeText,09f69ce34059c4f42c0441291aa027da,“Improve night-time safety across the borough”,"To achieve long-term change and meet our bold ambition, we will develop a whole systems approach to preventing youth violence and improving the life chances of our young people. Building on the priorities and learnings in the Lambeth Made Safer for Young People Strategy, we will work with our children, their families, and the networks of influence in our communities to look holistically at violence in all its forms – to provide dynamic, cross-cutting solutions to permanently stopping violence in Lambeth."
+218,"{'points': ((306.1417, 450.3554), (306.1417, 488.3554), (520.3957, 488.3554), (520.3957, 450.3554)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,4575520b4b863e8e8a1b35ad42287981,Title,90dd2eddd5639c0c64333da37ce0018b,,Violence against women and girls
+219,"{'points': ((306.1417, 500.5714), (306.1417, 570.5714), (537.4987000000001, 570.5714), (537.4987000000001, 500.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,90dd2eddd5639c0c64333da37ce0018b,NarrativeText,36ce82faaeb6aba9b7b59c60d3c7d1ab,Violence against women and girls,"We want to create a Lambeth where all women and girls can be safe from harm and violence – both in feeling and experience. Too often, violence against women and girls remains hidden and under-reported, with forms of structural inequality impacting on both access to support and experiences within services."
+220,"{'points': ((306.1417, 584.5714), (306.1417, 774.5714), (540.9392, 774.5714), (540.9392, 584.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,90dd2eddd5639c0c64333da37ce0018b,NarrativeText,6c8240f54c42320a208ef83d1d88fc98,Violence against women and girls,"Since 2011, we have made noticeable progress; developing one of the first Violence Against Women and Girls strategies in the UK and creating the pioneering Gaia Centre, offering a single point of access for anyone experiencing any form of gender- based violence in Lambeth. However, we know just how much further there is to go to realise our vision of Lambeth as a borough where everyone is safe. At the heart of our approach for the future, is a commitment to work with experts by experience – those best placed to advise on the solutions, support and services that will allow us to realise our ambition. All women and girls in Lambeth have the right to participate in, contribute to and benefit from a thriving Lambeth – including across education, employment, and our local inclusive economy."
+221,"{'points': ((651.9684, 268.5713999999999), (651.9684, 326.5713999999999), (876.8083999999999, 326.5713999999999), (876.8083999999999, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,90dd2eddd5639c0c64333da37ce0018b,NarrativeText,f7e654bb6d8d31c52433eeb834155f1e,Violence against women and girls,"We will also be intersectional in our approach, recognising the nuanced needs of Black, Asian and Multi-Ethnic communities, LGBTQ+ communities, those with disabilities, as well as those experiencing multiple disadvantages."
+222,"{'points': ((651.9684, 340.5714), (651.9684, 494.5714), (884.4363999999998, 494.5714), (884.4363999999998, 340.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,90dd2eddd5639c0c64333da37ce0018b,NarrativeText,640004cfd7b6f2b6a643da6706206647,Violence against women and girls,"Recognising the importance of engaging with men and boys to end violence against women and girls, Lambeth Council has become a White Ribbon accredited employer. This will support us to lead the way in developing and delivering the cultural transformation required to end men’s violence against women and girls. This will be a whole organisational approach, with political leadership, focusing on shifting the societal attitudes and beliefs that prevent gender equality and creating safe environments free from harassment, abuse and violence. To make a real and lasting difference in our borough, we will support other employers across Lambeth to do the same."
+223,"{'points': ((651.9684, 508.5714), (651.9684, 602.5714), (875.8983999999999, 602.5714), (875.8983999999999, 508.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,90dd2eddd5639c0c64333da37ce0018b,NarrativeText,f183dba335463935baa0ccf7d1642ac8,Violence against women and girls,"As a borough, we will continue to ‘Look Out for Lambeth’ and take practical steps to create safer streets and public spaces for women and girls. This includes working with our neighbouring boroughs to improve safety and partnership working along our borders, targeting hotspots of harassment, and creating Safe Havens where women can access safety and support."
+224,"{'points': ((651.9684, 618.3554), (651.9684, 656.3554), (882.8364, 656.3554), (882.8364, 618.3554)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,4575520b4b863e8e8a1b35ad42287981,Title,e23a34e9616611f7042378b02eb2581d,,"Hate crime, discrimination and anti-social behaviour"
+225,"{'points': ((651.9684, 668.5714), (651.9684, 750.5714), (884.8054, 750.5714), (884.8054, 668.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,e23a34e9616611f7042378b02eb2581d,NarrativeText,69ef7ba135bd5defa562a206a58b2c4b,"Hate crime, discrimination and anti-social behaviour","Lambeth is rightly proud of its historic and present- day diversity, which brings with it a vibrancy and cultural identity like no other part of London. We believe everyone, regardless of their background, nationality, religion, sex, gender and/or sexual identity, or disability has the right to live safe and fulfilling lives in their home and in their neighbourhood."
+226,"{'points': ((651.9684, 764.5714), (651.9684, 786.5714), (862.4573999999999, 786.5714), (862.4573999999999, 764.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,e23a34e9616611f7042378b02eb2581d,NarrativeText,3c9b60feac31415c699b60d5b9ac48ca,"Hate crime, discrimination and anti-social behaviour","Together, we will create a borough that everyone is able to safely live and move around in without"
+227,"{'points': ((901.4172, 268.5713999999999), (901.4172, 410.5714), (1135.3342, 410.5714), (1135.3342, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,e23a34e9616611f7042378b02eb2581d,NarrativeText,33033336363aa03c5212ec6bae2c7834,"Hate crime, discrimination and anti-social behaviour","the fear or experience of hate crime and anti-social harassment. This means standing with women across Lambeth to take a hate crime approach to tackling misogyny, ensuring our children and young people are educated appropriately on consent. It means standing with our significant LGBTQ+ community, fighting homophobic and transphobic hate crime and harassment. We will stand with our disabled residents and faith groups to stamp out ableism and anti- religious sentiment. And we will stand with our Black, Asian and Multi-Ethnic communities to eradicate racism in all its forms."
+228,"{'points': ((901.4172, 424.5714), (901.4172, 530.5714), (1128.1272, 530.5714), (1128.1272, 424.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,16,e23a34e9616611f7042378b02eb2581d,NarrativeText,f584dd0b7ee0cdd82fc8099397e53981,"Hate crime, discrimination and anti-social behaviour","There is no place for hate in Lambeth. We are committed to building resilience and inclusion within and across our many diverse communities, ensuring that everyone is able to safely contribute to and benefit from the great opportunity in our borough. We will be relentless in our effort to tackle anti-social behaviour in our neighbourhoods and will inspire efforts to ensure that our venues and public realm is accessible and secure for every resident."
+229,"{'points': ((306.1417, 119.49860000000001), (306.1417, 223.4986), (995.1833, 223.4986), (995.1833, 119.49860000000001)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,17,d4f8a1dcd815af38d07edafb72ae0ed0,Title,c76fdf5187ca9973882150c48982ad9d,,3 - Making Lambeth A Place We Can All Call Home
+230,"{'points': ((651.9685, 267.9954), (651.9685, 351.99539999999996), (1106.7708999999998, 351.99539999999996), (1106.7708999999998, 267.9954)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,17,c76fdf5187ca9973882150c48982ad9d,NarrativeText,72a0f3c20abdcd11349b0b146bafc0b6,3 - Making Lambeth A Place We Can All Call Home,"By 2030, Lambeth will be a lifelong borough, with the best conditions to grow up and age well, where everyone can contribute to an inclusive economy, and have a place to call home."
+231,"{'points': ((651.9685, 374.96049999999997), (651.9685, 504.96049999999997), (878.1574999999999, 504.96049999999997), (878.1574999999999, 374.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,17,c76fdf5187ca9973882150c48982ad9d,NarrativeText,c292c4deaedff68eb2c638eff7226821,3 - Making Lambeth A Place We Can All Call Home,"Lambeth – forever radical, always welcoming and constantly creative. This is the reputation we have built together throughout our history and is the foundation of our ambition to make Lambeth the best place for children and young people to develop and for older people to enjoy their later years. It is also the spirit that will enable us to ensure that for life in between, everyone is able to access the many opportunities that exist in Lambeth, as well as fundamental basic rights – good quality education, employment, housing, and access to healthcare."
+232,"{'points': ((651.9685, 518.9604999999999), (651.9685, 636.9604999999999), (885.5354999999998, 636.9604999999999), (885.5354999999998, 518.9604999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,17,c76fdf5187ca9973882150c48982ad9d,NarrativeText,b5839e6e1483b704af04a80bd5d4c472,3 - Making Lambeth A Place We Can All Call Home,"Across the borough, we have distinct places with their own unique identities, communities, and assets. We are home to world class institutions and cultural clusters, with historic venues and green spaces adding to the Lambeth offer. We have thriving businesses and great potential for future growth, and a vibrant community sector with local expertise. Yet we know there is more to do to ensure that everyone can benefit from the strengths Lambeth has to offer in order to build and sustain the best life possible."
+233,"{'points': ((651.9685, 652.7445), (651.9685, 690.7445), (875.6185, 690.7445), (875.6185, 652.7445)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,17,c76fdf5187ca9973882150c48982ad9d,NarrativeText,0ce2fb1ae0a95c602a0d81654bc166a8,3 - Making Lambeth A Place We Can All Call Home,Making Lambeth the best place to grow up
+234,"{'points': ((651.9685, 702.9604999999999), (651.9685, 760.9605), (882.5764999999999, 760.9605), (882.5764999999999, 702.9604999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,17,c76fdf5187ca9973882150c48982ad9d,NarrativeText,69de8c76f2cfef0fad366666ff80364d,3 - Making Lambeth A Place We Can All Call Home,"We are committed to making Lambeth the best place to grow up and Lambeth being the place where families want to send their children to school. This recognises that the best start in life is crucial to support lifelong prosperity, allowing each individual to"
+235,"{'points': ((901.4174, 374.96049999999997), (901.4174, 456.96049999999997), (1131.8774, 456.96049999999997), (1131.8774, 374.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,17,c76fdf5187ca9973882150c48982ad9d,NarrativeText,0007798cbcd62a7de51b0fd923fe837d,3 - Making Lambeth A Place We Can All Call Home,"thrive. To deliver this, we are committed to becoming an accredited UNICEF Child Friendly borough – a place where children’s rights and voices are at the heart of everything we do and have worked with over 1,500 children and young people and community groups across the borough to listen to their priorities and concerns."
+236,"{'points': ((901.4174, 470.96049999999997), (901.4174, 756.9605), (1134.7744, 756.9605), (1134.7744, 470.96049999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,17,c76fdf5187ca9973882150c48982ad9d,NarrativeText,ae7126bbd3bccf6a0854a2bc1a792b5f,3 - Making Lambeth A Place We Can All Call Home,"We will take a rights-based approach underpinned by equity and inclusion, making sure that all children and young people, regardless of their background, culture, ability or anything else, feel welcome in Lambeth, have the right opportunities to grow, learn, explore and have fun, and are protected from discrimination and harm. This includes addressing the needs of all children and young people with special educational needs and/or disabilities. By continuing to invest in local specialist provision, we aim to offer a mixed economy of inclusive mainstream schools, specialist resource bases, special schools and specialist colleges within Lambeth. This will support us to ensure that, as far as possible, all our children and young people can be educated within their local community. We will also continue work with all partners working together in the Lambeth local area to make sure our schools and neighbourhoods are fully inclusive and supportive for children with SEND and their families. Alongside this is our continued drive to further improve educational settings, options and standards so that all children and young people benefit from high quality and inclusive access to education."
+237,"{'points': ((386.4567, 88.86570000000006), (386.4567, 142.86570000000006), (558.3087, 142.86570000000006), (558.3087, 88.86570000000006)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,6d8be24d0a84eb8daf3b753f4e8ec8d2,,"“All neighbourhoods, communities and ages are equally connected, invested in and considered”"
+238,"{'points': ((835.5118, 83.4547), (835.5118, 137.4547), (1000.8838000000001, 137.4547), (1000.8838000000001, 83.4547)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,757153d621a38009cbc3347376f23f5e,,“Upskill young people by involving them in projects for their community and connect with businesses”
+239,"{'points': ((369.958, 176.3854), (369.958, 202.3854), (573.1780000000001, 202.3854), (573.1780000000001, 176.3854)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,905a7f7f789fec144564a284f2082ab6,,“Inclusive and diverse spaces for us to connect and build communities”
+240,"{'points': ((665.0129, 160.79489999999998), (665.0129, 200.79489999999998), (866.6836999999999, 200.79489999999998), (866.6836999999999, 160.79489999999998)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,7645b10daf6c717285540ecbaa3fb4ce,,“More council housing and affordable housing more broadly for communities to stay and grow”
+241,"{'points': ((968.0315, 155.12560000000008), (968.0315, 195.12560000000008), (1126.7783000000002, 195.12560000000008), (1126.7783000000002, 155.12560000000008)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,04d8551943b727cc54aff2c927ba4bdb,,“Our streets and estates are socially mixed and there are lots of community activities”
+242,"{'points': ((56.6929, 268.5713999999999), (56.6929, 494.5714), (287.8798999999999, 494.5714), (287.8798999999999, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,6f9e460762e048da234aad380c8035dd,,"Our Child Friendly focus will cover three areas: safety, place and services. We will use our leadership to ensure that children and young people feel safe in their home and local places across the borough and feel able to trust adults – particularly in positions of authority. We will ensure that through regeneration, children and young people can move more freely in their local areas, and that streets and public spaces are child-friendly and welcoming. And we will continue to transform services, delivered by us and our partners, to ensure they support the growth of all our children and young people, with children and young people involved in shaping decisions about how to make services better at every stage of the process. As a Council, we will be amending our own decision-making process, to ensure that an impact assessment on the rights of children and young people is considered in our policy development and service improvement."
+243,"{'points': ((56.6929, 508.5714), (56.6929, 662.5714), (288.9999, 662.5714), (288.9999, 508.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,bae06b022d52e02ad5f0c639a32a0b53,,"We know that deprivation remains one of the biggest challenges in Lambeth – and will do everything that we can to end child poverty in our borough to ensure that our children and young people get the opportunity they deserve. As a borough we will focus on early intervention, ensuring that services and community groups are equipped to support our ambitions, giving children and young people greater opportunity to shape their own lives. And we will work with our partners and institutions, to make sure that every young person in Lambeth is able to participate in our local offer, and has access to strong employment, training and skills opportunities."
+244,"{'points': ((56.6929, 676.5714), (56.6929, 722.5714), (291.21189999999996, 722.5714), (291.21189999999996, 676.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,683061067e1537460e33cbfe026cb90c,,This will require co-operation and leadership – with everyone working together to make Lambeth a better place for children and young people and ensuring that decisions are made with their involvement.
+245,"{'points': ((306.1417, 268.0674), (306.1417, 306.0674), (529.7917000000001, 306.0674), (529.7917000000001, 268.0674)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,a8be577441f5b8d07ffeba0dc9425116,,Making Lambeth the best place to age well
+246,"{'points': ((306.1417, 318.28340000000003), (306.1417, 340.2834), (536.4006999999999, 340.2834), (536.4006999999999, 318.28340000000003)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,8fd01253500c6e82e832d2a0b0914ed5,,"As a lifelong borough, we want to make Lambeth the best place to age well by 2030."
+247,"{'points': ((306.1417, 354.2834), (306.1417, 484.2834), (533.7616999999999, 484.2834), (533.7616999999999, 354.2834)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,86125655810bd5d62b8956420d067a4d,,"We will develop a local approach to becoming an Age-Friendly borough, building on World Health Organisation’s (WHO) framework – creating social and built environments that promote healthy and active later lives for all residents. We will focus on the key areas of community life to build our age- friendly framework: streets, outdoor spaces and buildings, housing, social participation and inclusion, civic participation and employment, community support and health services, and communication and information."
+248,"{'points': ((306.1417, 498.2834), (306.1417, 580.2834), (541.3627, 580.2834), (541.3627, 498.2834)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,NarrativeText,5e54f9c7442bf87310cf9c7bd75cb781,,"Equity and inclusion will be central to our effort to creating a borough that is truly age-friendly. To do this, we are committed to listening to and working with our older residents to develop shared priorities for the future. We will mobilise action, in partnership, that is targeted to equipping older residents with the support they need to continue to call Lambeth home."
+249,"{'points': ((651.9685, 268.0674), (651.9685, 326.0674), (822.2305, 326.0674), (822.2305, 268.0674)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,33f6ebde07247af7901902365d76800d,Title,161c6954f6a76581fe6dd6abb2e43815,,Inclusive economic development and opportunity
+250,"{'points': ((651.9685, 338.2834), (651.9685, 444.2834), (886.9755, 444.2834), (886.9755, 338.2834)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,161c6954f6a76581fe6dd6abb2e43815,NarrativeText,e7aa35dea0962d3e5718703198c96512,Inclusive economic development and opportunity,"Our vision is of a dynamic, strong, equitable local economy, providing opportunities for local people to thrive – irrespective of their starting point. The foundation for this will be Lambeth’s existing strengths in health sciences, low carbon, and creative and digital industries – sectors where we will see our future growth. Economic growth will provide the borough with the resources and tools to deliver the services our residents need."
+251,"{'points': ((651.9685, 458.2834), (651.9685, 552.2834), (887.1894999999998, 552.2834), (887.1894999999998, 458.2834)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,161c6954f6a76581fe6dd6abb2e43815,NarrativeText,60d45f6196ada2c710c1a1df43913d87,Inclusive economic development and opportunity,"Our local economy plays an important role in addressing structural inequities across Lambeth. More than ever, we need to be resilient, creative, dynamic, and adaptable to overcome an uncertain economic context. Our ethos is to create an empowered local ecosystem, where all our residents are able to contribute to and benefit from the great opportunities Lambeth has to offer."
+252,"{'points': ((651.9685, 566.2834), (651.9685, 744.2834), (887.1884999999997, 744.2834), (887.1884999999997, 566.2834)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,161c6954f6a76581fe6dd6abb2e43815,NarrativeText,849c00abfa7d4c2472fa387c96746f2b,Inclusive economic development and opportunity,"By 2030, Lambeth will be at the forefront of an invigorated economy, which provides more opportunities for more people – making a real difference to the realities of our residents. Lambeth will be a place where industry, educational and cultural institutions, the voluntary and public sectors come together to deliver a world class skills system and agile employment support that provide opportunities for good quality work – responding to systemic inequities, the aspirations of our residents and the needs of the economy. We will focus our efforts to create an equitable, anti-discriminatory, anti-racist and inclusive Lambeth, with good quality training opportunities, improved digital inclusion and literacy, and greater financial resilience."
+253,"{'points': ((901.4174, 268.5713999999999), (901.4174, 398.5714), (1129.6254, 398.5714), (1129.6254, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,161c6954f6a76581fe6dd6abb2e43815,NarrativeText,3b58cad13d536ab3b6c972b914c44323,Inclusive economic development and opportunity,"As an attractive destination, Lambeth will capitalise on the opportunities for growth, ensuring that new residential and commercial developments across the borough provide investment to support the creation of an inclusive public realm, parks and new facilities – in support of our ambitions around climate, safety, and health. Our inclusive economic development approach will focus on facilitating sustainable and inclusive development activity which benefits all our communities – providing homes, jobs and vibrant neighbourhoods."
+254,"{'points': ((901.4174, 412.5714), (901.4174, 626.5714), (1136.6174, 626.5714), (1136.6174, 412.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,18,161c6954f6a76581fe6dd6abb2e43815,NarrativeText,12c7a8f202b6b1b57da2b7e627caf335,Inclusive economic development and opportunity,"Using our collective civic leadership, we will leverage opportunities for those furthest away from economic inclusion, and those disproportionately so, to unlock their potential – through upskilling, reskilling and sustainable employment pathways. Our businesses and anchor institutions will support this, by working collaboratively to tackle low-pay and in-work poverty and encouraging the growth of local businesses across Lambeth. Our aspiration is to become a Living Wage borough, using a place-based approach to support families, communities and our local economy by uplifting low-paid workers to the real Living Wage. Good and fair working conditions are also crucial to providing opportunity and we aspire to see more businesses across Lambeth engaging with the Good Work Standard. Partnerships will be crucial – and we will continue to maximise collaboration with BIDs, businesses and other partners to create the conditions for our residents to thrive."
+255,"{'points': ((56.6929, 268.0674), (56.6929, 306.0674), (224.00289999999998, 306.0674), (224.00289999999998, 268.0674)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,19,17fd4a6b874cc626060b6eb33c25cc92,Title,aa78f9e9b5ad70f47640ce6893972386,,Quality housing for local people
+256,"{'points': ((56.6929, 318.28340000000003), (56.6929, 436.2834), (288.79289999999986, 436.2834), (288.79289999999986, 318.28340000000003)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,19,aa78f9e9b5ad70f47640ce6893972386,NarrativeText,15cbbf012ffd83ca16e14e2b864ffcd7,Quality housing for local people,"Good quality, affordable housing is the basis of stability and security for individuals and families. It can provide the foundation of good health, wellbeing and independence, and support people to participate in the local economy and benefit from growth. Yet Lambeth, like the rest of the UK, is in the grip of a serious housing crisis – in terms of availability, affordability, and safety. This is compounded by national policy which stifles both delivery and the financial context within which housing operates."
+257,"{'points': ((56.6929, 450.2834), (56.6929, 580.2834), (291.1699, 580.2834), (291.1699, 450.2834)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,19,aa78f9e9b5ad70f47640ce6893972386,NarrativeText,763b427a029d23fb8660486ce2b843ad,Quality housing for local people,"Whilst we have seen the delivery of the first new council homes in a generation, we recognise that the pace of growth has not matched demand and that is why the Council is committed to accelerating the delivery of affordable housing with our partners. We know at the same time as delivering more homes for social rent, that there is more to do around standards and conditions of existing homes, and our relationship with residents. To ensure that Lambeth is a place we can all call home, we are committed to refreshing and resetting our approach – with residents at the centre."
+258,"{'points': ((306.1417, 268.5713999999999), (306.1417, 362.5714), (528.0587, 362.5714), (528.0587, 268.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,19,aa78f9e9b5ad70f47640ce6893972386,NarrativeText,c9a52150e329c078e61ebdf0265e080c,Quality housing for local people,"Our vision is to ensure everyone has access to a safe and secure home, which is affordable and sustainable. We have committed to increasing the delivery of affordable housing, ensuring that growth delivers investment in our communities. We will go further in our commitments on sustainability, to achieve our net-zero ambition and protect our collective future."
+259,"{'points': ((306.1417, 376.5714), (306.1417, 566.5714), (540.6316999999999, 566.5714), (540.6316999999999, 376.5714)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,19,aa78f9e9b5ad70f47640ce6893972386,NarrativeText,d58516cf31c96024e084623678ed0d35,Quality housing for local people,"Driving this forward will be a new Lambeth Housing Strategy, setting out how we can accelerate the delivery of affordable housing in Lambeth, providing suitable housing options for all stages of life so that it is easier for people who grow up in Lambeth to continue living here. As a major landlord in the borough, the Council will set out an ambitious improvement plan for its stock including transforming its housing management and repairs service, so that Lambeth residents have the quality they deserve – and a voice to ensure services work for them. And we will ensure that housing is at the heart of our approach to supporting healthy and safe communities – working in partnership to tackle homelessness, deliver better standards for private renters, and supporting residents into work."
+260,"{'points': ((651.9684, 268.0674), (651.9684, 286.0674), (853.0104, 286.0674), (853.0104, 268.0674)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,19,17fd4a6b874cc626060b6eb33c25cc92,Title,e4d75408aeb090c06839e088ab1f8bde,,A borough of sanctuary
+261,"{'points': ((651.9684, 298.28340000000003), (651.9684, 380.2834), (882.4053999999999, 380.2834), (882.4053999999999, 298.28340000000003)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,19,e4d75408aeb090c06839e088ab1f8bde,NarrativeText,d3d8710b326c79890620d33cce1a5bda,A borough of sanctuary,"Lambeth has a proud history as a place of sanctuary, hope and opportunity, welcoming refugees from across the world. This will not end, and we have renewed our commitment to never turn our back on those seeking our help by gaining official Borough of Sanctuary status – becoming only the second London Borough to achieve this."
+262,"{'points': ((651.9685, 394.2834), (651.9685, 464.2834), (874.0654999999999, 464.2834), (874.0654999999999, 394.2834)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,19,e4d75408aeb090c06839e088ab1f8bde,NarrativeText,42c4b635d7dd07c43be591e7c00e0cc6,A borough of sanctuary,"We want Lambeth to feel like home for everyone – and we will continue to be a borough that values refugees, migrants and all those seeking sanctuary, supporting them through loss and trauma and working with them to rebuild their lives – safe from violence and persecution."
+263,"{'points': ((901.4172, 280.5713999999999), (901.4172, 398.5714), (1132.2051999999999, 398.5714), (1132.2051999999999, 280.5713999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,19,e4d75408aeb090c06839e088ab1f8bde,NarrativeText,718d3914d692dec45e4ac17032503a08,A borough of sanctuary,"Lambeth’s vision is clear – we want to improve equity of access to support for all sanctuary-seekers and raise the voices of people with lived-experience. We will be led by five core values: Inclusivity, Openness, Participation, Inspiration and Integrity. To drive this forward, we have created the Lambeth Sanctuary Forum, a multi-agency group working with the voluntary and community sector, structured to deliver the priorities of our sanctuary-seekers, with humanity and compassion."
+264,"{'points': ((56.6929, 98.23889999999994), (56.6929, 208.23889999999994), (488.17289999999997, 208.23889999999994), (488.17289999999997, 98.23889999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,f9153eab735f198457258ff8e17564c6,Title,5236ca647dfecf3b85f8e2ffdd52a955,,Our Lambeth 2030 Outcomes
+265,"{'points': ((56.6929, 250.84349999999995), (56.6929, 336.84349999999995), (541.2129000000006, 336.84349999999995), (541.2129000000006, 250.84349999999995)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,5236ca647dfecf3b85f8e2ffdd52a955,NarrativeText,cdbe3611719d58532d9fa4818c23fd96,Our Lambeth 2030 Outcomes,"Our ambitions are bold – it is going to take everyone in the borough to play their part in delivering for Lambeth, ensuring that we are all accountable and committed to a better future for everyone."
+266,"{'points': ((56.6929, 351.8307), (56.6929, 381.8307), (286.4749, 381.8307), (286.4749, 351.8307)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,5236ca647dfecf3b85f8e2ffdd52a955,NarrativeText,12fcbcb3bde25fd13446ca769c5f8945,Our Lambeth 2030 Outcomes,Our Lambeth Outcomes have been shaped to unite us in that effort.
+267,"{'points': ((306.1417, 350.94669999999996), (306.1417, 372.94669999999996), (528.8117, 372.94669999999996), (528.8117, 350.94669999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,f9153eab735f198457258ff8e17564c6,Title,fde9d00b06272364d519bf9a8fd5eb56,,Our 2030 Ambition: Making Lambeth One of the Safest Boroughs in London
+268,"{'points': ((56.6929, 397.1199), (56.6929, 407.1199), (205.64290000000003, 407.1199), (205.64290000000003, 397.1199)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,f9153eab735f198457258ff8e17564c6,Title,8ed9bbf926b1867eb0ba6c082cdb1cfb,,A Borough of Equity and Justice
+269,"{'points': ((306.1417, 386.87669999999997), (306.1417, 420.87669999999997), (522.1593, 420.87669999999997), (522.1593, 386.87669999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,8ed9bbf926b1867eb0ba6c082cdb1cfb,ListItem,eea33db02f3ad2459bd07e6227af95ab,A Borough of Equity and Justice,"By 2030, Lambeth will be a safer borough for everyone, with a significant reduction in serious violence against young people."
+270,"{'points': ((56.6929, 421.0499), (56.6929, 455.0499), (257.5392, 455.0499), (257.5392, 421.0499)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,8ed9bbf926b1867eb0ba6c082cdb1cfb,ListItem,081e216faaed45087ea8233f1014da49,A Borough of Equity and Justice,"By 2030, Lambeth will have lower levels of deprivation, with fewer children growing up in poverty."
+271,"{'points': ((306.1417, 428.546), (306.1417, 462.546), (532.7183, 462.546), (532.7183, 428.546)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,8ed9bbf926b1867eb0ba6c082cdb1cfb,ListItem,966385b43b08f280da88d7c7947a1dd3,A Borough of Equity and Justice,"By 2030, Lambeth will be safer for women and girls, and all residents experiencing gender-based violence will be able to access support."
+272,"{'points': ((56.6929, 462.7192), (56.6929, 508.7192), (280.8514999999999, 508.7192), (280.8514999999999, 462.7192)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,8ed9bbf926b1867eb0ba6c082cdb1cfb,ListItem,37f9bee541f9d81b0a75a4fff8354e4b,A Borough of Equity and Justice,"By 2030, Lambeth will tackle the structural inequalities adversely impacting Black, Asian and Multi-Ethnic residents by being a borough of anti- racism."
+273,"{'points': ((306.1417, 470.21529999999996), (306.1417, 504.21529999999996), (537.9383, 504.21529999999996), (537.9383, 470.21529999999996)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,8ed9bbf926b1867eb0ba6c082cdb1cfb,ListItem,4e5d35fce07332d24ca4ff507e70523e,A Borough of Equity and Justice,"By 2030, Lambeth will be a borough of prevention, tackling the root causes of violence to protect our communities."
+274,"{'points': ((56.6929, 516.3885), (56.6929, 562.3885), (281.8415, 562.3885), (281.8415, 516.3885)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,8ed9bbf926b1867eb0ba6c082cdb1cfb,ListItem,1c9761d1aadb898cf2bd8bcc591a7e26,A Borough of Equity and Justice,"By 2030, Lambeth will be a borough of progress, working with LGBTQ+ communities and disabled residents to tackle the biggest challenges they face."
+275,"{'points': ((306.1417, 528.9623999999999), (306.1417, 550.9623999999999), (533.4497, 550.9623999999999), (533.4497, 528.9623999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,f9153eab735f198457258ff8e17564c6,Title,a528194e026b4ef7161be1edcbe4e656,,Our 2030 Ambition: Making Lambeth A Place We Can All Call Home
+276,"{'points': ((56.6929, 587.1357), (56.6929, 609.1357), (228.41290000000004, 609.1357), (228.41290000000004, 587.1357)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,f9153eab735f198457258ff8e17564c6,Title,e1f6923e14d4aaf7ad7b3659590c423c,,Our 2030 Ambition: Making Lambeth Neighbourhoods Fit for the Future
+277,"{'points': ((56.6929, 623.0657), (56.6929, 633.0657), (271.0905, 633.0657), (271.0905, 623.0657)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,e1f6923e14d4aaf7ad7b3659590c423c,ListItem,aae50badcc01336af92afa28faedf281,Our 2030 Ambition: Making Lambeth Neighbourhoods Fit for the Future,"By 2030, Lambeth will be a Net Zero Borough."
+278,"{'points': ((306.1417, 564.8924), (306.1417, 598.8924), (541.0783000000001, 598.8924), (541.0783000000001, 564.8924)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,e1f6923e14d4aaf7ad7b3659590c423c,ListItem,5488b4b956f3eb8c6033a9ede9806e64,Our 2030 Ambition: Making Lambeth Neighbourhoods Fit for the Future,"By 2030, Lambeth will be a borough of opportunity, with local people benefitting from jobs in our future growth industries."
+279,"{'points': ((306.1417, 606.5617), (306.1417, 640.5617), (519.1993, 640.5617), (519.1993, 606.5617)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,e1f6923e14d4aaf7ad7b3659590c423c,ListItem,4a853981b318007aab2305c8399ec362,Our 2030 Ambition: Making Lambeth Neighbourhoods Fit for the Future,"By 2030, Lambeth will increase the supply of genuinely affordable housing and the quality of existing homes for residents who need them."
+280,"{'points': ((56.6929, 640.735), (56.6929, 674.735), (285.48949999999985, 674.735), (285.48949999999985, 640.735)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,e1f6923e14d4aaf7ad7b3659590c423c,ListItem,40fac8162f3e29d568b4f9314551d254,Our 2030 Ambition: Making Lambeth Neighbourhoods Fit for the Future,"By 2030, Lambeth residents will experience good health and wellbeing, with an improved healthy life expectancy for those with the poorest outcomes."
+281,"{'points': ((306.1417, 648.231), (306.1417, 682.231), (532.1893, 682.231), (532.1893, 648.231)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,e1f6923e14d4aaf7ad7b3659590c423c,ListItem,a0dfbf519233a400168882a9f5eff6cc,Our 2030 Ambition: Making Lambeth Neighbourhoods Fit for the Future,"By 2030, Lambeth will be a borough of sanctuary and an Age and Child Friendly borough, the best place to grow up and age well."
+282,"{'points': ((56.6929, 682.4042), (56.6929, 716.4042), (291.7914999999999, 716.4042), (291.7914999999999, 682.4042)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,20,e1f6923e14d4aaf7ad7b3659590c423c,ListItem,43fd7e2f26254b4334ca0544b9c77cac,Our 2030 Ambition: Making Lambeth Neighbourhoods Fit for the Future,"By 2030, Lambeth will be a sustainable and healthy borough, with more accessible and active travel options for everyone."
+283,"{'points': ((56.6929, 98.23889999999994), (56.6929, 208.23889999999994), (496.4609, 208.23889999999994), (496.4609, 98.23889999999994)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,82a8213cf479393d6a921c09240c4dc5,Title,0044980c50da416552866a1e5f3e77b7,,Our Partnership Commitment
+284,"{'points': ((56.6929, 267.8512999999999), (56.6929, 309.8512999999999), (542.3929000000004, 309.8512999999999), (542.3929000000004, 267.8512999999999)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,0044980c50da416552866a1e5f3e77b7,NarrativeText,e76ae8ce190e72cfb74f5836bb17a67b,Our Partnership Commitment,"Lambeth 2030 sets out a borough commitment to work in partnership, harnessing what makes Lambeth special."
+285,"{'points': ((56.6929, 327.59659999999997), (56.6929, 389.59659999999997), (523.8054999999998, 389.59659999999997), (523.8054999999998, 327.59659999999997)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,0044980c50da416552866a1e5f3e77b7,NarrativeText,fbcf9c00079eef083fdc9fdd672b8661,Our Partnership Commitment,Achieving our shared vision and ambitions for the future can only be done together. This is a call to action for a collective approach to creating a borough fit for the future.
+286,"{'points': ((56.6929, 412.5617), (56.6929, 470.5617), (291.9329, 470.5617), (291.9329, 412.5617)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,0044980c50da416552866a1e5f3e77b7,NarrativeText,931c944d6a62957aeb32d981ba6a6c2c,Our Partnership Commitment,"Lambeth will be a borough of partnership - where institutions, businesses, residents, community groups and organisations and strategic partnerships work together to solve the biggest challenges facing the borough."
+287,"{'points': ((56.6929, 484.5617), (56.6929, 602.5617), (290.57290000000006, 602.5617), (290.57290000000006, 484.5617)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,0044980c50da416552866a1e5f3e77b7,NarrativeText,eb98c23c6b5cae30d61afb0bdc8b92ae,Our Partnership Commitment,"Importantly, we want Lambeth partnerships to be inclusive and representative. This is reflected in the establishment of two new strategic partnerships: the Lambeth 2030 Partnership, which will oversee the delivery of our ambitious borough plan; and the Lambeth United Equity and Inclusion Partnership, leading our work to become a borough of equity and justice. Both partnerships will work with existing forums and collaborations, to make Lambeth the best borough it can be."
+288,"{'points': ((306.1417, 412.5617), (306.1417, 630.2868), (498.8483, 630.2868), (498.8483, 412.5617)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,0044980c50da416552866a1e5f3e77b7,ListItem,268a21e003ff686cfac30d4fa2687af8,Our Partnership Commitment,Lambeth Council • Health and Wellbeing Board • Safer Lambeth Partnership • Lambeth Together • South East London Integrated Care Board • Black Thrive Partnership • Lambeth BIDS • Lambeth Forum Network • Lambeth Community Hubs Network • Lambeth Sanctuary Forum • Kings College London • London South Bank University • Metropolitan Police • Climate Partnership Group • Air Quality Forum
+289,"{'points': ((56.6929, 616.5617), (56.6929, 650.5617), (270.0528999999999, 650.5617), (270.0528999999999, 616.5617)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,0044980c50da416552866a1e5f3e77b7,NarrativeText,2e97c3d8298654c5fd6d68323f2493fe,Our Partnership Commitment,"Some of the borough’s key organisations, partnerships and forums working together for a better Lambeth:"
+290,"{'points': ((306.1417, 647.0814), (306.1417, 729.0814), (537.3897000000001, 729.0814), (537.3897000000001, 647.0814)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,0044980c50da416552866a1e5f3e77b7,NarrativeText,3b49273ade9805c9624b4bb73ed040de,Our Partnership Commitment,"Lambeth is lucky enough to have hundreds more groups and organisations, from grassroots to those with a global profile, working to make a difference for Lambeth. We continue to be led by pioneering individuals, driven by their love for the borough. Lambeth 2030 is not a future for some of us – but a future for all of us."
+291,"{'points': ((306.1417, 743.0814), (306.1417, 753.0814), (402.4707, 753.0814), (402.4707, 743.0814)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,0044980c50da416552866a1e5f3e77b7,NarrativeText,30fd3bb1311703c82c2501f0836b8132,Our Partnership Commitment,Let’s do this together.
+292,"{'points': ((306.1417, 765.5574), (306.1417, 777.5574), (532.8085000000001, 777.5574), (532.8085000000001, 765.5574)), 'system': 'PixelSpace', 'layout_width': 1190.55, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,21,82a8213cf479393d6a921c09240c4dc5,Title,99410ea5a8516e9e5596ba60f4c341b0,,This is Our Future; This is Our Lambeth.
+293,"{'points': ((331.8963, 515.348), (331.8963, 546.348), (519.3390503999999, 546.348), (519.3390503999999, 515.348)), 'system': 'PixelSpace', 'layout_width': 595.276, 'layout_height': 841.89}",Lambeth_2030-Our_Future_Our_Lambeth.pdf,['eng'],2024-04-19T21:04:53,22,28215edaabe75bb8b857ba0d9702dbb7,Title,333438e175726ff4843b3340a55852f1,,DIVERSITY
diff --git a/example_data/Partnership-Agreement-Toolkit_0_0.pdf b/example_data/Partnership-Agreement-Toolkit_0_0.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e930d2bb30ea0e30d0c48393bc7c566b397edb2d
--- /dev/null
+++ b/example_data/Partnership-Agreement-Toolkit_0_0.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0db46a784d7aaafb8d02acf8686523dd376400117d07926a5dcb51ceb69e3236
+size 426602
diff --git a/example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv b/example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv
new file mode 100644
index 0000000000000000000000000000000000000000..827e2c45642ae3b172d488ef99326077726cc737
--- /dev/null
+++ b/example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv
@@ -0,0 +1,2 @@
+another country or territory sign a formel agreement on behalf? of their communities endorsing a
+soster citues international
diff --git a/example_data/combined_case_notes.csv b/example_data/combined_case_notes.csv
new file mode 100644
index 0000000000000000000000000000000000000000..39a787296303521a49277d866f603c5faf7d6885
--- /dev/null
+++ b/example_data/combined_case_notes.csv
@@ -0,0 +1,19 @@
+Date,Social Worker,Client,Case Note
+"January 3, 2023",Jane Smith,Alex D.,"Met with Alex at school following reports of increased absences and declining grades. Alex appeared sullen and avoided eye contact. When prompted about school, Alex expressed feelings of isolation and stated, ""No one gets me."" Scheduled a follow-up meeting to further explore these feelings."
+"January 17, 2023",Jane Smith,Alex D.,"Met with Alex at the community center. Alex displayed sudden outbursts of anger when discussing home life, particularly in relation to a new stepfather. Alex mentioned occasional substance use, but did not specify which substances. Recommended a comprehensive assessment."
+"February 5, 2023",Jane Smith,Alex D.,Home visit conducted. Alex's mother reported frequent arguments at home. She expressed concerns about Alex's new group of friends and late-night outings. Noted potential signs of substance abuse. Suggested family counseling.
+"February 21, 2023",Jane Smith,Alex D.,"Met with Alex alone at my office. Alex appeared more agitated than in previous meetings. There were visible signs of self-harm on Alex's arms. When questioned, Alex became defensive. Immediate referral made to a mental health professional."
+"March 10, 2023",Jane Smith,Alex D.,Attended joint session with Alex and a therapist. Alex shared feelings of hopelessness and admitted to occasional thoughts of self-harm. Therapist recommended a comprehensive mental health evaluation and ongoing therapy.
+"March 25, 2023",Jane Smith,Alex D.,"Received a call from Alex's school about a physical altercation with another student. Met with Alex, who displayed high levels of frustration and admitted to the use of alcohol. Discussed the importance of seeking help and finding positive coping mechanisms. Recommended enrollment in an anger management program."
+"April 15, 2023",Jane Smith,Alex D.,Met with Alex and mother to discuss progress. Alex's mother expressed concerns about Alex's increasing aggression at home. Alex acknowledged the issues but blamed others for provoking the behavior. It was decided that a more intensive intervention may be needed.
+"April 30, 2023",Jane Smith,Alex D.,"Met with Alex and a psychiatrist. Psychiatrist diagnosed Alex with Oppositional Defiant Disorder (ODD) and co-morbid substance use disorder. A treatment plan was discussed, including medication, therapy, and family counseling."
+"May 20, 2023",Jane Smith,Alex D.,"Met with Alex to discuss progress. Alex has started attending group therapy and has shown slight improvements in behavior. Still, concerns remain about substance use. Discussed potential for a short-term residential treatment program."
+"January 3, 2023",Jane Smith,Jamie L.,"Met with Jamie at school after receiving reports of consistent tardiness and decreased participation in class. Jamie appeared withdrawn and exhibited signs of sadness. When asked about feelings, Jamie expressed feeling ""empty"" and ""hopeless"" at times. Scheduled a follow-up meeting to further explore these feelings."
+"January 17, 2023",Jane Smith,Jamie L.,"Met with Jamie at the community center. Jamie shared feelings of low self-worth, mentioning that it's hard to find motivation for daily tasks. Discussed potential triggers and learned about recent family financial struggles. Recommended counseling and possible group therapy for peer support."
+"February 5, 2023",Jane Smith,Jamie L.,Home visit conducted. Jamie's parents shared concerns about Jamie's increasing withdrawal from family activities and lack of interest in hobbies. Parents mentioned that Jamie spends a lot of time alone in the room. Suggested family therapy to open communication channels.
+"February 21, 2023",Jane Smith,Jamie L.,Met with Jamie in my office. Jamie opened up about feelings of isolation and mentioned difficulty sleeping. No signs of self-harm or suicidal ideation were noted. Recommended a comprehensive mental health assessment to better understand the depth of the depression.
+"March 10, 2023",Jane Smith,Jamie L.,"Attended a joint session with Jamie and a therapist. The therapist noted signs of moderate depression. Together, we discussed coping strategies and potential interventions. Jamie showed interest in art therapy."
+"March 25, 2023",Jane Smith,Jamie L.,"Received feedback from Jamie's school that academic performance has slightly improved. However, social interactions remain limited. Encouraged Jamie to join school clubs or groups to foster connection."
+"April 15, 2023",Jane Smith,Jamie L.,"Met with Jamie and parents to discuss progress. Parents have observed slight improvements in mood on some days, but overall, Jamie still appears to struggle. It was decided to explore medication as a potential aid alongside therapy."
+"April 30, 2023",Jane Smith,Jamie L.,Met with Jamie and a psychiatrist. The psychiatrist diagnosed Jamie with Major Depressive Disorder (MDD) and suggested considering antidepressant medication. Discussed the potential benefits and side effects. Jamie and parents will think it over.
+"May 20, 2023",Jane Smith,Jamie L.,"Jamie has started on a low dose of an antidepressant. Initial feedback is positive, with some improvement in mood and energy levels. Will continue monitoring and adjusting as necessary."
diff --git a/example_data/combined_case_notes.xlsx b/example_data/combined_case_notes.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..a8a54440f124ce09bea793bf28d32382450e5d60
--- /dev/null
+++ b/example_data/combined_case_notes.xlsx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09300597024591d0b5b4ef97faef12fcceb28fcbb6ea09260bc42f43967753a4
+size 12579
diff --git a/example_data/doubled_output_joined.pdf b/example_data/doubled_output_joined.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..761c4c6668629ac96783b33fb7cacdcdafbc9be9
--- /dev/null
+++ b/example_data/doubled_output_joined.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6eeac353164447c2aa429196e1a6ffae4c095d7171e63c2d1cd1966fdf32d1ed
+size 1274719
diff --git a/example_data/example_complaint_letter.jpg b/example_data/example_complaint_letter.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4bff5ffffbe1294706aecf0898b971648ba40e14
--- /dev/null
+++ b/example_data/example_complaint_letter.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db33b67ebe685132a589593e4a3ca05f2dbce358b63de9142c2f2a36202e3f15
+size 117656
diff --git a/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf b/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..0c5d8f39257572556cc727b716f7a24026f14432
--- /dev/null
+++ b/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed0cd82b5b5826b851ca0e7c102d2d4d27580f7a90de4211a33178a6664d008d
+size 8848
diff --git a/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv b/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv
new file mode 100644
index 0000000000000000000000000000000000000000..ab019c9e82d9a5d76560dc116ce23895c2189702
--- /dev/null
+++ b/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv
@@ -0,0 +1,277 @@
+page,text,left,top,width,height,line
+1,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,1
+1,SisterCities,0.169804,0.033333,0.238431,0.028182,2
+1,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
+1,Toolkit,0.830588,0.07303,0.126667,0.025152,4
+1,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,5
+1,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
+1,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
+1,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
+1,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
+1,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
+1,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
+1,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
+1,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
+1,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
+1,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
+1,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
+1,Friendship City,0.118039,0.372121,0.127059,0.013939,17
+1,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
+1,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
+1,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
+1,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
+1,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
+1,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
+1,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
+1,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
+1,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
+1,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
+1,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
+1,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
+1,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
+1,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
+1,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
+1,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
+1,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
+1,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
+1,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
+1,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
+1,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
+1,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
+1,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
+1,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
+1,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
+1,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
+1,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
+1,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
+2,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,1
+2,SisterCities,0.169804,0.033333,0.238824,0.028182,2
+2,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
+2,Toolkit,0.83098,0.072727,0.127059,0.025455,4
+2,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,5
+2,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
+2,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
+2,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
+2,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
+2,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
+2,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
+2,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
+2,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
+2,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
+2,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
+2,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
+2,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
+2,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
+2,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
+2,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
+2,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
+2,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
+2,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
+2,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
+2,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
+2,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
+2,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
+2,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
+2,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
+2,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
+2,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
+2,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
+2,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
+2,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
+2,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
+2,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
+2,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
+2,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
+2,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
+2,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
+2,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
+2,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
+2,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
+2,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
+2,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,1
+2,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,2
+3,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,3
+3,SisterCities,0.169804,0.033333,0.239216,0.028182,4
+3,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,5
+3,Toolkit,0.83098,0.07303,0.126667,0.025152,6
+3,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,7
+3,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,8
+3,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,9
+3,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,10
+3,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,11
+3,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,12
+3,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,13
+3,and cooperation.,0.176471,0.25697,0.13451,0.013333,14
+3,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,15
+3,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,16
+3,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,17
+3,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,18
+3,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,19
+3,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,20
+3,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,21
+3,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,22
+3,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,23
+3,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,24
+3,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,25
+3,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,26
+3,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,27
+3,for their records.,0.176078,0.550606,0.131373,0.010606,28
+3,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,29
+3,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,30
+3,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,31
+3,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,32
+3,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,33
+3,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,34
+3,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,35
+3,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,36
+3,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,37
+3,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,38
+3,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,39
+3,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,40
+3,347-8630.,0.117647,0.799394,0.080392,0.010303,41
+4,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,1
+4,SisterCities,0.169412,0.033333,0.239608,0.028485,2
+4,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
+4,Toolkit,0.830588,0.072727,0.127843,0.025758,4
+4,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,5
+4,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
+4,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
+4,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
+4,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
+4,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
+4,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
+4,AND,0.487843,0.452727,0.048235,0.011212,12
+4,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
+4,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
+4,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
+4,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
+4,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
+4,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
+4,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
+4,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
+4,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
+4,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
+4,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
+4,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
+4,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
+4,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
+4,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
+4,A,0.344314,0.768485,0.084706,0.030303,28
+4,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
+4,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
+4,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,31
+4,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,32
+4,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
+5,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,1
+5,SisterCities,0.169412,0.033333,0.239608,0.028485,2
+5,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
+5,Toolkit,0.83098,0.072727,0.127059,0.025758,4
+5,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,5
+5,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
+5,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
+5,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
+5,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
+5,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
+5,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
+5,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
+5,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
+5,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
+5,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
+5,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
+5,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
+5,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
+5,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
+5,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
+5,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
+5,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
+5,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
+5,the cities;,0.22902,0.624545,0.076471,0.012424,24
+5,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
+5,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
+5,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
+5,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
+5,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
+5,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
+5,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
+5,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,32
+5,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,33
+5,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,34
+5,Mayor,0.311373,0.894848,0.053333,0.012727,35
+5,New York City,0.287843,0.909091,0.121176,0.013333,36
+5,London,0.701961,0.909091,0.061569,0.010606,37
+6,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,1
+6,SisterCities,0.169412,0.03303,0.24,0.028182,2
+6,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
+6,Toolkit,0.83098,0.072727,0.127451,0.025758,4
+6,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,5
+6,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
+6,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
+6,California,0.551373,0.257273,0.136471,0.033333,8
+6,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
+6,between the,0.464706,0.352727,0.084314,0.009697,10
+6,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
+6,"California, USA",0.4,0.397576,0.21098,0.016061,12
+6,and the,0.48,0.415152,0.053333,0.009091,13
+6,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
+6,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
+6,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
+6,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
+6,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
+6,purposes:,0.216863,0.516061,0.058039,0.009394,19
+6,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
+6,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
+6,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
+6,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
+6,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
+6,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
+6,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
+6,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
+6,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
+6,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
+6,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
+6,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
+6,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
+6,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
+6,"California, USA",0.582745,0.765758,0.125098,0.01303,34
+6,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
+6,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
+6,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
+6,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
+6,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
+7,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,1
+7,SisterCities,0.169412,0.03303,0.24,0.028485,2
+7,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
+7,Toolkit,0.83098,0.072727,0.127451,0.025758,4
+7,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,5
+7,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
+7,adopted by,0.2,0.213333,0.080392,0.013636,7
+7,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
+7,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
+7,and,0.199608,0.260909,0.026275,0.010606,10
+7,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
+7,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
+7,ON,0.551765,0.298182,0.026667,0.011515,13
+7,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
+7,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
+7,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
+7,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
+7,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
+7,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
+7,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
+7,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
+7,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
+7,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
+7,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
+7,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
+7,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
+7,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
+7,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
+7,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
+7,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
+7,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
+7,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
+7,3h.5.,0.593725,0.750606,0.218039,0.06303,33
+7,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,34
+7,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,35
+7,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,36
+7,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,37
diff --git a/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv b/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv
new file mode 100644
index 0000000000000000000000000000000000000000..29db00a58dd58ce9098c95b5f3a95db73ac62d5b
--- /dev/null
+++ b/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv
@@ -0,0 +1,77 @@
+image,page,label,color,xmin,ymin,xmax,ymax,id,text
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.598431,0.524545,0.63098,0.535455,EG3nykuwvxbk,U.S.
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.820392,0.798485,0.854118,0.809394,jy1R42e6phNz,U.S.
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.433333,0.863333,0.46549,0.873939,9sbrsroLfZy0,U.S.
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.354118,0.188788,0.386275,0.199697,k7bWBsQQchJZ,U.S.
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.780392,0.204848,0.812941,0.215758,peo6UqIxrjmR,U.S.
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,EMAIL,"(0, 0, 0)",0.447843,0.78303,0.648627,0.796667,DIfz0LenOtQv,akaplan@sister-cities.org
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.809804,0.78303,0.850196,0.796667,odJdySe9XrAn,(202)
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.117647,0.799394,0.198431,0.809697,iURSkUM7BbUG,347-8630
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.637647,0.432727,0.712941,0.44697,fRxAD9qm856s,U. A.E
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.489412,0.43303,0.614902,0.444545,qzRFPlNbslpH,ABU DHABI
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.385882,0.472121,0.593725,0.486364,v1uLbGsofN1f,"HOUSTON, TEXAS"
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.392549,0.539697,0.573725,0.549394,MvbPQiHvSdL7,United States of America
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.539216,0.553333,0.635686,0.563333,05U3cgj5w9PY,United States
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.534902,0.594242,0.615294,0.603939,uHMikyBlMq5f,Abu Dhabi
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.651373,0.594242,0.717255,0.605455,XNUE0GopIBaf,Houston
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.221569,0.65,0.301176,0.659697,6FjbNu2CGA9n,Abu Dhabi
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.337647,0.65,0.404314,0.660606,Yvmm2225ityu,Houston
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,HANDWRITING,"(0, 0, 0)",0.344314,0.768485,0.42902,0.798788,EwTcqq7PENU8,A
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806364,0.612549,0.817576,Mj4gqwbgsZWp,Sheikh Mohammed bin Butti AI Hamed
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.52,0.806364,0.612549,0.81697,RXYOVgLwq8Ke,AI Hamed
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.729412,0.806364,0.848235,0.816667,REPZhwFWGoTc,Lee P.Brown
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806667,0.51451,0.817576,rFdxMRFRWLRJ,Sheikh Mohammed bin Butti
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.366667,0.823939,0.465098,0.834242,5iYCxRGdPG1i,Abu Dhabi
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.577647,0.262121,0.68,0.271515,3ZR43H3yYNdy,NEW YORK
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.555294,0.303333,WNoitmR9A6lu,NEW YORK
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.658039,0.303333,HjrhxMQhovlF,NEW YORK N.Y. 10007
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.563137,0.29303,0.658039,0.302121,nPN7g7UcnX4u,N.Y. 10007
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.314118,0.356667,0.42549,0.367576,ZoJf29CB3Wrq,NEW YORK
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.655294,0.480909,0.718431,0.491515,iezAqmD2ilnb,London
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.708627,0.639394,0.837255,0.652727,tWAuJEQVpfhi,New York City
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.60902,0.64,0.67098,0.650606,NaW3mmmlhMW9,London
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.667059,0.702727,0.751373,0.713636,pgMiwuMiBp8B,New York
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.198824,0.720303,0.261569,0.731212,fPvElSFZFRoL,London
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,HANDWRITING,"(0, 0, 0)",0.178824,0.795455,0.281961,0.896364,DfniF7P2bXAw,Thedder
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.178824,0.795455,0.423529,0.896364,QwnWsAeslO5f,Thedder Rudolph W. Giuliani
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME - ADDRESS,"(0, 0, 0)",0.672157,0.877576,0.80549,0.891212,Vdp95SShYOEO,Ken Livingstone
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.710196,0.877576,0.80549,0.891212,H5DGqsucPAjc,Livingstone
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.672157,0.877879,0.705098,0.888182,qotGtnMbhAJr,Ken
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.287843,0.909091,0.40902,0.922727,sFX0tNJJzpE5,New York City
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.701961,0.909091,0.763922,0.919697,2xFbVTbxiOhC,London
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.55451,0.203636,0.86549,0.258485,Nfe3WTBembGQ,Long Beach
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551373,0.257273,0.687843,0.290606,kndQY5X4itc8,California
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.558824,0.397879,0.611373,0.410303,B5vq8yhWLeOg,USA
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425882,0.429091,0.691373,0.441818,OtNgqUkoEaZb,San Pablo de Manta
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.347451,0.447879,0.665098,0.46303,Q52VzBx2SWNF,"Ecuador, South America"
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.724314,0.482121,0.798431,0.493939,O7gd9ywvKsKh,"Long Beach,"
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.506275,0.502727,DzYr3xrM8Tvv,San Pablo de
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.715294,0.50303,iZ0knpQD54UU,"San Pablo de Manta, Ecundor, South America"
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.509804,0.49303,0.715294,0.50303,pZnYGzr7Pwsl,"Manta, Ecundor, South America"
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.217647,0.493333,0.321961,0.504242,r7Aar8FNQF6D,"California, USA"
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.471765,0.543636,0.596863,0.553939,zg9uBDlSuuA1,San Pablo de Manta
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.295294,0.544242,0.36549,0.556061,A0OY6RjMEocW,Long Beach
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.563137,0.655152,0.748627,0.667576,HQlTdEUhOCgI,"Long Beach, California, USA"
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.463529,0.665758,0.557255,0.674848,bCN9b7kJw0Ik,South America
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.277647,0.666061,0.403529,0.676061,qffN3bDgWRMk,San Pablo de Manta
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.587451,0.736667,0.709804,0.750303,eqMENFw5mbnL,Beverly 0 Neill
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.663137,0.751212,0.753333,0.764545,POqPQVBCES8h,Long Beach
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.582745,0.765758,0.708235,0.779091,mjrjsSMOxwaY,"California, USA"
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,HANDWRITING,"(0, 0, 0)",0.490588,0.771818,0.71098,0.834242,xL8dSawihWuY,10.2aulus
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,NAME,"(0, 0, 0)",0.559608,0.825152,0.769804,0.838485,fHyvwmbOgLMJ,Jorge O. Zambrano Cedeño
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.624314,0.839394,0.782745,0.850303,zGhskyehufSv,San Pablo de Manta
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551765,0.854242,0.74,0.866061,dSPXmtb8M4nt,"Ecuador, South America"
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.556471,0.215152,0.731765,0.226667,BEhuvaI5BVaR,RICHARD M. DALEY
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.563137,0.261212,0.725098,0.272424,coo8KK7q6A72,ZHANG RONGMAO
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.566275,0.273636,0.666275,0.285152,0P9rVSbeNdB4,SHENYANG
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.526667,0.380303,0.588235,0.394242,1GDArufutI5y,Chicago
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.628235,0.380606,0.702353,0.394242,QyD751r4fCU1,Shenyang
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.736863,0.411515,0.868235,0.424545,rntIekANI8BO,Zhang Rongmao
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.199216,0.411818,0.34,0.424848,96TaHazXGIM7,Richard M. Daley
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.514902,0.412424,0.580784,0.425758,kbyVj6qhZSPi,Chicago
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.696471,0.443939,0.774118,0.45697,rJpaMvepsNln,Shenyang
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.353725,0.474545,0.415686,0.489091,PokCVpLQmDki,Chicago
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.407451,0.554545,0.469804,0.568182,HqVr414KRg59,Chicago
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,HANDWRITING,"(0, 0, 0)",0.593725,0.750606,0.811765,0.813636,xdawEv0DUH6P,3h.5.
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.730196,0.819394,0.876471,0.830606,Gghr7ccN6lS2,ZHANG RONGMAO
+C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.34,0.821515,0.501176,0.831515,vOMIv1RS5Sag,RICHARD M. DALEY
diff --git a/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv b/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv
new file mode 100644
index 0000000000000000000000000000000000000000..5110366f0a8224cd8389c3186622a50f0be464e5
--- /dev/null
+++ b/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv
@@ -0,0 +1,2438 @@
+page,line,word_text,word_x0,word_y0,word_x1,word_y1,line_text,line_x0,line_y0,line_x1,line_y1
+1,2,SisterCities,0.169804,0.033333,0.408627,0.061515,,,,,
+1,3,Partnership,0.516078,0.027879,0.733333,0.060303,,,,,
+1,3,Agreement,0.747843,0.028182,0.957255,0.060303,,,,,
+1,4,INTERNATIONAL,0.170196,0.06697,0.408235,0.075758,,,,,
+1,5,Connect,0.169804,0.087273,0.236078,0.097576,,,,,
+1,5,globally.,0.240784,0.087273,0.301569,0.100303,,,,,
+1,5,Thrive,0.307059,0.08697,0.35451,0.097576,,,,,
+1,5,locally.,0.358824,0.087273,0.40902,0.100303,,,,,
+1,6,Toolkit,0.830588,0.07303,0.957647,0.098485,,,,,
+1,7,Types,0.117255,0.158182,0.190588,0.177879,,,,,
+1,7,of,0.199216,0.157879,0.223922,0.173939,,,,,
+1,7,Affiliations,0.23098,0.157576,0.359216,0.173939,,,,,
+1,8,Sister,0.117647,0.187273,0.166667,0.198788,,,,,
+1,8,City,0.171765,0.187273,0.205098,0.201515,,,,,
+1,8,Relationship,0.21098,0.187273,0.314902,0.201515,,,,,
+1,9,A,0.117255,0.211515,0.129804,0.222121,,,,,
+1,9,Sister,0.13451,0.211212,0.180392,0.222121,,,,,
+1,9,City,0.185882,0.211212,0.216863,0.225152,,,,,
+1,9,relationship,0.222745,0.211515,0.313725,0.224848,,,,,
+1,9,is,0.320392,0.211515,0.332157,0.222121,,,,,
+1,9,formed,0.337647,0.211212,0.393333,0.222121,,,,,
+1,9,when,0.399216,0.211515,0.442745,0.222121,,,,,
+1,9,the,0.447843,0.211515,0.473725,0.222121,,,,,
+1,9,mayor,0.479608,0.213939,0.529804,0.224545,,,,,
+1,9,or,0.534118,0.213939,0.550588,0.222121,,,,,
+1,9,highest,0.555686,0.211515,0.613725,0.224848,,,,,
+1,9,elected,0.619216,0.211515,0.676078,0.222121,,,,,
+1,9,official,0.682745,0.211212,0.733725,0.222121,,,,,
+1,9,"(or,",0.74,0.211212,0.764706,0.224848,,,,,
+1,9,if,0.771373,0.211212,0.780784,0.222121,,,,,
+1,9,elections,0.78549,0.211515,0.856471,0.222424,,,,,
+1,10,do,0.117647,0.227879,0.137647,0.238485,,,,,
+1,10,not,0.143922,0.228182,0.16902,0.238485,,,,,
+1,10,take,0.174118,0.227879,0.208627,0.238485,,,,,
+1,10,"place,",0.214118,0.227879,0.261176,0.241212,,,,,
+1,10,highest,0.267843,0.227879,0.32549,0.241212,,,,,
+1,10,appointed,0.331373,0.227879,0.410588,0.241212,,,,,
+1,10,official),0.417255,0.227273,0.47451,0.241212,,,,,
+1,10,from,0.478824,0.227576,0.514902,0.238485,,,,,
+1,10,a,0.521176,0.230303,0.530588,0.238485,,,,,
+1,10,U.S.,0.536471,0.227576,0.56902,0.238485,,,,,
+1,10,community,0.575294,0.227879,0.664314,0.241212,,,,,
+1,10,and,0.66902,0.227273,0.698824,0.238788,,,,,
+1,10,a,0.704706,0.230303,0.714118,0.238485,,,,,
+1,10,community,0.719608,0.227879,0.808627,0.241212,,,,,
+1,10,in,0.814118,0.227879,0.826667,0.238485,,,,,
+1,11,another,0.117647,0.243939,0.179216,0.254545,,,,,
+1,11,country,0.183922,0.244242,0.244706,0.257273,,,,,
+1,11,or,0.249412,0.246667,0.265882,0.254545,,,,,
+1,11,territory,0.270196,0.243939,0.332941,0.257273,,,,,
+1,11,sign,0.338039,0.243939,0.370196,0.257273,,,,,
+1,11,a,0.376078,0.246364,0.38549,0.254545,,,,,
+1,11,formal,0.390588,0.243636,0.440784,0.254545,,,,,
+1,11,agreement,0.446667,0.244242,0.531373,0.257273,,,,,
+1,11,on,0.537255,0.246364,0.556863,0.254545,,,,,
+1,11,behalf,0.563137,0.243636,0.612549,0.254545,,,,,
+1,11,of,0.616863,0.243636,0.632941,0.254545,,,,,
+1,11,their,0.637255,0.243636,0.672941,0.254242,,,,,
+1,11,communities,0.677647,0.243939,0.779608,0.254545,,,,,
+1,11,endorsing,0.78549,0.243939,0.864314,0.257273,,,,,
+1,11,a,0.870588,0.246364,0.88,0.254545,,,,,
+1,12,"""sister",0.118039,0.259697,0.169412,0.270606,,,,,
+1,12,city/sister,0.174118,0.259697,0.251373,0.273333,,,,,
+1,12,"cities""",0.256078,0.259697,0.303922,0.270606,,,,,
+1,12,relationship.,0.31098,0.26,0.407059,0.273333,,,,,
+1,12,Sister,0.413725,0.259697,0.459608,0.270606,,,,,
+1,12,city,0.464706,0.26,0.492941,0.273636,,,,,
+1,12,agreements,0.498431,0.260303,0.591765,0.273333,,,,,
+1,12,shall,0.597647,0.259697,0.633725,0.270606,,,,,
+1,12,be,0.64,0.26,0.659608,0.270606,,,,,
+1,12,considered,0.664706,0.26,0.753725,0.270606,,,,,
+1,12,active/valid,0.76,0.259697,0.849412,0.270606,,,,,
+1,13,unless,0.118039,0.276364,0.168235,0.28697,,,,,
+1,13,otherwise,0.174118,0.276364,0.252157,0.28697,,,,,
+1,13,indicated,0.257647,0.276364,0.329804,0.28697,,,,,
+1,13,by,0.336078,0.276061,0.355686,0.289697,,,,,
+1,13,one,0.360392,0.278788,0.390196,0.28697,,,,,
+1,13,or,0.395686,0.278788,0.412157,0.28697,,,,,
+1,13,both,0.417255,0.276364,0.452549,0.28697,,,,,
+1,13,of,0.458431,0.276061,0.474902,0.28697,,,,,
+1,13,the,0.478824,0.276364,0.504314,0.287273,,,,,
+1,13,respective,0.509804,0.276364,0.592157,0.289697,,,,,
+1,13,communities.,0.597647,0.276364,0.703529,0.28697,,,,,
+1,14,Sister,0.118039,0.299697,0.163922,0.310606,,,,,
+1,14,Cities,0.16902,0.299697,0.212941,0.310606,,,,,
+1,14,International,0.219608,0.299697,0.316863,0.310606,,,,,
+1,14,shall,0.322745,0.299697,0.358824,0.310606,,,,,
+1,14,formally,0.363922,0.299697,0.427451,0.313333,,,,,
+1,14,recognize,0.432941,0.3,0.510588,0.313333,,,,,
+1,14,only,0.516078,0.3,0.549412,0.313333,,,,,
+1,14,those,0.554118,0.3,0.599216,0.310606,,,,,
+1,14,relationships,0.604314,0.3,0.705098,0.313333,,,,,
+1,14,by,0.711373,0.299697,0.730196,0.313333,,,,,
+1,14,cities/members,0.735686,0.299697,0.858824,0.310606,,,,,
+1,14,in,0.863922,0.3,0.876863,0.310606,,,,,
+1,15,good,0.117647,0.316364,0.157647,0.329697,,,,,
+1,15,standing,0.164314,0.316364,0.232549,0.329697,,,,,
+1,15,(i.e.,0.238824,0.316061,0.26549,0.329697,,,,,
+1,15,who,0.27098,0.316364,0.304706,0.32697,,,,,
+1,15,are,0.310588,0.318788,0.335686,0.32697,,,,,
+1,15,current,0.341176,0.316667,0.397647,0.32697,,,,,
+1,15,on,0.402745,0.318788,0.422353,0.32697,,,,,
+1,15,membership,0.428235,0.316061,0.527451,0.329697,,,,,
+1,15,dues),0.532941,0.316364,0.577647,0.329394,,,,,
+1,15,in,0.582745,0.316364,0.595294,0.32697,,,,,
+1,15,its,0.601569,0.316364,0.619216,0.32697,,,,,
+1,15,Membership,0.62549,0.316364,0.723922,0.329697,,,,,
+1,15,Directory,0.73098,0.316364,0.802745,0.329394,,,,,
+1,15,or,0.808235,0.318788,0.824706,0.32697,,,,,
+1,15,on,0.82902,0.318788,0.848627,0.327273,,,,,
+1,15,its,0.85451,0.316061,0.872549,0.32697,,,,,
+1,16,website.,0.116863,0.332424,0.183529,0.343333,,,,,
+1,16,"However,",0.190588,0.332424,0.264314,0.344848,,,,,
+1,16,Sister,0.271373,0.332121,0.317255,0.34303,,,,,
+1,16,Cities,0.322353,0.332121,0.366667,0.34303,,,,,
+1,16,International,0.372941,0.332121,0.470196,0.34303,,,,,
+1,16,shall,0.476471,0.332121,0.512157,0.34303,,,,,
+1,16,not,0.518431,0.332727,0.543529,0.34303,,,,,
+1,16,assert,0.54902,0.332727,0.596863,0.343333,,,,,
+1,16,as,0.602353,0.334848,0.620784,0.343333,,,,,
+1,16,invalid,0.627059,0.332424,0.676078,0.343333,,,,,
+1,16,or,0.682745,0.334848,0.699216,0.34303,,,,,
+1,16,otherwise,0.703922,0.332424,0.781176,0.343333,,,,,
+1,16,impugn,0.787059,0.332121,0.845882,0.345758,,,,,
+1,16,the,0.851765,0.332424,0.877647,0.343333,,,,,
+1,17,legitimacy,0.118039,0.348485,0.198431,0.362121,,,,,
+1,17,of,0.203529,0.348485,0.22,0.359394,,,,,
+1,17,those,0.223922,0.348485,0.26902,0.359394,,,,,
+1,17,relationships,0.27451,0.348485,0.375294,0.361818,,,,,
+1,17,formed,0.380392,0.348485,0.436471,0.359394,,,,,
+1,17,by,0.443137,0.348485,0.462745,0.362121,,,,,
+1,17,non-members.,0.468235,0.348485,0.584314,0.359394,,,,,
+1,18,Friendship,0.118039,0.372121,0.205098,0.386061,,,,,
+1,18,City,0.212157,0.372121,0.24549,0.386061,,,,,
+1,19,A,0.117255,0.396364,0.129804,0.40697,,,,,
+1,19,Friendship,0.135294,0.396061,0.216863,0.409394,,,,,
+1,19,City,0.223922,0.396061,0.254902,0.409697,,,,,
+1,19,or,0.26,0.398788,0.276471,0.40697,,,,,
+1,19,Friendship,0.281569,0.396061,0.364314,0.409394,,,,,
+1,19,Cities,0.37098,0.396061,0.414902,0.40697,,,,,
+1,19,relationship,0.421176,0.395758,0.513333,0.409091,,,,,
+1,19,is,0.519608,0.396061,0.531373,0.406667,,,,,
+1,19,often,0.536863,0.395758,0.577255,0.40697,,,,,
+1,19,formed,0.583137,0.395758,0.639608,0.40697,,,,,
+1,19,by,0.645882,0.395758,0.665098,0.409697,,,,,
+1,19,cities,0.670588,0.396061,0.711765,0.40697,,,,,
+1,19,as,0.717255,0.398485,0.734902,0.40697,,,,,
+1,19,a,0.741176,0.398485,0.750588,0.407273,,,,,
+1,19,"""stepping",0.755686,0.396061,0.831373,0.409697,,,,,
+1,20,"stone""",0.117647,0.412121,0.168627,0.42303,,,,,
+1,20,to,0.174118,0.412727,0.190196,0.42303,,,,,
+1,20,a,0.196078,0.414545,0.20549,0.42303,,,,,
+1,20,more,0.211373,0.414545,0.251373,0.42303,,,,,
+1,20,formal,0.256471,0.411818,0.306667,0.42303,,,,,
+1,20,"""Sister",0.312549,0.411818,0.366275,0.42303,,,,,
+1,20,"City""",0.371373,0.411515,0.409412,0.425758,,,,,
+1,20,agreement.,0.415294,0.412727,0.504314,0.425758,,,,,
+1,20,Typically,0.510588,0.412121,0.581176,0.425758,,,,,
+1,20,Friendship,0.587451,0.411818,0.669804,0.425455,,,,,
+1,20,City,0.676078,0.411818,0.707451,0.425758,,,,,
+1,20,agreements,0.712549,0.41303,0.806275,0.425758,,,,,
+1,20,are,0.812549,0.414545,0.838039,0.423333,,,,,
+1,21,referred,0.118039,0.428182,0.179216,0.439091,,,,,
+1,21,to,0.185098,0.428788,0.200784,0.439091,,,,,
+1,21,as,0.206275,0.430909,0.224706,0.439091,,,,,
+1,21,such,0.230196,0.428182,0.267843,0.439091,,,,,
+1,21,in,0.274118,0.428182,0.286667,0.439091,,,,,
+1,21,the,0.292157,0.428485,0.317647,0.439091,,,,,
+1,21,formal,0.322353,0.428182,0.372549,0.439091,,,,,
+1,21,documents,0.378431,0.428182,0.467843,0.439091,,,,,
+1,21,that,0.473333,0.428485,0.503922,0.439091,,,,,
+1,21,are,0.509412,0.430909,0.53451,0.439091,,,,,
+1,21,signed.,0.539608,0.428182,0.596863,0.441818,,,,,
+1,21,Sister,0.603922,0.428182,0.649804,0.439091,,,,,
+1,21,Cities,0.65451,0.428182,0.698824,0.439091,,,,,
+1,21,International,0.70549,0.428485,0.802745,0.439091,,,,,
+1,21,shall,0.808627,0.428182,0.843529,0.439091,,,,,
+1,22,recognize,0.118039,0.444545,0.195686,0.458182,,,,,
+1,22,Friendship,0.201569,0.444545,0.283529,0.457879,,,,,
+1,22,City,0.290196,0.444545,0.321176,0.458182,,,,,
+1,22,relationships,0.326667,0.444545,0.427843,0.457879,,,,,
+1,22,by,0.433725,0.444242,0.453333,0.458182,,,,,
+1,22,members,0.458431,0.444545,0.532941,0.455455,,,,,
+1,22,in,0.539216,0.444848,0.551765,0.455455,,,,,
+1,22,its,0.557647,0.444848,0.575686,0.455455,,,,,
+1,22,Membership,0.581569,0.444545,0.680784,0.457879,,,,,
+1,22,Directory,0.687843,0.444545,0.759608,0.458182,,,,,
+1,22,and,0.765098,0.444848,0.793333,0.455455,,,,,
+1,22,website.,0.799608,0.444545,0.865882,0.455455,,,,,
+1,23,As,0.117255,0.468485,0.137647,0.478788,,,,,
+1,23,per,0.143529,0.470909,0.169804,0.481818,,,,,
+1,23,Sister,0.174902,0.467879,0.221176,0.479091,,,,,
+1,23,Cities,0.225882,0.467879,0.269804,0.479091,,,,,
+1,23,International,0.276471,0.468485,0.373725,0.479091,,,,,
+1,23,Board,0.38,0.468182,0.427059,0.479091,,,,,
+1,23,of,0.433333,0.468182,0.450196,0.479091,,,,,
+1,23,Directors:,0.454902,0.468182,0.53098,0.479091,,,,,
+1,24,Sister,0.169412,0.492121,0.215294,0.503333,,,,,
+1,24,Cities,0.220392,0.492121,0.264706,0.50303,,,,,
+1,24,International,0.271373,0.492121,0.368235,0.50303,,,,,
+1,24,will,0.374118,0.492121,0.399216,0.50303,,,,,
+1,24,recognize,0.405098,0.492424,0.482745,0.505758,,,,,
+1,24,a,0.488235,0.494848,0.497647,0.50303,,,,,
+1,24,new,0.503137,0.494848,0.534902,0.50303,,,,,
+1,24,sister,0.541176,0.492424,0.584706,0.503333,,,,,
+1,24,cities,0.589412,0.492424,0.63098,0.50303,,,,,
+1,24,affiliation,0.636471,0.492121,0.707059,0.50303,,,,,
+1,24,between,0.713333,0.492424,0.780392,0.50303,,,,,
+1,24,a,0.787059,0.494848,0.796471,0.50303,,,,,
+1,25,U.S.,0.169412,0.507879,0.202353,0.519394,,,,,
+1,25,and,0.208627,0.508182,0.237647,0.519091,,,,,
+1,25,an,0.244314,0.510909,0.262745,0.519091,,,,,
+1,25,international,0.26902,0.508182,0.366275,0.519091,,,,,
+1,25,"community,",0.372157,0.508485,0.465098,0.521818,,,,,
+1,25,even,0.471373,0.510909,0.508627,0.519091,,,,,
+1,25,though,0.514118,0.508182,0.569412,0.521818,,,,,
+1,25,another,0.576078,0.508182,0.637255,0.519394,,,,,
+1,25,affiliation,0.642353,0.508182,0.712941,0.519091,,,,,
+1,25,may,0.719216,0.510909,0.752941,0.521818,,,,,
+1,25,exist,0.758039,0.508182,0.79451,0.519394,,,,,
+1,26,between,0.169412,0.524545,0.237255,0.535455,,,,,
+1,26,that,0.242745,0.524545,0.273725,0.535455,,,,,
+1,26,international,0.279608,0.524545,0.376078,0.535455,,,,,
+1,26,community,0.382353,0.524545,0.471373,0.537879,,,,,
+1,26,and,0.476471,0.524545,0.505098,0.535455,,,,,
+1,26,a,0.512157,0.527273,0.521569,0.535455,,,,,
+1,26,different,0.526667,0.524545,0.592157,0.535455,,,,,
+1,26,U.S.,0.598431,0.524545,0.63098,0.535455,,,,,
+1,26,"community,",0.638039,0.524545,0.730588,0.538182,,,,,
+1,26,only,0.736863,0.524545,0.769412,0.538182,,,,,
+1,26,if,0.775294,0.524545,0.784314,0.535455,,,,,
+1,26,a,0.789412,0.527273,0.798431,0.535455,,,,,
+1,27,cooperative,0.16902,0.540909,0.263529,0.554242,,,,,
+1,27,agreement,0.26902,0.541212,0.354118,0.554545,,,,,
+1,27,among,0.359608,0.543333,0.413725,0.554242,,,,,
+1,27,all,0.42,0.540909,0.436863,0.551515,,,,,
+1,27,involved,0.442745,0.540909,0.507451,0.551515,,,,,
+1,27,communities,0.514118,0.540606,0.616078,0.551818,,,,,
+1,27,is,0.621961,0.540909,0.634118,0.551515,,,,,
+1,27,filed,0.639608,0.540606,0.671765,0.551515,,,,,
+1,27,with,0.678039,0.540606,0.71098,0.551515,,,,,
+1,27,Sister,0.717255,0.540606,0.763137,0.551515,,,,,
+1,27,Cities,0.768235,0.540606,0.812549,0.551515,,,,,
+1,28,International.,0.170196,0.556667,0.271765,0.567576,,,,,
+1,28,If,0.279216,0.556667,0.288627,0.567576,,,,,
+1,28,a,0.293333,0.559394,0.302745,0.567576,,,,,
+1,28,cooperative,0.307843,0.55697,0.401569,0.57,,,,,
+1,28,agreement,0.407451,0.557273,0.492549,0.570303,,,,,
+1,28,is,0.498824,0.55697,0.510588,0.567576,,,,,
+1,28,"denied,",0.516471,0.55697,0.57451,0.569697,,,,,
+1,28,or,0.580784,0.559394,0.597255,0.567576,,,,,
+1,28,no,0.602745,0.559394,0.621569,0.567576,,,,,
+1,28,response,0.627451,0.559394,0.701569,0.570303,,,,,
+1,28,to,0.705882,0.557576,0.721961,0.567576,,,,,
+1,28,the,0.727059,0.556667,0.752941,0.567576,,,,,
+1,28,request,0.758431,0.557273,0.818039,0.570303,,,,,
+1,29,is,0.169412,0.573333,0.181176,0.583939,,,,,
+1,29,received,0.187451,0.57303,0.253725,0.583939,,,,,
+1,29,within,0.26,0.57303,0.306667,0.583939,,,,,
+1,29,a,0.312549,0.575758,0.321961,0.584242,,,,,
+1,29,reasonable,0.327451,0.573333,0.415686,0.583939,,,,,
+1,29,amount,0.421176,0.573636,0.481176,0.584242,,,,,
+1,29,of,0.486667,0.57303,0.503137,0.583939,,,,,
+1,29,"time,",0.507059,0.573333,0.545882,0.585758,,,,,
+1,29,Sister,0.552157,0.57303,0.598039,0.583939,,,,,
+1,29,Cities,0.603137,0.57303,0.647451,0.583939,,,,,
+1,29,International,0.654118,0.57303,0.751373,0.583939,,,,,
+1,29,will,0.756471,0.57303,0.781569,0.583939,,,,,
+1,30,recognize,0.169412,0.589091,0.247059,0.602727,,,,,
+1,30,the,0.252157,0.589091,0.277647,0.6,,,,,
+1,30,partnership,0.283137,0.589091,0.372157,0.602727,,,,,
+1,30,as,0.378824,0.591818,0.396863,0.6,,,,,
+1,30,a,0.402353,0.591818,0.411765,0.6,,,,,
+1,30,friendship,0.418039,0.589091,0.497255,0.602424,,,,,
+1,30,city,0.502745,0.589394,0.532157,0.602424,,,,,
+1,30,and,0.535294,0.589394,0.563529,0.6,,,,,
+1,30,it,0.570588,0.589091,0.579216,0.599697,,,,,
+1,30,will,0.583922,0.589091,0.60902,0.6,,,,,
+1,30,be,0.615686,0.589091,0.635686,0.6,,,,,
+1,30,delineated,0.640784,0.589091,0.722353,0.6,,,,,
+1,30,as,0.72902,0.591818,0.747059,0.6,,,,,
+1,30,such,0.752941,0.589091,0.790588,0.6,,,,,
+1,31,with,0.168627,0.605455,0.201569,0.616364,,,,,
+1,31,a,0.207451,0.608182,0.216863,0.616364,,,,,
+1,31,symbol,0.222353,0.605455,0.279608,0.618788,,,,,
+1,31,in,0.285882,0.605455,0.298431,0.616061,,,,,
+1,31,the,0.303922,0.605758,0.329412,0.616364,,,,,
+1,31,membership,0.334902,0.605455,0.433333,0.619091,,,,,
+1,31,directories.,0.439608,0.605758,0.527843,0.616364,,,,,
+1,32,The,0.168627,0.629091,0.198824,0.64,,,,,
+1,32,cooperative,0.204314,0.629394,0.298431,0.642727,,,,,
+1,32,agreement,0.303922,0.629697,0.38902,0.64303,,,,,
+1,32,must,0.395294,0.629697,0.437255,0.64,,,,,
+1,32,be,0.440392,0.629394,0.461176,0.64,,,,,
+1,32,sent,0.466275,0.629697,0.501569,0.64,,,,,
+1,32,by,0.505098,0.629091,0.527451,0.64303,,,,,
+1,32,the,0.529412,0.629394,0.554902,0.64,,,,,
+1,32,Mayor/County,0.560392,0.628788,0.678431,0.642727,,,,,
+1,33,Executive/Governor,0.169804,0.645152,0.331373,0.656667,,,,,
+1,33,of,0.332941,0.645152,0.349412,0.656667,,,,,
+1,33,the,0.353333,0.645455,0.378431,0.656364,,,,,
+1,33,requesting,0.383922,0.645758,0.467451,0.659394,,,,,
+1,33,"community,",0.474118,0.645758,0.566667,0.658788,,,,,
+1,33,and,0.572941,0.645758,0.601569,0.656061,,,,,
+1,33,must,0.60902,0.645758,0.65098,0.656061,,,,,
+1,33,be,0.654118,0.645758,0.674902,0.656364,,,,,
+1,33,sent,0.68,0.645758,0.715294,0.656061,,,,,
+1,33,to,0.719216,0.645758,0.735686,0.656364,,,,,
+1,33,the,0.739608,0.645455,0.765098,0.656364,,,,,
+1,34,Mayor/County,0.169804,0.661515,0.286667,0.675152,,,,,
+1,34,Executive/Governor,0.290196,0.661212,0.450196,0.672727,,,,,
+1,34,of,0.452549,0.661515,0.46902,0.672121,,,,,
+1,34,each,0.473333,0.661818,0.51098,0.672424,,,,,
+1,34,of,0.517255,0.661515,0.533333,0.672424,,,,,
+1,34,the,0.537255,0.661818,0.562745,0.672121,,,,,
+1,34,existing,0.568235,0.661515,0.628627,0.675152,,,,,
+1,34,partnership,0.635294,0.661818,0.725098,0.675152,,,,,
+1,35,communities.,0.16902,0.677879,0.275294,0.688788,,,,,
+1,35,Although,0.281569,0.678182,0.352941,0.691515,,,,,
+1,35,the,0.358824,0.678182,0.384314,0.688788,,,,,
+1,35,Mayor/County,0.389804,0.677879,0.506667,0.691818,,,,,
+1,35,Executive/Governor may,0.510196,0.677879,0.706275,0.691818,,,,,
+1,35,request,0.711765,0.678485,0.771373,0.691515,,,,,
+1,35,input,0.777255,0.677879,0.816471,0.691515,,,,,
+1,36,"from,",0.168627,0.693939,0.209804,0.706667,,,,,
+1,36,or,0.216078,0.69697,0.232549,0.704848,,,,,
+1,36,may,0.237647,0.696364,0.271373,0.707576,,,,,
+1,36,be,0.276863,0.693939,0.296471,0.704545,,,,,
+1,36,given,0.301961,0.694545,0.343922,0.707576,,,,,
+1,36,input,0.349804,0.694545,0.389412,0.707576,,,,,
+1,36,"by,",0.395294,0.693939,0.418431,0.707879,,,,,
+1,36,the,0.423922,0.693939,0.449804,0.704848,,,,,
+1,36,sister,0.454902,0.693939,0.498431,0.704848,,,,,
+1,36,cities,0.503137,0.693939,0.544314,0.704545,,,,,
+1,36,"program,",0.550588,0.696667,0.621569,0.707273,,,,,
+1,36,it,0.628627,0.693939,0.637647,0.704848,,,,,
+1,36,is,0.643529,0.693939,0.655686,0.704848,,,,,
+1,36,up,0.661176,0.696364,0.680784,0.707273,,,,,
+1,36,to,0.685882,0.694545,0.701961,0.704848,,,,,
+1,36,the,0.707451,0.693939,0.732941,0.704545,,,,,
+1,36,discretion,0.737647,0.693939,0.816078,0.704545,,,,,
+1,37,of,0.16902,0.71,0.184706,0.720909,,,,,
+1,37,the,0.189412,0.71,0.214902,0.720606,,,,,
+1,37,Mayor/County,0.220392,0.71,0.338039,0.723636,,,,,
+1,37,Executive/Governor,0.341176,0.709697,0.501176,0.721212,,,,,
+1,37,to,0.503529,0.710606,0.518824,0.720606,,,,,
+1,37,sign,0.523922,0.71,0.556863,0.723333,,,,,
+1,37,the,0.561961,0.710303,0.587843,0.720606,,,,,
+1,37,cooperative,0.592941,0.710303,0.686667,0.723333,,,,,
+1,37,agreement.,0.692549,0.710606,0.781961,0.723636,,,,,
+1,38,Although,0.168627,0.726364,0.24,0.74,,,,,
+1,38,Sister,0.246667,0.726364,0.292157,0.737273,,,,,
+1,38,Cities,0.297255,0.726364,0.341569,0.737273,,,,,
+1,38,International,0.347843,0.726667,0.445098,0.73697,,,,,
+1,38,will,0.450588,0.726364,0.476078,0.737273,,,,,
+1,38,help,0.481961,0.726364,0.514902,0.739697,,,,,
+1,38,with,0.520392,0.726364,0.553725,0.737273,,,,,
+1,38,the,0.559216,0.726364,0.584314,0.73697,,,,,
+1,38,cooperative,0.589804,0.726667,0.683529,0.74,,,,,
+1,38,agreement,0.689412,0.72697,0.77451,0.74,,,,,
+1,39,"process,",0.169412,0.745152,0.236863,0.755455,,,,,
+1,39,it,0.243922,0.742424,0.252549,0.753333,,,,,
+1,39,is,0.258431,0.742424,0.270196,0.753333,,,,,
+1,39,up,0.276471,0.745152,0.295294,0.756364,,,,,
+1,39,to,0.301176,0.743333,0.316471,0.75303,,,,,
+1,39,the,0.322353,0.742424,0.347843,0.753333,,,,,
+1,39,requesting,0.353333,0.742727,0.436471,0.755758,,,,,
+1,39,community,0.442745,0.742727,0.531765,0.756364,,,,,
+1,39,to,0.536863,0.743333,0.552157,0.75303,,,,,
+1,39,get,0.558039,0.743333,0.583137,0.756364,,,,,
+1,39,the,0.588235,0.742424,0.613725,0.753333,,,,,
+1,39,agreement,0.619216,0.74303,0.703922,0.756061,,,,,
+1,39,signed.,0.710196,0.742727,0.766667,0.756061,,,,,
+1,39,Sister,0.774118,0.742121,0.82,0.753333,,,,,
+1,40,Cities,0.16902,0.758182,0.213333,0.769697,,,,,
+1,40,International,0.22,0.758485,0.317647,0.769394,,,,,
+1,40,will,0.323529,0.758788,0.348235,0.769394,,,,,
+1,40,"not,",0.35451,0.759091,0.383922,0.771515,,,,,
+1,40,in,0.390588,0.758788,0.403137,0.769091,,,,,
+1,40,any,0.40902,0.760909,0.437647,0.772727,,,,,
+1,40,"way,",0.442745,0.761212,0.479216,0.772424,,,,,
+1,40,force,0.484706,0.758182,0.52549,0.769394,,,,,
+1,40,a,0.531373,0.761515,0.540784,0.769697,,,,,
+1,40,community,0.545882,0.758788,0.63451,0.772727,,,,,
+1,40,to,0.639216,0.759394,0.655686,0.769394,,,,,
+1,40,"""share""",0.661569,0.758182,0.719608,0.769394,,,,,
+1,40,and,0.72549,0.758788,0.75451,0.769697,,,,,
+1,40,sign,0.760784,0.758788,0.793333,0.772424,,,,,
+1,41,the,0.168627,0.774848,0.194118,0.785758,,,,,
+1,41,cooperative,0.199608,0.775152,0.293725,0.788485,,,,,
+1,41,agreement.,0.299216,0.775152,0.388235,0.788485,,,,,
+1,42,To,0.117255,0.798788,0.137255,0.809697,,,,,
+1,42,place,0.143529,0.799091,0.186667,0.812121,,,,,
+1,42,a,0.192157,0.801818,0.201176,0.809697,,,,,
+1,42,relationship,0.207059,0.799091,0.298039,0.812121,,,,,
+1,42,into,0.304706,0.798788,0.333725,0.809394,,,,,
+1,42,Emeritus,0.34,0.798788,0.412941,0.81,,,,,
+1,42,"status,",0.418824,0.799091,0.471373,0.811515,,,,,
+1,42,the,0.477647,0.798788,0.503137,0.809394,,,,,
+1,42,mayor,0.508235,0.801212,0.558824,0.812424,,,,,
+1,42,or,0.563529,0.801818,0.58,0.809697,,,,,
+1,42,highest,0.585098,0.798788,0.642745,0.812424,,,,,
+1,42,elected,0.648235,0.798788,0.70549,0.809697,,,,,
+1,42,official,0.712157,0.798485,0.763137,0.809697,,,,,
+1,42,of,0.76902,0.798485,0.785098,0.809697,,,,,
+1,42,the,0.789412,0.798788,0.814902,0.809394,,,,,
+1,42,U.S.,0.820392,0.798485,0.854118,0.809394,,,,,
+1,43,community,0.118039,0.815152,0.206667,0.828182,,,,,
+1,43,must,0.212157,0.815455,0.251373,0.825455,,,,,
+1,43,write,0.256078,0.815152,0.295294,0.825758,,,,,
+1,43,a,0.300784,0.817576,0.310196,0.825758,,,,,
+1,43,letter,0.316078,0.815152,0.356078,0.825758,,,,,
+1,43,to,0.360392,0.815758,0.376471,0.825758,,,,,
+1,43,the,0.381176,0.814848,0.407059,0.825758,,,,,
+1,43,mayor,0.412549,0.817273,0.462745,0.828485,,,,,
+1,43,of,0.467451,0.814545,0.483922,0.825758,,,,,
+1,43,the,0.487843,0.814848,0.513333,0.825758,,,,,
+1,43,foreign,0.517647,0.814848,0.572941,0.828485,,,,,
+1,43,city,0.579216,0.815152,0.607059,0.828182,,,,,
+1,43,indicating,0.612549,0.815152,0.68902,0.828182,,,,,
+1,43,that,0.69451,0.814848,0.72549,0.825758,,,,,
+1,43,they,0.730196,0.815152,0.765098,0.828485,,,,,
+1,43,wish,0.769804,0.815152,0.805882,0.825758,,,,,
+1,43,to,0.811765,0.815455,0.827059,0.825758,,,,,
+1,44,remain,0.118039,0.831515,0.17098,0.842121,,,,,
+1,44,sister,0.176863,0.831515,0.220784,0.842121,,,,,
+1,44,"cities,",0.22549,0.831212,0.27098,0.843939,,,,,
+1,44,but,0.277647,0.831212,0.303137,0.842121,,,,,
+1,44,understand,0.308627,0.831515,0.398039,0.842121,,,,,
+1,44,that,0.404706,0.831212,0.435294,0.842121,,,,,
+1,44,the,0.44,0.831515,0.465882,0.842121,,,,,
+1,44,relationship,0.47098,0.831515,0.563137,0.844848,,,,,
+1,44,will,0.568627,0.831515,0.593725,0.841818,,,,,
+1,44,remain,0.599608,0.831212,0.653333,0.841818,,,,,
+1,44,inactive,0.659608,0.831212,0.720784,0.842121,,,,,
+1,44,until,0.725882,0.831515,0.758039,0.841818,,,,,
+1,44,such,0.764314,0.831515,0.801961,0.841818,,,,,
+1,44,time,0.807843,0.831212,0.841569,0.842121,,,,,
+1,44,as,0.847843,0.833939,0.86549,0.842424,,,,,
+1,45,both,0.118039,0.847273,0.153333,0.858182,,,,,
+1,45,cities,0.159216,0.847273,0.200784,0.858182,,,,,
+1,45,are,0.206275,0.850303,0.231765,0.858485,,,,,
+1,45,able,0.236471,0.847273,0.270196,0.858182,,,,,
+1,45,to,0.274902,0.847879,0.290196,0.858182,,,,,
+1,45,sustain,0.296471,0.847576,0.352549,0.858182,,,,,
+1,45,an,0.358431,0.85,0.377255,0.858485,,,,,
+1,45,active,0.383137,0.847576,0.430588,0.858182,,,,,
+1,45,relationship.,0.435686,0.847576,0.532157,0.860909,,,,,
+1,45,Sister,0.538824,0.847273,0.585098,0.858182,,,,,
+1,45,Cities,0.590196,0.847273,0.634118,0.858182,,,,,
+1,45,International,0.640784,0.847576,0.738431,0.858182,,,,,
+1,45,should,0.744314,0.847576,0.796471,0.857879,,,,,
+1,45,be,0.803529,0.847576,0.823529,0.858182,,,,,
+1,46,informed,0.118039,0.863333,0.187451,0.874242,,,,,
+1,46,in,0.19451,0.863333,0.206667,0.873939,,,,,
+1,46,writing,0.212157,0.863333,0.26549,0.87697,,,,,
+1,46,by,0.271765,0.863333,0.29098,0.87697,,,,,
+1,46,the,0.295686,0.863636,0.320784,0.873939,,,,,
+1,46,mayor,0.326275,0.865758,0.376863,0.87697,,,,,
+1,46,of,0.381569,0.863333,0.397647,0.873939,,,,,
+1,46,the,0.401961,0.863333,0.427059,0.873939,,,,,
+1,46,U.S.,0.433333,0.863333,0.46549,0.873939,,,,,
+1,46,city,0.472157,0.863333,0.500392,0.87697,,,,,
+1,46,of,0.504706,0.863333,0.521569,0.873939,,,,,
+1,46,the,0.52549,0.863333,0.55098,0.873939,,,,,
+1,46,situation.,0.556471,0.863636,0.627843,0.873939,,,,,
+1,46,Sister,0.634902,0.863333,0.680784,0.874242,,,,,
+1,46,Cities,0.685882,0.863333,0.729412,0.873939,,,,,
+1,46,International,0.735686,0.863636,0.833725,0.874242,,,,,
+1,46,will,0.839216,0.863333,0.864706,0.873939,,,,,
+2,2,SisterCities,0.169804,0.033333,0.40902,0.061515,,,,,
+2,3,Partnership,0.516078,0.027879,0.733725,0.060303,,,,,
+2,3,Agreement,0.747451,0.028182,0.957255,0.060303,,,,,
+2,4,INTERNATIONAL,0.170196,0.06697,0.408235,0.075758,,,,,
+2,5,Connect,0.169804,0.087273,0.236078,0.097576,,,,,
+2,5,globally.,0.240784,0.087273,0.301569,0.100303,,,,,
+2,5,Thrive,0.307059,0.08697,0.354118,0.097576,,,,,
+2,5,locally.,0.358824,0.087273,0.40902,0.1,,,,,
+2,6,Toolkit,0.83098,0.072727,0.958431,0.098485,,,,,
+2,7,then,0.117255,0.132727,0.151765,0.143333,,,,,
+2,7,place,0.158039,0.132727,0.201176,0.146061,,,,,
+2,7,the,0.206275,0.132727,0.231765,0.143333,,,,,
+2,7,partnership,0.237255,0.132727,0.326667,0.146061,,,,,
+2,7,into,0.333333,0.132727,0.361961,0.143333,,,,,
+2,7,Emeritus,0.368627,0.132727,0.437647,0.143333,,,,,
+2,7,Status,0.443922,0.132424,0.49451,0.143333,,,,,
+2,7,and,0.5,0.132727,0.528627,0.143333,,,,,
+2,7,will,0.53451,0.132424,0.56,0.143333,,,,,
+2,7,reflect,0.566275,0.132424,0.614902,0.143333,,,,,
+2,7,this,0.620392,0.132727,0.648627,0.143333,,,,,
+2,7,status,0.65451,0.13303,0.703137,0.143333,,,,,
+2,7,in,0.70902,0.132727,0.721569,0.143333,,,,,
+2,7,directories,0.727059,0.132727,0.811373,0.143333,,,,,
+2,7,and,0.816471,0.132727,0.845882,0.143333,,,,,
+2,7,all,0.852157,0.132727,0.86902,0.143333,,,,,
+2,8,lists,0.118039,0.148788,0.148235,0.159697,,,,,
+2,8,of,0.154118,0.148788,0.170196,0.159697,,,,,
+2,8,sister,0.174902,0.149091,0.218431,0.159697,,,,,
+2,8,city,0.223137,0.149091,0.251373,0.162424,,,,,
+2,8,programs.,0.256471,0.151515,0.336863,0.162424,,,,,
+2,9,If,0.118431,0.172424,0.127843,0.183333,,,,,
+2,9,a,0.132941,0.175455,0.141961,0.183333,,,,,
+2,9,community,0.147843,0.172727,0.236078,0.186061,,,,,
+2,9,wishes,0.241176,0.172727,0.296078,0.183333,,,,,
+2,9,to,0.301569,0.17303,0.316863,0.183333,,,,,
+2,9,terminate,0.322353,0.172727,0.402353,0.183636,,,,,
+2,9,a,0.407843,0.175152,0.417255,0.183333,,,,,
+2,9,sister,0.422353,0.172727,0.46549,0.183636,,,,,
+2,9,city,0.470588,0.172727,0.498824,0.186061,,,,,
+2,9,"relationship,",0.503922,0.172727,0.6,0.186061,,,,,
+2,9,then,0.606275,0.172727,0.640784,0.183333,,,,,
+2,9,a,0.647059,0.175152,0.656471,0.183333,,,,,
+2,9,letter,0.661961,0.172727,0.701961,0.183333,,,,,
+2,9,from,0.706667,0.172424,0.742745,0.183333,,,,,
+2,9,the,0.748627,0.172727,0.774118,0.183333,,,,,
+2,9,mayor,0.78,0.175152,0.830196,0.186061,,,,,
+2,9,or,0.834118,0.175152,0.85098,0.183333,,,,,
+2,10,highest,0.118039,0.189091,0.175686,0.202424,,,,,
+2,10,elected,0.181176,0.189091,0.238431,0.199697,,,,,
+2,10,official,0.245098,0.188788,0.296863,0.199697,,,,,
+2,10,of,0.302353,0.188788,0.318431,0.199697,,,,,
+2,10,the,0.322353,0.189091,0.348235,0.199697,,,,,
+2,10,U.S.,0.354118,0.188788,0.386275,0.199697,,,,,
+2,10,city,0.392941,0.189091,0.421176,0.202424,,,,,
+2,10,should,0.426275,0.189091,0.478431,0.199697,,,,,
+2,10,be,0.48549,0.189091,0.505098,0.199697,,,,,
+2,10,sent,0.510196,0.189394,0.543922,0.199697,,,,,
+2,10,to,0.549412,0.189394,0.564706,0.199697,,,,,
+2,10,the,0.570196,0.189091,0.595686,0.199697,,,,,
+2,10,mayor,0.601569,0.191212,0.651765,0.202424,,,,,
+2,10,of,0.656078,0.188788,0.672549,0.199697,,,,,
+2,10,the,0.676471,0.189091,0.701961,0.199697,,,,,
+2,10,sister,0.707059,0.189091,0.750196,0.199697,,,,,
+2,10,city.,0.755294,0.189091,0.787059,0.202424,,,,,
+2,10,Sister,0.794118,0.188788,0.84,0.199697,,,,,
+2,11,Cities,0.118039,0.204848,0.161961,0.215758,,,,,
+2,11,International,0.168627,0.205152,0.266275,0.215758,,,,,
+2,11,should,0.271765,0.205152,0.323922,0.215758,,,,,
+2,11,be,0.33098,0.205152,0.35098,0.215758,,,,,
+2,11,informed,0.356471,0.204848,0.42549,0.215758,,,,,
+2,11,of,0.432157,0.204848,0.448627,0.215758,,,,,
+2,11,this,0.452549,0.205152,0.480784,0.215758,,,,,
+2,11,action,0.486667,0.205152,0.535294,0.215758,,,,,
+2,11,in,0.541176,0.205152,0.554118,0.215758,,,,,
+2,11,writing,0.558824,0.205152,0.612157,0.218485,,,,,
+2,11,by,0.618431,0.205152,0.637647,0.218485,,,,,
+2,11,the,0.642745,0.205152,0.668235,0.215758,,,,,
+2,11,mayor,0.674118,0.207576,0.723922,0.218485,,,,,
+2,11,of,0.728627,0.204848,0.745098,0.215758,,,,,
+2,11,the,0.74902,0.205152,0.77451,0.215758,,,,,
+2,11,U.S.,0.780392,0.204848,0.812941,0.215758,,,,,
+2,11,city,0.819216,0.205152,0.847451,0.218485,,,,,
+2,12,and,0.117647,0.221515,0.146275,0.232121,,,,,
+2,12,Sister,0.153333,0.221212,0.199216,0.232121,,,,,
+2,12,Cities,0.203922,0.221212,0.248235,0.232121,,,,,
+2,12,International,0.255294,0.221515,0.352157,0.232121,,,,,
+2,12,will,0.357647,0.221515,0.382745,0.232121,,,,,
+2,12,then,0.388235,0.221515,0.422745,0.232121,,,,,
+2,12,remove,0.42902,0.223939,0.488235,0.232121,,,,,
+2,12,the,0.493725,0.221515,0.518824,0.232121,,,,,
+2,12,partnership,0.524706,0.221515,0.613725,0.234848,,,,,
+2,12,from,0.62,0.221212,0.656471,0.232121,,,,,
+2,12,its,0.662353,0.221515,0.680392,0.232121,,,,,
+2,12,directories,0.685882,0.221212,0.769804,0.232121,,,,,
+2,12,and,0.775294,0.221515,0.803922,0.232424,,,,,
+2,12,all,0.810196,0.221515,0.827059,0.232121,,,,,
+2,12,lists,0.834118,0.221212,0.863922,0.232121,,,,,
+2,13,of,0.117647,0.237273,0.133725,0.248182,,,,,
+2,13,sister,0.138431,0.237576,0.181961,0.248485,,,,,
+2,13,city,0.186667,0.237576,0.21451,0.250909,,,,,
+2,13,programs.,0.22,0.24,0.300392,0.250909,,,,,
+2,13,We,0.306275,0.237576,0.332549,0.248182,,,,,
+2,13,do,0.338431,0.237576,0.358431,0.248182,,,,,
+2,13,not,0.364706,0.237879,0.390196,0.248182,,,,,
+2,13,recommend,0.395294,0.237576,0.490196,0.248182,,,,,
+2,13,terminating,0.496471,0.237576,0.58549,0.250909,,,,,
+2,13,a,0.591765,0.240303,0.600784,0.248182,,,,,
+2,13,relationship,0.607059,0.237576,0.697647,0.250909,,,,,
+2,13,simply,0.704314,0.237576,0.755686,0.250909,,,,,
+2,13,because,0.761569,0.237576,0.82902,0.248182,,,,,
+2,13,it,0.834902,0.237576,0.843922,0.248182,,,,,
+2,13,is,0.849412,0.237576,0.861569,0.248182,,,,,
+2,14,dormant.,0.117647,0.253939,0.188627,0.264545,,,,,
+2,14,Many,0.196078,0.253939,0.239216,0.267273,,,,,
+2,14,partnerships,0.244706,0.253939,0.343529,0.26697,,,,,
+2,14,wax,0.34902,0.256364,0.381961,0.264545,,,,,
+2,14,and,0.387451,0.253939,0.415686,0.264545,,,,,
+2,14,wane,0.421569,0.256364,0.465098,0.264545,,,,,
+2,14,over,0.470588,0.256364,0.505882,0.264545,,,,,
+2,14,the,0.510196,0.253939,0.535686,0.264848,,,,,
+2,14,"years,",0.540392,0.256364,0.587843,0.267273,,,,,
+2,14,and,0.594118,0.253939,0.622745,0.264545,,,,,
+2,14,in,0.629804,0.253939,0.642353,0.264545,,,,,
+2,14,many,0.648627,0.256364,0.692157,0.267273,,,,,
+2,14,cases,0.697255,0.256364,0.743529,0.264545,,,,,
+2,14,a,0.749412,0.256667,0.758824,0.264545,,,,,
+2,14,dormant,0.763922,0.253939,0.83098,0.264545,,,,,
+2,15,partnership,0.118039,0.27,0.207059,0.283333,,,,,
+2,15,may,0.214118,0.272121,0.247843,0.283333,,,,,
+2,15,be,0.252941,0.27,0.272549,0.280606,,,,,
+2,15,reinvigorated,0.278431,0.27,0.381176,0.283333,,,,,
+2,15,by,0.388235,0.27,0.407059,0.283333,,,,,
+2,15,local,0.413333,0.27,0.44902,0.280606,,,,,
+2,15,members,0.455294,0.27,0.530196,0.280909,,,,,
+2,15,years,0.535294,0.272424,0.578039,0.283333,,,,,
+2,15,after,0.583922,0.27,0.62,0.280606,,,,,
+2,15,it,0.625098,0.27,0.633333,0.280606,,,,,
+2,15,has,0.639608,0.27,0.667843,0.280909,,,,,
+2,15,been,0.673333,0.27,0.712157,0.280606,,,,,
+2,15,inactive.,0.718039,0.269697,0.782745,0.280909,,,,,
+2,16,General,0.118039,0.295152,0.211373,0.311212,,,,,
+2,16,Guidelines,0.221569,0.295152,0.350196,0.311212,,,,,
+2,17,In,0.118431,0.324848,0.131373,0.335455,,,,,
+2,17,order,0.137255,0.324848,0.179608,0.335455,,,,,
+2,17,for,0.184314,0.324545,0.206667,0.335455,,,,,
+2,17,a,0.211373,0.327273,0.220784,0.335758,,,,,
+2,17,sister,0.225882,0.324848,0.269412,0.335455,,,,,
+2,17,city/county/state,0.27451,0.324242,0.408627,0.337879,,,,,
+2,17,partnership,0.414118,0.324545,0.503529,0.337879,,,,,
+2,17,to,0.509412,0.325455,0.525098,0.335455,,,,,
+2,17,be,0.53098,0.324848,0.55098,0.335455,,,,,
+2,17,recognized,0.556078,0.324545,0.643137,0.337879,,,,,
+2,17,by,0.650196,0.324545,0.669412,0.337879,,,,,
+2,17,Sister,0.675294,0.324545,0.720784,0.335455,,,,,
+2,17,Cities,0.725882,0.324545,0.770588,0.335455,,,,,
+2,17,International,0.776863,0.324848,0.873725,0.335758,,,,,
+2,18,"(SCI),",0.118039,0.340606,0.16,0.354242,,,,,
+2,18,the,0.165882,0.341212,0.191373,0.351818,,,,,
+2,18,two,0.196078,0.341515,0.225098,0.351818,,,,,
+2,18,communities,0.23098,0.340909,0.333333,0.351818,,,,,
+2,18,must,0.339216,0.341515,0.378431,0.351818,,,,,
+2,18,sign,0.383922,0.341212,0.416471,0.354242,,,,,
+2,18,formal,0.421961,0.341212,0.472157,0.351818,,,,,
+2,18,documents,0.477647,0.341212,0.567059,0.351818,,,,,
+2,18,which,0.572549,0.341212,0.619216,0.351818,,,,,
+2,18,clearly,0.625098,0.341212,0.676863,0.354545,,,,,
+2,18,endorse,0.682353,0.341212,0.747059,0.351818,,,,,
+2,18,the,0.752157,0.341212,0.777647,0.351818,,,,,
+2,18,link.,0.782745,0.340909,0.813333,0.351818,,,,,
+2,18,This,0.824706,0.341212,0.858039,0.351818,,,,,
+2,19,presumes,0.118039,0.359697,0.196078,0.370606,,,,,
+2,19,several,0.201961,0.357273,0.258039,0.367879,,,,,
+2,19,key,0.263922,0.35697,0.291765,0.370606,,,,,
+2,19,items:,0.296863,0.357273,0.343922,0.367879,,,,,
+2,19,that,0.349804,0.357273,0.380784,0.367879,,,,,
+2,19,the,0.385882,0.357273,0.411373,0.367879,,,,,
+2,19,U.S.,0.416863,0.35697,0.450196,0.367879,,,,,
+2,19,community,0.456078,0.357273,0.544706,0.370606,,,,,
+2,19,is,0.550588,0.357273,0.562353,0.367879,,,,,
+2,19,already,0.568235,0.35697,0.627059,0.370606,,,,,
+2,19,a,0.631765,0.359697,0.641176,0.367879,,,,,
+2,19,member,0.647059,0.357273,0.713333,0.367879,,,,,
+2,19,of,0.718039,0.35697,0.734118,0.367879,,,,,
+2,19,SCI,0.739216,0.35697,0.767059,0.367576,,,,,
+2,19,and,0.772941,0.357273,0.801569,0.367879,,,,,
+2,19,has,0.80902,0.357273,0.836471,0.367879,,,,,
+2,20,followed,0.117255,0.373333,0.183922,0.384242,,,,,
+2,20,proper,0.19098,0.376061,0.243922,0.38697,,,,,
+2,20,procedures,0.24902,0.373636,0.339216,0.38697,,,,,
+2,20,(e.g.,0.34549,0.373333,0.378431,0.386667,,,,,
+2,20,passed,0.385098,0.373636,0.441961,0.38697,,,,,
+2,20,a,0.448235,0.376061,0.457647,0.384242,,,,,
+2,20,city,0.463529,0.373636,0.491373,0.38697,,,,,
+2,20,council,0.496471,0.373636,0.552549,0.384242,,,,,
+2,20,resolution,0.558824,0.373636,0.636863,0.384242,,,,,
+2,20,declaring,0.642745,0.373636,0.715686,0.38697,,,,,
+2,20,the,0.721569,0.373333,0.747059,0.384242,,,,,
+2,20,intent,0.752549,0.373636,0.796471,0.384242,,,,,
+2,20,to,0.801569,0.373939,0.817255,0.384242,,,,,
+2,20,twin,0.822745,0.373636,0.855294,0.384242,,,,,
+2,21,with,0.117255,0.389697,0.149804,0.4,,,,,
+2,21,the,0.155686,0.389394,0.181176,0.400303,,,,,
+2,21,specific,0.186275,0.389394,0.247451,0.402424,,,,,
+2,21,city);,0.252941,0.389394,0.289804,0.40303,,,,,
+2,21,that,0.296078,0.389697,0.326275,0.400303,,,,,
+2,21,both,0.332157,0.389394,0.367843,0.400303,,,,,
+2,21,communities,0.373725,0.389394,0.476078,0.400303,,,,,
+2,21,share,0.481569,0.389394,0.525098,0.400303,,,,,
+2,21,a,0.530588,0.392121,0.54,0.400303,,,,,
+2,21,mutual,0.545882,0.389697,0.599216,0.400303,,,,,
+2,21,commitment,0.605098,0.389394,0.705098,0.400303,,,,,
+2,21,to,0.710588,0.390303,0.726667,0.400606,,,,,
+2,21,the,0.731373,0.389697,0.756863,0.400303,,,,,
+2,21,relationship;,0.761961,0.389394,0.858039,0.40303,,,,,
+2,22,and,0.117647,0.405758,0.146275,0.416364,,,,,
+2,22,that,0.152549,0.405758,0.183529,0.416364,,,,,
+2,22,both,0.189412,0.405455,0.224314,0.416364,,,,,
+2,22,have,0.23098,0.405758,0.268235,0.416364,,,,,
+2,22,secured,0.273725,0.405455,0.336471,0.416667,,,,,
+2,22,the,0.342745,0.405758,0.368235,0.416364,,,,,
+2,22,necessary,0.374118,0.408182,0.454902,0.419091,,,,,
+2,22,support,0.460392,0.406364,0.521569,0.419091,,,,,
+2,22,structure,0.527451,0.406061,0.598431,0.416364,,,,,
+2,22,to,0.603529,0.406061,0.618824,0.416364,,,,,
+2,22,build,0.625098,0.405455,0.662353,0.416364,,,,,
+2,22,a,0.669412,0.408182,0.678431,0.416364,,,,,
+2,22,lasting,0.684314,0.405455,0.734902,0.419091,,,,,
+2,22,relationship.,0.741961,0.405455,0.836863,0.419091,,,,,
+2,22,You,0.844314,0.405758,0.876078,0.416364,,,,,
+2,23,should,0.117647,0.422121,0.170196,0.432727,,,,,
+2,23,check,0.176863,0.422121,0.224706,0.432727,,,,,
+2,23,with,0.229412,0.422121,0.262745,0.432727,,,,,
+2,23,your,0.268235,0.424545,0.303529,0.435455,,,,,
+2,23,local,0.308627,0.422121,0.344706,0.432727,,,,,
+2,23,sister,0.350588,0.422121,0.394118,0.432727,,,,,
+2,23,city,0.399216,0.422121,0.427451,0.435152,,,,,
+2,23,program,0.432549,0.424545,0.499608,0.435758,,,,,
+2,23,to,0.505098,0.422424,0.520784,0.432727,,,,,
+2,23,see,0.526667,0.424545,0.554902,0.432727,,,,,
+2,23,if,0.56,0.421818,0.569804,0.432727,,,,,
+2,23,they,0.574118,0.421818,0.608235,0.435455,,,,,
+2,23,have,0.61451,0.421818,0.651765,0.432727,,,,,
+2,23,any,0.656863,0.424545,0.685098,0.435455,,,,,
+2,23,additional,0.690588,0.421818,0.767843,0.432727,,,,,
+2,23,requirements,0.774118,0.422121,0.878824,0.435455,,,,,
+2,24,before,0.118039,0.437879,0.16902,0.448788,,,,,
+2,24,pursuing,0.17451,0.438182,0.242745,0.451515,,,,,
+2,24,a,0.24902,0.440606,0.258431,0.448788,,,,,
+2,24,sister,0.263922,0.438182,0.306667,0.448788,,,,,
+2,24,city,0.311765,0.437879,0.34,0.451515,,,,,
+2,24,relationship.,0.345098,0.437879,0.441569,0.451515,,,,,
+2,25,SCI,0.118039,0.461818,0.146275,0.472727,,,,,
+2,25,often,0.152157,0.461515,0.191765,0.472727,,,,,
+2,25,refers,0.198039,0.461515,0.242745,0.472727,,,,,
+2,25,to,0.247843,0.462424,0.263922,0.472727,,,,,
+2,25,these,0.26902,0.461818,0.312941,0.472727,,,,,
+2,25,agreements,0.318039,0.462424,0.412549,0.475455,,,,,
+2,25,as,0.418431,0.464242,0.436471,0.472727,,,,,
+2,25,a,0.441961,0.464242,0.45098,0.47303,,,,,
+2,25,"""Sister",0.457255,0.461515,0.510588,0.472727,,,,,
+2,25,City,0.515686,0.461515,0.546275,0.475455,,,,,
+2,25,"Agreement""",0.55098,0.461515,0.645882,0.475455,,,,,
+2,25,or,0.652549,0.464242,0.669412,0.472727,,,,,
+2,25,"""Memorandum",0.674118,0.461515,0.793333,0.472727,,,,,
+2,25,of,0.799216,0.461515,0.815294,0.472727,,,,,
+2,26,"Understanding.""",0.118039,0.477576,0.248627,0.491515,,,,,
+2,26,"However,",0.259608,0.478182,0.333333,0.490909,,,,,
+2,26,as,0.34,0.480606,0.358039,0.488788,,,,,
+2,26,the,0.363137,0.477879,0.388627,0.488788,,,,,
+2,26,following,0.393333,0.477879,0.464314,0.491515,,,,,
+2,26,examples,0.470588,0.478182,0.547059,0.491515,,,,,
+2,26,"show,",0.552941,0.478182,0.599608,0.490909,,,,,
+2,26,the,0.60549,0.478182,0.63098,0.488788,,,,,
+2,26,actual,0.636078,0.478182,0.683922,0.488788,,,,,
+2,26,name,0.690196,0.480606,0.733333,0.488788,,,,,
+2,26,and,0.739216,0.478182,0.767843,0.488788,,,,,
+2,26,format,0.774118,0.477879,0.826275,0.488788,,,,,
+2,26,of,0.831765,0.477879,0.848235,0.488788,,,,,
+2,27,your,0.117255,0.496667,0.153333,0.507879,,,,,
+2,27,documents,0.158431,0.494242,0.247843,0.505152,,,,,
+2,27,is,0.253725,0.494242,0.26549,0.505152,,,,,
+2,27,left,0.271765,0.494242,0.295294,0.505152,,,,,
+2,27,up,0.301176,0.49697,0.32,0.507879,,,,,
+2,27,to,0.325882,0.494848,0.341176,0.505152,,,,,
+2,27,you.,0.347059,0.49697,0.38,0.507879,,,,,
+2,28,A,0.117255,0.517879,0.129412,0.528485,,,,,
+2,28,few,0.134118,0.517879,0.162353,0.528485,,,,,
+2,28,things,0.167451,0.517879,0.216078,0.531515,,,,,
+2,28,to,0.221176,0.518485,0.236863,0.528788,,,,,
+2,28,keep,0.243137,0.517879,0.280784,0.531212,,,,,
+2,28,in,0.287451,0.517879,0.3,0.528788,,,,,
+2,28,mind,0.306275,0.517879,0.343922,0.528788,,,,,
+2,28,as,0.35098,0.520606,0.36902,0.528788,,,,,
+2,28,you,0.374118,0.520606,0.402745,0.531515,,,,,
+2,28,draft,0.408627,0.517879,0.44549,0.528788,,,,,
+2,28,your,0.45098,0.520606,0.486667,0.531515,,,,,
+2,28,agreement:,0.491373,0.518485,0.580392,0.531818,,,,,
+2,29,Your,0.176471,0.542424,0.21451,0.553333,,,,,
+2,29,agreement,0.219608,0.543333,0.305098,0.556061,,,,,
+2,29,can,0.310588,0.544848,0.338431,0.553333,,,,,
+2,29,range,0.345098,0.544848,0.389412,0.556061,,,,,
+2,29,from,0.394902,0.542121,0.43098,0.553333,,,,,
+2,29,the,0.436863,0.542424,0.461961,0.553333,,,,,
+2,29,"ceremonial,",0.467843,0.542424,0.56,0.555152,,,,,
+2,29,with,0.56549,0.542424,0.598039,0.55303,,,,,
+2,29,language,0.604706,0.542424,0.677255,0.556061,,,,,
+2,29,focusing,0.682353,0.542121,0.750196,0.556061,,,,,
+2,29,on,0.756471,0.544848,0.775686,0.553333,,,,,
+2,29,each,0.781961,0.542424,0.819216,0.553333,,,,,
+2,29,city's,0.825882,0.542424,0.867451,0.556061,,,,,
+2,30,commitment,0.176471,0.558788,0.276863,0.569394,,,,,
+2,30,to,0.281961,0.559091,0.297255,0.569394,,,,,
+2,30,fostering,0.303137,0.558485,0.372157,0.572121,,,,,
+2,30,"understanding,",0.378824,0.558788,0.497647,0.571818,,,,,
+2,30,"cooperation,",0.503922,0.558788,0.603529,0.571818,,,,,
+2,30,and,0.610196,0.558485,0.638431,0.569394,,,,,
+2,30,mutual,0.645882,0.558788,0.699216,0.569394,,,,,
+2,30,benefit,0.70549,0.558485,0.759608,0.569394,,,,,
+2,30,to,0.764706,0.559091,0.780392,0.569394,,,,,
+2,30,the,0.78549,0.558485,0.811373,0.569394,,,,,
+2,30,"precise,",0.816863,0.558788,0.878824,0.571818,,,,,
+2,31,with,0.176078,0.575152,0.208627,0.585758,,,,,
+2,31,particular,0.215686,0.574848,0.290196,0.588485,,,,,
+2,31,areas,0.294902,0.577576,0.338039,0.585758,,,,,
+2,31,of,0.343922,0.574848,0.36,0.585758,,,,,
+2,31,"interest,",0.364314,0.575152,0.427059,0.587576,,,,,
+2,31,specific,0.434118,0.574848,0.49451,0.588182,,,,,
+2,31,"programs/activities,",0.500784,0.574848,0.657647,0.588485,,,,,
+2,31,or,0.664314,0.577576,0.680784,0.585758,,,,,
+2,31,more,0.685882,0.577576,0.726275,0.585758,,,,,
+2,31,concrete,0.732157,0.575455,0.801569,0.586061,,,,,
+2,31,goals,0.807059,0.575152,0.849804,0.588485,,,,,
+2,32,related,0.176863,0.591515,0.230196,0.602121,,,,,
+2,32,to,0.236078,0.591818,0.252157,0.602121,,,,,
+2,32,anything,0.258039,0.591212,0.325098,0.604545,,,,,
+2,32,from,0.33098,0.591212,0.367059,0.602121,,,,,
+2,32,numbers,0.373333,0.591212,0.443529,0.602121,,,,,
+2,32,of,0.448627,0.591212,0.465098,0.602121,,,,,
+2,32,exchanges,0.469412,0.591515,0.556078,0.604848,,,,,
+2,32,to,0.560784,0.591818,0.576863,0.602121,,,,,
+2,32,economic,0.582745,0.591515,0.660392,0.602424,,,,,
+2,32,development.,0.666667,0.591515,0.773725,0.604545,,,,,
+2,33,Don't,0.177255,0.620606,0.22,0.631515,,,,,
+2,33,try,0.224706,0.621212,0.24549,0.634242,,,,,
+2,33,to,0.250588,0.621212,0.266275,0.631515,,,,,
+2,33,include,0.272157,0.620606,0.32902,0.631515,,,,,
+2,33,everything,0.335294,0.620606,0.417255,0.634242,,,,,
+2,33,you,0.422353,0.623333,0.451373,0.634242,,,,,
+2,33,plan,0.457255,0.620606,0.490196,0.633939,,,,,
+2,33,to,0.496078,0.621212,0.511765,0.631515,,,,,
+2,33,do.,0.517647,0.620606,0.542353,0.631515,,,,,
+2,33,Some,0.548627,0.620303,0.594902,0.631515,,,,,
+2,33,"specifics,",0.600392,0.620303,0.67451,0.633939,,,,,
+2,33,like,0.681961,0.620303,0.707843,0.631515,,,,,
+2,33,particular,0.713333,0.620606,0.788235,0.633939,,,,,
+2,33,areas,0.793333,0.62303,0.836471,0.631515,,,,,
+2,33,of,0.841961,0.620606,0.858431,0.631515,,,,,
+2,34,interest,0.176471,0.63697,0.236078,0.647273,,,,,
+2,34,or,0.241176,0.639394,0.258039,0.647576,,,,,
+2,34,participating,0.262745,0.636667,0.361569,0.650606,,,,,
+2,34,institutions,0.367843,0.63697,0.453725,0.647879,,,,,
+2,34,are,0.459608,0.639394,0.484706,0.647576,,,,,
+2,34,good,0.489804,0.636667,0.530588,0.65,,,,,
+2,34,to,0.536078,0.637576,0.552157,0.647576,,,,,
+2,34,include.,0.558039,0.636667,0.619216,0.647576,,,,,
+2,34,"However,",0.626275,0.636667,0.700392,0.649091,,,,,
+2,34,there's,0.706275,0.63697,0.760784,0.647576,,,,,
+2,34,no,0.767059,0.639394,0.786275,0.647879,,,,,
+2,34,need,0.792549,0.63697,0.831373,0.647273,,,,,
+2,34,to,0.836471,0.637576,0.852549,0.647576,,,,,
+2,35,include,0.176863,0.65303,0.233725,0.663939,,,,,
+2,35,all,0.239216,0.65303,0.255686,0.663939,,,,,
+2,35,the,0.261176,0.653333,0.287059,0.663939,,,,,
+2,35,programs,0.292549,0.655455,0.368627,0.66697,,,,,
+2,35,you,0.373725,0.655758,0.402745,0.666061,,,,,
+2,35,plan,0.40902,0.653333,0.441961,0.666364,,,,,
+2,35,to,0.447451,0.653939,0.463137,0.663333,,,,,
+2,35,do,0.46902,0.65303,0.48902,0.663939,,,,,
+2,35,if,0.495294,0.65303,0.504706,0.663636,,,,,
+2,35,it,0.509804,0.65303,0.518431,0.663636,,,,,
+2,35,makes,0.523922,0.653333,0.575686,0.663636,,,,,
+2,35,the,0.581176,0.65303,0.607059,0.663939,,,,,
+2,35,document,0.612157,0.653333,0.692941,0.663939,,,,,
+2,35,too,0.697647,0.653636,0.723529,0.663636,,,,,
+2,35,lengthy,0.729412,0.65303,0.787843,0.66697,,,,,
+2,35,or,0.793333,0.655758,0.809804,0.663939,,,,,
+2,35,limits,0.81451,0.652727,0.855686,0.663939,,,,,
+2,36,the,0.176078,0.669091,0.201569,0.679697,,,,,
+2,36,scope,0.207059,0.671515,0.256078,0.682424,,,,,
+2,36,of,0.261176,0.668788,0.277255,0.679697,,,,,
+2,36,projects.,0.281961,0.669394,0.349804,0.682727,,,,,
+2,36,This,0.356471,0.669091,0.388627,0.679697,,,,,
+2,36,is,0.394902,0.669091,0.406667,0.679697,,,,,
+2,36,a,0.412549,0.672121,0.421961,0.68,,,,,
+2,36,formal,0.427059,0.669091,0.477255,0.679697,,,,,
+2,36,document,0.482745,0.669091,0.563137,0.68,,,,,
+2,36,to,0.568627,0.669697,0.584314,0.679697,,,,,
+2,36,establish,0.590196,0.669091,0.660784,0.679697,,,,,
+2,36,the,0.665882,0.669091,0.691765,0.679697,,,,,
+2,36,relationship;,0.696863,0.669091,0.793725,0.682727,,,,,
+2,36,specific,0.800392,0.669091,0.861176,0.682727,,,,,
+2,37,"tasks,",0.176078,0.685758,0.222353,0.698182,,,,,
+2,37,"responsibilities,",0.22902,0.685758,0.351765,0.698788,,,,,
+2,37,or,0.358431,0.687879,0.374902,0.696061,,,,,
+2,37,other,0.379608,0.685758,0.421569,0.696061,,,,,
+2,37,nuts-and-bolts,0.426667,0.685455,0.544706,0.696061,,,,,
+2,37,text,0.549412,0.685758,0.579608,0.696364,,,,,
+2,37,related,0.585882,0.685758,0.638824,0.696364,,,,,
+2,37,to,0.64549,0.686364,0.661176,0.696364,,,,,
+2,37,implementation,0.666667,0.685758,0.789412,0.698788,,,,,
+2,37,or,0.794902,0.688182,0.811765,0.696061,,,,,
+2,38,administration,0.176471,0.701515,0.28902,0.712727,,,,,
+2,38,of,0.295294,0.701515,0.311373,0.712424,,,,,
+2,38,the,0.315294,0.701515,0.340784,0.712121,,,,,
+2,38,partnership,0.345882,0.701515,0.436471,0.715152,,,,,
+2,38,can,0.442353,0.704242,0.470588,0.712727,,,,,
+2,38,be,0.476863,0.701818,0.496863,0.712121,,,,,
+2,38,expressed,0.502353,0.701818,0.583922,0.715152,,,,,
+2,38,more,0.59098,0.704545,0.63098,0.712424,,,,,
+2,38,fully,0.636471,0.701212,0.668627,0.714848,,,,,
+2,38,in,0.67451,0.701515,0.686667,0.712121,,,,,
+2,38,a,0.692941,0.704545,0.701961,0.712727,,,,,
+2,38,separate,0.707843,0.701818,0.776863,0.715152,,,,,
+2,39,memorandum,0.177255,0.717879,0.287843,0.728182,,,,,
+2,39,between,0.294118,0.717576,0.361569,0.728485,,,,,
+2,39,the,0.367059,0.717879,0.392549,0.728182,,,,,
+2,39,respective,0.397647,0.717879,0.48,0.731212,,,,,
+2,39,sister,0.485882,0.717879,0.528627,0.728182,,,,,
+2,39,city,0.534118,0.717879,0.562353,0.731212,,,,,
+2,39,committees.,0.567059,0.717879,0.664314,0.728182,,,,,
+2,39,Your,0.67098,0.717576,0.709804,0.728182,,,,,
+2,39,partnership,0.714902,0.717576,0.804314,0.731212,,,,,
+2,40,agreement,0.176471,0.734545,0.261569,0.747576,,,,,
+2,40,is,0.267843,0.733939,0.28,0.744545,,,,,
+2,40,a,0.28549,0.73697,0.294902,0.745152,,,,,
+2,40,historical,0.300392,0.733939,0.371373,0.744545,,,,,
+2,40,document,0.376863,0.733939,0.458039,0.744545,,,,,
+2,40,and,0.463137,0.733939,0.492157,0.744545,,,,,
+2,40,should,0.498431,0.733939,0.550588,0.744545,,,,,
+2,40,not,0.558039,0.734242,0.582745,0.744545,,,,,
+2,40,be,0.588627,0.733939,0.608627,0.744545,,,,,
+2,40,dated,0.613725,0.733939,0.658431,0.744545,,,,,
+2,40,or,0.665098,0.736667,0.681569,0.744848,,,,,
+2,40,limited,0.686275,0.733939,0.737647,0.744545,,,,,
+2,40,by,0.744706,0.733636,0.763922,0.747576,,,,,
+2,40,being,0.770196,0.733939,0.812549,0.747576,,,,,
+2,40,aligned,0.819216,0.733939,0.875686,0.747576,,,,,
+2,41,with,0.176078,0.750606,0.208627,0.760909,,,,,
+2,41,very,0.21451,0.752727,0.248235,0.763939,,,,,
+2,41,specific,0.253333,0.750606,0.314118,0.763939,,,,,
+2,41,tasks.,0.319608,0.750909,0.366275,0.761212,,,,,
+2,42,Work,0.176078,0.779697,0.218039,0.790606,,,,,
+2,42,with,0.223529,0.78,0.256471,0.790303,,,,,
+2,42,your,0.261569,0.782424,0.297647,0.79303,,,,,
+2,42,counterparts.,0.302745,0.780303,0.407843,0.793333,,,,,
+2,42,Remember,0.414902,0.779697,0.503137,0.790606,,,,,
+2,42,that,0.507843,0.78,0.538431,0.790606,,,,,
+2,42,this,0.543529,0.78,0.571765,0.790303,,,,,
+2,42,is,0.578039,0.78,0.589804,0.790303,,,,,
+2,42,signed,0.595686,0.78,0.647059,0.793333,,,,,
+2,42,by,0.654118,0.779697,0.673725,0.793333,,,,,
+2,42,both,0.678824,0.779697,0.71451,0.790303,,,,,
+2,42,cities.,0.720392,0.779697,0.76549,0.790606,,,,,
+2,42,You,0.771765,0.779697,0.803137,0.790606,,,,,
+2,42,should,0.80902,0.779697,0.861569,0.790303,,,,,
+2,43,share,0.176471,0.796061,0.220392,0.806364,,,,,
+2,43,drafts,0.225882,0.795758,0.271373,0.806667,,,,,
+2,43,of,0.276863,0.795758,0.292941,0.806364,,,,,
+2,43,your,0.297255,0.798485,0.332941,0.809091,,,,,
+2,43,agreement,0.337647,0.796364,0.423137,0.809394,,,,,
+2,43,with,0.428235,0.796061,0.460784,0.806364,,,,,
+2,43,your,0.466667,0.798485,0.502745,0.809394,,,,,
+2,43,international,0.507451,0.795758,0.604314,0.806667,,,,,
+2,43,partners,0.61098,0.796667,0.676471,0.809394,,,,,
+2,43,and,0.682745,0.795758,0.71098,0.806364,,,,,
+2,43,solicit,0.718039,0.795758,0.763137,0.806364,,,,,
+2,43,feedback,0.76902,0.795758,0.843137,0.806667,,,,,
+2,43,on,0.84902,0.798485,0.868235,0.806667,,,,,
+2,44,what,0.176471,0.812424,0.215294,0.82303,,,,,
+2,44,they'd,0.22,0.812424,0.269804,0.825758,,,,,
+2,44,like,0.276078,0.811818,0.301961,0.822727,,,,,
+2,44,to,0.307059,0.812727,0.323137,0.822727,,,,,
+2,44,see,0.328627,0.814848,0.356471,0.82303,,,,,
+2,44,in,0.362353,0.812424,0.374902,0.822727,,,,,
+2,44,the,0.380392,0.812424,0.405882,0.82303,,,,,
+2,44,agreement.,0.41098,0.81303,0.500392,0.826061,,,,,
+2,44,Be,0.508235,0.812121,0.528627,0.822727,,,,,
+2,44,flexible,0.533725,0.811818,0.590196,0.82303,,,,,
+2,44,to,0.594902,0.81303,0.611373,0.82303,,,,,
+2,44,cultural,0.616863,0.812424,0.675294,0.82303,,,,,
+2,44,or,0.681176,0.814848,0.697255,0.82303,,,,,
+2,44,municipal,0.702353,0.812121,0.777255,0.825758,,,,,
+2,44,priorities.,0.783529,0.812121,0.855686,0.825758,,,,,
+2,45,Ask,0.176078,0.841818,0.205882,0.852121,,,,,
+2,45,your,0.211373,0.844242,0.247059,0.854848,,,,,
+2,45,counterparts,0.252157,0.842121,0.352941,0.855152,,,,,
+2,45,to,0.358431,0.842121,0.373725,0.852424,,,,,
+2,45,translate,0.379216,0.841818,0.448627,0.852424,,,,,
+2,45,the,0.453725,0.841515,0.479216,0.852727,,,,,
+2,45,agreement,0.484706,0.842424,0.569412,0.855152,,,,,
+2,45,if,0.575294,0.841515,0.584706,0.852121,,,,,
+2,45,it,0.589804,0.841515,0.598431,0.852121,,,,,
+2,45,is,0.604314,0.841515,0.616078,0.852121,,,,,
+2,45,drafted,0.621961,0.841515,0.678039,0.852424,,,,,
+2,45,in,0.685098,0.841515,0.697647,0.852121,,,,,
+2,45,English.,0.704314,0.841515,0.765882,0.854848,,,,,
+2,45,It,0.772549,0.841515,0.781176,0.852121,,,,,
+2,45,is,0.787451,0.841818,0.799216,0.852121,,,,,
+2,46,important,0.176863,0.858182,0.252941,0.871515,,,,,
+2,46,for,0.258039,0.857576,0.280784,0.868788,,,,,
+2,46,the,0.284706,0.858182,0.310588,0.868485,,,,,
+2,46,citizens,0.315686,0.858182,0.375294,0.868788,,,,,
+2,46,of,0.381176,0.857879,0.397255,0.868485,,,,,
+2,46,your,0.401569,0.860606,0.437255,0.871515,,,,,
+2,46,partner,0.442353,0.858485,0.500392,0.871515,,,,,
+2,46,community,0.505098,0.858182,0.593333,0.871818,,,,,
+2,46,to,0.598431,0.858485,0.614118,0.868485,,,,,
+2,46,be,0.620392,0.858182,0.64,0.868788,,,,,
+2,46,able,0.64549,0.857879,0.679216,0.868788,,,,,
+2,46,to,0.683922,0.858485,0.7,0.868485,,,,,
+2,46,read,0.705882,0.858485,0.739608,0.868788,,,,,
+2,46,and,0.745882,0.858182,0.77451,0.868788,,,,,
+2,46,understand,0.781569,0.858182,0.87098,0.868788,,,,,
+2,47,the,0.176078,0.873939,0.201569,0.884545,,,,,
+2,47,commitment,0.207059,0.873939,0.307059,0.884545,,,,,
+2,47,their,0.312157,0.873939,0.347843,0.884545,,,,,
+2,47,city,0.352941,0.873939,0.381176,0.887576,,,,,
+2,47,has,0.386667,0.873939,0.414118,0.884545,,,,,
+2,47,made.,0.420392,0.874242,0.468627,0.884545,,,,,
+2,47,Have,0.475686,0.873939,0.515686,0.884545,,,,,
+2,47,someone,0.521569,0.876667,0.595294,0.885152,,,,,
+2,47,in,0.600784,0.873939,0.613725,0.884545,,,,,
+2,47,your,0.619216,0.876667,0.654902,0.887576,,,,,
+2,47,own,0.66,0.876667,0.692941,0.884848,,,,,
+2,47,community,0.698824,0.874545,0.787451,0.887576,,,,,
+2,47,who,0.792157,0.873939,0.826275,0.884545,,,,,
+3,2,SisterCities,0.169804,0.033333,0.40902,0.061515,,,,,
+3,3,Partnership,0.516078,0.027879,0.733333,0.060303,,,,,
+3,3,Agreement,0.747843,0.028182,0.957647,0.060303,,,,,
+3,4,INTERNATIONAL,0.170196,0.06697,0.407843,0.075758,,,,,
+3,5,Connect,0.169804,0.087273,0.236078,0.097576,,,,,
+3,5,globally.,0.240784,0.087273,0.301569,0.100303,,,,,
+3,5,Thrive,0.307059,0.08697,0.354118,0.097576,,,,,
+3,5,locally.,0.358824,0.087273,0.40902,0.100303,,,,,
+3,6,Toolkit,0.83098,0.07303,0.958039,0.098182,,,,,
+3,7,speaks,0.176471,0.132727,0.232941,0.146061,,,,,
+3,7,that,0.238431,0.132727,0.26902,0.143333,,,,,
+3,7,language,0.275294,0.132727,0.347451,0.146061,,,,,
+3,7,check,0.353333,0.132727,0.400784,0.143333,,,,,
+3,7,the,0.405882,0.132727,0.431765,0.143333,,,,,
+3,7,foreign-language,0.436471,0.132424,0.572549,0.146061,,,,,
+3,7,version,0.577647,0.132727,0.634902,0.143333,,,,,
+3,7,to,0.640392,0.13303,0.656471,0.143333,,,,,
+3,7,make,0.662353,0.132727,0.705098,0.143333,,,,,
+3,7,sure,0.710588,0.135152,0.745098,0.143333,,,,,
+3,7,it,0.75098,0.132727,0.759608,0.143333,,,,,
+3,7,mirrors,0.76549,0.132727,0.820784,0.143333,,,,,
+3,7,what,0.826275,0.132727,0.865098,0.143333,,,,,
+3,8,you,0.176471,0.151515,0.204706,0.162424,,,,,
+3,8,have,0.211373,0.148788,0.24902,0.159697,,,,,
+3,8,in,0.25451,0.149091,0.267059,0.159697,,,,,
+3,8,your,0.272941,0.151515,0.308235,0.162424,,,,,
+3,8,own,0.313333,0.151515,0.346275,0.159697,,,,,
+3,8,agreement.,0.352549,0.149394,0.441176,0.162424,,,,,
+3,9,Keep,0.176863,0.178485,0.216863,0.191818,,,,,
+3,9,it,0.223922,0.178788,0.232549,0.189394,,,,,
+3,9,to,0.237647,0.179091,0.252941,0.189394,,,,,
+3,9,one,0.258824,0.181212,0.288627,0.189394,,,,,
+3,9,page.,0.294118,0.181212,0.337647,0.192121,,,,,
+3,9,Ceremonial,0.344314,0.178485,0.43451,0.189394,,,,,
+3,9,documents,0.440392,0.178788,0.530196,0.189394,,,,,
+3,9,such,0.536078,0.178788,0.573725,0.189394,,,,,
+3,9,as,0.579608,0.181212,0.597647,0.189394,,,,,
+3,9,these,0.603137,0.178788,0.646667,0.189394,,,,,
+3,9,partnership,0.652549,0.178788,0.741569,0.192121,,,,,
+3,9,agreements,0.748235,0.179091,0.842353,0.192121,,,,,
+3,10,work,0.176078,0.194848,0.214902,0.205455,,,,,
+3,10,best,0.221176,0.194848,0.254902,0.205758,,,,,
+3,10,if,0.260784,0.194545,0.270196,0.205455,,,,,
+3,10,they,0.27451,0.194545,0.308627,0.208182,,,,,
+3,10,can,0.314118,0.197273,0.342353,0.205455,,,,,
+3,10,be,0.348627,0.194848,0.368627,0.205455,,,,,
+3,10,posted,0.374118,0.194848,0.428235,0.207879,,,,,
+3,10,in,0.435294,0.194848,0.447843,0.205455,,,,,
+3,10,their,0.453333,0.194545,0.488627,0.205455,,,,,
+3,10,entirety.,0.49451,0.194848,0.556863,0.208485,,,,,
+3,11,Most,0.177255,0.224242,0.216471,0.235152,,,,,
+3,11,sister,0.222353,0.224545,0.265098,0.235152,,,,,
+3,11,city,0.270588,0.224545,0.298431,0.237879,,,,,
+3,11,agreements,0.303529,0.224848,0.397647,0.237879,,,,,
+3,11,include,0.403529,0.224242,0.461176,0.235152,,,,,
+3,11,some,0.466275,0.226667,0.510588,0.235152,,,,,
+3,11,acknowledgement,0.516078,0.224242,0.661961,0.237879,,,,,
+3,11,of,0.668235,0.224242,0.684314,0.235152,,,,,
+3,11,the,0.688235,0.224242,0.713725,0.235152,,,,,
+3,11,founding,0.718431,0.224242,0.788235,0.237879,,,,,
+3,11,principles,0.794902,0.224545,0.872157,0.237879,,,,,
+3,12,of,0.176471,0.240303,0.192549,0.251212,,,,,
+3,12,the,0.196863,0.240606,0.222353,0.251212,,,,,
+3,12,sister,0.227451,0.240606,0.270196,0.251515,,,,,
+3,12,city,0.275686,0.240606,0.303922,0.253939,,,,,
+3,12,movement-,0.309804,0.241212,0.414118,0.251212,,,,,
+3,12,to,0.411765,0.240909,0.427059,0.251515,,,,,
+3,12,promote,0.432941,0.240909,0.500784,0.253939,,,,,
+3,12,peace,0.506275,0.24303,0.554902,0.253939,,,,,
+3,12,through,0.559608,0.240606,0.621569,0.253939,,,,,
+3,12,mutual,0.627843,0.240606,0.681176,0.251515,,,,,
+3,12,"respect,",0.687059,0.240909,0.750196,0.253939,,,,,
+3,12,"understanding,",0.757255,0.240606,0.875294,0.253939,,,,,
+3,13,and,0.176471,0.25697,0.205098,0.267576,,,,,
+3,13,cooperation.,0.211765,0.25697,0.31098,0.270303,,,,,
+3,14,Consider,0.176863,0.286061,0.249412,0.29697,,,,,
+3,14,using,0.25451,0.286364,0.296471,0.299697,,,,,
+3,14,official,0.302353,0.286061,0.353333,0.29697,,,,,
+3,14,letterhead,0.359608,0.286364,0.438039,0.29697,,,,,
+3,14,and/or,0.444706,0.286061,0.497647,0.29697,,,,,
+3,14,other,0.502353,0.286364,0.544314,0.29697,,,,,
+3,14,embellishments,0.54902,0.286364,0.674118,0.29697,,,,,
+3,14,such,0.679608,0.286364,0.717255,0.29697,,,,,
+3,14,as,0.723137,0.288788,0.741176,0.29697,,,,,
+3,14,city,0.747059,0.286364,0.775294,0.299697,,,,,
+3,14,seals,0.78,0.286364,0.820784,0.297273,,,,,
+3,14,or,0.826667,0.288788,0.843137,0.29697,,,,,
+3,15,logos,0.176863,0.302121,0.219608,0.315758,,,,,
+3,15,to,0.225098,0.302727,0.240392,0.31303,,,,,
+3,15,reflect,0.246667,0.302121,0.295686,0.31303,,,,,
+3,15,your,0.300784,0.304848,0.336863,0.315758,,,,,
+3,15,enhance,0.341961,0.302424,0.409804,0.31303,,,,,
+3,15,the,0.41451,0.302121,0.44,0.31303,,,,,
+3,15,document.,0.445098,0.302424,0.530196,0.31303,,,,,
+3,15,Sister,0.536863,0.302121,0.582745,0.31303,,,,,
+3,15,city,0.587451,0.302424,0.615686,0.315758,,,,,
+3,15,agreements,0.621176,0.302727,0.714902,0.315758,,,,,
+3,15,are,0.720784,0.304848,0.745882,0.31303,,,,,
+3,15,often,0.750588,0.302424,0.79098,0.31303,,,,,
+3,15,posted,0.797255,0.302424,0.852157,0.315758,,,,,
+3,15,at,0.857647,0.302727,0.872549,0.313333,,,,,
+3,16,city,0.176471,0.318788,0.204706,0.332121,,,,,
+3,16,hall,0.210196,0.318485,0.236471,0.329394,,,,,
+3,16,or,0.241961,0.321212,0.258824,0.329091,,,,,
+3,16,other,0.263922,0.318788,0.30549,0.329394,,,,,
+3,16,municipal,0.310588,0.318485,0.386275,0.331818,,,,,
+3,16,offices,0.392549,0.318485,0.445098,0.329394,,,,,
+3,16,and,0.450588,0.318485,0.478824,0.329394,,,,,
+3,16,should,0.485882,0.318788,0.538039,0.329394,,,,,
+3,16,reflect,0.545098,0.318485,0.593725,0.329394,,,,,
+3,16,their,0.599608,0.318485,0.635294,0.329394,,,,,
+3,16,historical,0.641176,0.318485,0.711373,0.329394,,,,,
+3,16,importance,0.717647,0.318788,0.807451,0.331818,,,,,
+3,17,Look,0.177255,0.347879,0.215686,0.358788,,,,,
+3,17,at,0.221569,0.348485,0.236078,0.358788,,,,,
+3,17,other,0.241569,0.347879,0.283529,0.358788,,,,,
+3,17,agreements,0.288235,0.348485,0.381961,0.361515,,,,,
+3,17,your,0.387843,0.350606,0.423529,0.361515,,,,,
+3,17,city,0.428627,0.348182,0.456863,0.361515,,,,,
+3,17,has,0.462353,0.347879,0.490196,0.358788,,,,,
+3,17,signed.,0.495294,0.347879,0.552157,0.361515,,,,,
+3,17,These,0.558824,0.347879,0.607059,0.358788,,,,,
+3,17,agreements,0.612549,0.348788,0.706275,0.361515,,,,,
+3,17,may,0.712549,0.350606,0.745882,0.361818,,,,,
+3,17,give,0.75098,0.347879,0.783922,0.361515,,,,,
+3,17,you,0.78902,0.350606,0.817255,0.361515,,,,,
+3,17,an,0.823529,0.350606,0.841961,0.358788,,,,,
+3,17,idea,0.848235,0.347879,0.882353,0.358788,,,,,
+3,18,of,0.176471,0.364242,0.192549,0.375152,,,,,
+3,18,what,0.196863,0.364242,0.235686,0.375152,,,,,
+3,18,is,0.241176,0.364545,0.253333,0.375152,,,,,
+3,18,acceptable,0.258824,0.364242,0.347059,0.377576,,,,,
+3,18,or,0.352549,0.36697,0.369412,0.375152,,,,,
+3,18,"possible,",0.374118,0.364242,0.444706,0.377576,,,,,
+3,18,and,0.451373,0.364242,0.479608,0.375152,,,,,
+3,18,they,0.486275,0.364242,0.520392,0.377879,,,,,
+3,18,may,0.526275,0.36697,0.559216,0.377879,,,,,
+3,18,be,0.565098,0.364242,0.584706,0.375152,,,,,
+3,18,in,0.590588,0.364545,0.603137,0.374848,,,,,
+3,18,an,0.60902,0.36697,0.627843,0.375152,,,,,
+3,18,easily,0.634118,0.364242,0.678431,0.377879,,,,,
+3,18,replicable,0.683922,0.364242,0.761176,0.377879,,,,,
+3,18,format.,0.765882,0.364242,0.822353,0.375152,,,,,
+3,18,If,0.829804,0.364242,0.839608,0.375152,,,,,
+3,18,you,0.843922,0.36697,0.872157,0.377879,,,,,
+3,19,cannot,0.176863,0.380909,0.23098,0.391212,,,,,
+3,19,access,0.236863,0.38303,0.292549,0.391515,,,,,
+3,19,older,0.298431,0.380303,0.339216,0.391212,,,,,
+3,19,agreements,0.344314,0.380909,0.437255,0.393939,,,,,
+3,19,please,0.443922,0.380303,0.496078,0.393939,,,,,
+3,19,contact,0.501176,0.380909,0.561569,0.391212,,,,,
+3,19,Sister,0.567059,0.380303,0.612941,0.391212,,,,,
+3,19,Cities,0.618039,0.380303,0.662353,0.391212,,,,,
+3,19,"International,",0.66902,0.380606,0.770588,0.393333,,,,,
+3,19,we,0.776863,0.38303,0.800392,0.391212,,,,,
+3,19,may,0.805882,0.38303,0.84,0.393939,,,,,
+3,20,have,0.176863,0.396667,0.21451,0.407576,,,,,
+3,20,them,0.219216,0.39697,0.259216,0.407576,,,,,
+3,20,on,0.26549,0.399394,0.285098,0.407576,,,,,
+3,20,"file,",0.290196,0.396667,0.317255,0.409394,,,,,
+3,20,although,0.323922,0.39697,0.392157,0.410303,,,,,
+3,20,we,0.398039,0.399394,0.421569,0.407576,,,,,
+3,20,do,0.426667,0.39697,0.446667,0.407879,,,,,
+3,20,not,0.453333,0.397273,0.478039,0.407576,,,,,
+3,20,have,0.484706,0.39697,0.521569,0.407879,,,,,
+3,20,copies,0.526667,0.39697,0.579608,0.41,,,,,
+3,20,of,0.585098,0.396667,0.601569,0.407576,,,,,
+3,20,all,0.605882,0.39697,0.622745,0.407576,,,,,
+3,20,partnership,0.628627,0.396667,0.718431,0.41,,,,,
+3,20,agreements.,0.724706,0.397576,0.822745,0.410606,,,,,
+3,21,Documents,0.177255,0.426364,0.268235,0.437273,,,,,
+3,21,must,0.274118,0.42697,0.313333,0.437273,,,,,
+3,21,be,0.319216,0.426667,0.338824,0.437273,,,,,
+3,21,signed,0.344314,0.426364,0.395686,0.439697,,,,,
+3,21,by,0.402745,0.426364,0.421961,0.439697,,,,,
+3,21,the,0.426667,0.426667,0.452157,0.437273,,,,,
+3,21,top,0.456863,0.42697,0.482745,0.44,,,,,
+3,21,elected,0.489412,0.426667,0.546275,0.437273,,,,,
+3,21,official,0.552941,0.426364,0.604314,0.437273,,,,,
+3,21,of,0.609804,0.426364,0.626275,0.437273,,,,,
+3,21,both,0.63098,0.426364,0.665882,0.437273,,,,,
+3,21,communities.,0.672549,0.426667,0.779216,0.437273,,,,,
+3,22,Check,0.176863,0.455758,0.227451,0.466667,,,,,
+3,22,with,0.232549,0.456061,0.26549,0.466667,,,,,
+3,22,your,0.271373,0.458485,0.306667,0.469394,,,,,
+3,22,"mayor,",0.311765,0.458485,0.36549,0.469091,,,,,
+3,22,city,0.371765,0.456061,0.4,0.469394,,,,,
+3,22,"council,",0.405098,0.456061,0.465882,0.468788,,,,,
+3,22,town,0.471765,0.456364,0.511373,0.466667,,,,,
+3,22,"clerk,",0.517255,0.456061,0.559216,0.468485,,,,,
+3,22,et,0.565882,0.456667,0.580392,0.466667,,,,,
+3,22,al.,0.586275,0.456061,0.602745,0.466667,,,,,
+3,22,to,0.609412,0.456364,0.62549,0.466667,,,,,
+3,22,make,0.631373,0.456061,0.67451,0.466667,,,,,
+3,22,sure,0.68,0.458182,0.714118,0.466667,,,,,
+3,22,that,0.718824,0.456061,0.749804,0.466667,,,,,
+3,22,the,0.755294,0.455758,0.780392,0.466667,,,,,
+3,22,agreement,0.785882,0.456364,0.871373,0.469394,,,,,
+3,23,is,0.176863,0.472121,0.18902,0.482727,,,,,
+3,23,OK,0.194902,0.471818,0.219608,0.482727,,,,,
+3,23,with,0.224314,0.471818,0.257255,0.482727,,,,,
+3,23,them.,0.263137,0.471818,0.307451,0.482727,,,,,
+3,23,The,0.313725,0.471818,0.343529,0.482727,,,,,
+3,23,mayor,0.349412,0.474545,0.399216,0.485455,,,,,
+3,23,is,0.404314,0.472121,0.416078,0.482727,,,,,
+3,23,the,0.421569,0.472121,0.447059,0.48303,,,,,
+3,23,one,0.452157,0.474545,0.481961,0.482727,,,,,
+3,23,putting,0.487451,0.472121,0.542745,0.485455,,,,,
+3,23,his,0.54902,0.471818,0.570588,0.482727,,,,,
+3,23,or,0.576471,0.474545,0.593333,0.482727,,,,,
+3,23,her,0.598824,0.471818,0.623922,0.482727,,,,,
+3,23,name,0.62902,0.474545,0.672549,0.482727,,,,,
+3,23,on,0.678039,0.474545,0.697647,0.482727,,,,,
+3,23,the,0.703137,0.472121,0.728627,0.482727,,,,,
+3,23,"paper,",0.734118,0.474242,0.783922,0.485152,,,,,
+3,23,and,0.790588,0.472121,0.818824,0.482727,,,,,
+3,23,you,0.82549,0.474545,0.85451,0.485455,,,,,
+3,24,don't,0.176863,0.488485,0.218039,0.499394,,,,,
+3,24,want,0.223137,0.488788,0.261961,0.499091,,,,,
+3,24,to,0.267059,0.488788,0.282745,0.499091,,,,,
+3,24,spend,0.288235,0.488182,0.337255,0.501515,,,,,
+3,24,time,0.342745,0.488485,0.377255,0.499091,,,,,
+3,24,developing,0.383137,0.488485,0.470196,0.501818,,,,,
+3,24,an,0.476471,0.490909,0.49451,0.499091,,,,,
+3,24,agreement,0.501176,0.488788,0.586275,0.502121,,,,,
+3,24,which,0.591765,0.488485,0.638039,0.499091,,,,,
+3,24,will,0.644314,0.488182,0.669412,0.498788,,,,,
+3,24,never,0.675294,0.490909,0.719216,0.499091,,,,,
+3,24,be,0.724706,0.488485,0.744314,0.499091,,,,,
+3,24,signed.,0.749804,0.488182,0.806275,0.501818,,,,,
+3,25,Official,0.176863,0.517576,0.23098,0.528485,,,,,
+3,25,documents,0.236863,0.517576,0.326275,0.528485,,,,,
+3,25,are,0.332157,0.520303,0.357255,0.528485,,,,,
+3,25,usually,0.362745,0.517879,0.417647,0.531212,,,,,
+3,25,signed,0.422745,0.517576,0.475294,0.531212,,,,,
+3,25,during,0.481569,0.517576,0.531765,0.531212,,,,,
+3,25,a,0.537647,0.520303,0.546667,0.528485,,,,,
+3,25,formal,0.551765,0.517576,0.601961,0.528485,,,,,
+3,25,ceremony,0.607843,0.520303,0.687059,0.531212,,,,,
+3,25,recognizing,0.692941,0.517879,0.783922,0.531212,,,,,
+3,25,the,0.789804,0.517879,0.815294,0.528788,,,,,
+3,26,partnership.,0.177255,0.533939,0.27098,0.547576,,,,,
+3,26,Be,0.283137,0.534242,0.304314,0.544848,,,,,
+3,26,sure,0.309804,0.536667,0.343922,0.544848,,,,,
+3,26,both,0.349412,0.533939,0.384706,0.544848,,,,,
+3,26,communities,0.39098,0.533939,0.492549,0.544848,,,,,
+3,26,receive,0.498824,0.533939,0.555294,0.544848,,,,,
+3,26,a,0.561176,0.536667,0.570588,0.544848,,,,,
+3,26,signed,0.575686,0.533939,0.627843,0.547273,,,,,
+3,26,set,0.63451,0.534545,0.658431,0.544848,,,,,
+3,26,of,0.663922,0.533939,0.68,0.544848,,,,,
+3,26,the,0.683922,0.533939,0.709412,0.544848,,,,,
+3,26,official,0.71451,0.533939,0.765882,0.544848,,,,,
+3,26,documents,0.771765,0.534242,0.861176,0.544848,,,,,
+3,27,for,0.176078,0.550606,0.198431,0.561212,,,,,
+3,27,their,0.202745,0.550606,0.238824,0.561212,,,,,
+3,27,records.,0.243922,0.550606,0.307843,0.561212,,,,,
+3,28,Remember,0.177255,0.579697,0.265098,0.590606,,,,,
+3,28,to,0.269804,0.580606,0.285098,0.590606,,,,,
+3,28,send,0.29098,0.58,0.328627,0.590606,,,,,
+3,28,your,0.334902,0.582424,0.370588,0.593333,,,,,
+3,28,signed,0.375294,0.58,0.427059,0.59303,,,,,
+3,28,agreement,0.434118,0.580606,0.519216,0.593333,,,,,
+3,28,to,0.524706,0.580606,0.54,0.590606,,,,,
+3,28,Sister,0.546275,0.579697,0.591765,0.590606,,,,,
+3,28,Cities,0.596863,0.579697,0.641176,0.590606,,,,,
+3,28,International.,0.647843,0.58,0.749412,0.590909,,,,,
+3,28,After,0.755686,0.579697,0.794902,0.590606,,,,,
+3,28,we,0.799216,0.582424,0.822745,0.590606,,,,,
+3,29,receive,0.176863,0.596061,0.233333,0.606667,,,,,
+3,29,your,0.238824,0.598485,0.27451,0.609394,,,,,
+3,29,agreement,0.279608,0.596364,0.364706,0.609697,,,,,
+3,29,we,0.369412,0.598485,0.392941,0.606667,,,,,
+3,29,will,0.398039,0.596061,0.423137,0.606667,,,,,
+3,29,post,0.429412,0.596667,0.463529,0.609394,,,,,
+3,29,the,0.46902,0.595758,0.49451,0.606667,,,,,
+3,29,relationship,0.5,0.596061,0.591765,0.609091,,,,,
+3,29,in,0.598039,0.596061,0.610588,0.606667,,,,,
+3,29,the,0.616471,0.595758,0.641961,0.606667,,,,,
+3,29,City,0.647451,0.595758,0.678431,0.609394,,,,,
+3,29,Directory,0.684314,0.595758,0.756471,0.609394,,,,,
+3,29,and,0.761569,0.596061,0.790196,0.606667,,,,,
+3,29,make,0.797255,0.596061,0.840784,0.606667,,,,,
+3,29,sure,0.84549,0.598182,0.88,0.606667,,,,,
+3,30,it,0.176863,0.612121,0.18549,0.62303,,,,,
+3,30,is,0.191373,0.612424,0.203529,0.62303,,,,,
+3,30,included,0.20902,0.612121,0.275686,0.62303,,,,,
+3,30,in,0.283137,0.612424,0.295686,0.62303,,,,,
+3,30,our,0.301569,0.614848,0.328235,0.62303,,,,,
+3,30,Annual,0.332549,0.612424,0.387059,0.62303,,,,,
+3,30,Membership,0.393725,0.612121,0.492157,0.625758,,,,,
+3,30,Directory.,0.498824,0.612121,0.574902,0.625758,,,,,
+3,31,Remember,0.118431,0.640606,0.206667,0.651818,,,,,
+3,31,that,0.21098,0.640909,0.241961,0.651515,,,,,
+3,31,each,0.247451,0.640909,0.28549,0.651818,,,,,
+3,31,city's,0.291765,0.640909,0.333333,0.654545,,,,,
+3,31,sister,0.338431,0.640909,0.381961,0.651818,,,,,
+3,31,city,0.387059,0.640909,0.415294,0.654848,,,,,
+3,31,program,0.420392,0.64303,0.487059,0.654545,,,,,
+3,31,is,0.493725,0.640909,0.505882,0.651515,,,,,
+3,31,independent,0.511765,0.640909,0.611373,0.654545,,,,,
+3,31,and,0.616863,0.640606,0.645882,0.651818,,,,,
+3,31,can,0.652549,0.643333,0.680392,0.652121,,,,,
+3,31,impose,0.687059,0.640909,0.745098,0.654545,,,,,
+3,31,requirements,0.75098,0.640909,0.854902,0.654242,,,,,
+3,32,like,0.118039,0.65697,0.143922,0.667879,,,,,
+3,32,the,0.14902,0.65697,0.17451,0.667879,,,,,
+3,32,establishment,0.18,0.65697,0.291373,0.667576,,,,,
+3,32,of,0.296863,0.65697,0.313333,0.667879,,,,,
+3,32,a,0.317647,0.659697,0.327059,0.668182,,,,,
+3,32,"committee,",0.332549,0.657576,0.421176,0.669394,,,,,
+3,32,a,0.427451,0.66,0.436863,0.668182,,,,,
+3,32,review,0.442353,0.65697,0.493333,0.667879,,,,,
+3,32,"period,",0.499608,0.657273,0.554118,0.670303,,,,,
+3,32,sustainability/funding,0.560392,0.65697,0.729804,0.670606,,,,,
+3,32,"plan,",0.736078,0.65697,0.773333,0.670606,,,,,
+3,32,among,0.78,0.659394,0.833725,0.670606,,,,,
+3,33,"others,",0.117647,0.673333,0.172157,0.686364,,,,,
+3,33,before,0.178824,0.67303,0.229804,0.684242,,,,,
+3,33,sanctioning,0.235294,0.673636,0.326667,0.68697,,,,,
+3,33,a,0.332549,0.675758,0.341961,0.684545,,,,,
+3,33,sister,0.347451,0.673333,0.390588,0.684242,,,,,
+3,33,city,0.396078,0.673333,0.423922,0.68697,,,,,
+3,33,agreement.,0.42902,0.673939,0.518039,0.68697,,,,,
+3,33,Check,0.524706,0.672727,0.576078,0.684242,,,,,
+3,33,with,0.581176,0.673333,0.614118,0.683939,,,,,
+3,33,your,0.619216,0.675758,0.655294,0.686667,,,,,
+3,33,local,0.660392,0.67303,0.696471,0.684242,,,,,
+3,33,program,0.703137,0.675455,0.769804,0.68697,,,,,
+3,33,or,0.776078,0.676061,0.792157,0.684242,,,,,
+3,33,mayor's,0.797647,0.673636,0.861176,0.68697,,,,,
+3,34,office,0.117647,0.689394,0.161569,0.700303,,,,,
+3,34,to,0.166275,0.69,0.181961,0.7,,,,,
+3,34,see,0.187843,0.692121,0.216078,0.700303,,,,,
+3,34,if,0.221569,0.689091,0.23098,0.7,,,,,
+3,34,this,0.234902,0.689394,0.263529,0.700303,,,,,
+3,34,is,0.269412,0.689394,0.281176,0.700303,,,,,
+3,34,the,0.286667,0.689394,0.312157,0.7,,,,,
+3,34,case.,0.317255,0.692121,0.359216,0.700909,,,,,
+3,35,On,0.118039,0.717879,0.140392,0.728788,,,,,
+3,35,the,0.145882,0.718182,0.171373,0.728788,,,,,
+3,35,following,0.176471,0.718182,0.247059,0.732121,,,,,
+3,35,pages,0.253725,0.720606,0.301961,0.731818,,,,,
+3,35,you'll,0.307451,0.718182,0.349412,0.732121,,,,,
+3,35,find,0.35451,0.718182,0.383922,0.729091,,,,,
+3,35,a,0.390196,0.720909,0.399216,0.729091,,,,,
+3,35,series,0.405098,0.718485,0.45098,0.728788,,,,,
+3,35,of,0.457255,0.718182,0.472549,0.729091,,,,,
+3,35,partnership,0.478039,0.718485,0.567843,0.731818,,,,,
+3,35,agreements,0.574118,0.719091,0.667451,0.731818,,,,,
+3,35,to,0.673333,0.718788,0.68902,0.728788,,,,,
+3,35,give,0.694902,0.718182,0.727059,0.732121,,,,,
+3,35,you,0.732549,0.720909,0.761176,0.731818,,,,,
+3,35,an,0.767059,0.720909,0.78549,0.729394,,,,,
+3,35,idea,0.791765,0.718182,0.825098,0.729091,,,,,
+3,35,of,0.830588,0.717879,0.846667,0.729091,,,,,
+3,36,what,0.117255,0.734545,0.156078,0.745152,,,,,
+3,36,is,0.161961,0.734545,0.173725,0.745152,,,,,
+3,36,possible.,0.18,0.734848,0.250196,0.747879,,,,,
+3,36,While,0.256471,0.734545,0.300392,0.745455,,,,,
+3,36,you,0.305882,0.73697,0.334118,0.747879,,,,,
+3,36,should,0.340392,0.734545,0.392549,0.745152,,,,,
+3,36,feel,0.399216,0.734545,0.427451,0.745455,,,,,
+3,36,free,0.432549,0.734545,0.463529,0.745152,,,,,
+3,36,to,0.468235,0.735152,0.483529,0.745152,,,,,
+3,36,use,0.490196,0.737576,0.518039,0.745455,,,,,
+3,36,some,0.523137,0.73697,0.567059,0.745455,,,,,
+3,36,of,0.572549,0.734545,0.588627,0.745455,,,,,
+3,36,the,0.592549,0.734545,0.618039,0.745152,,,,,
+3,36,formatting,0.623137,0.734545,0.704314,0.748182,,,,,
+3,36,and,0.710588,0.734242,0.738824,0.745152,,,,,
+3,36,"language,",0.746275,0.734545,0.822745,0.748182,,,,,
+3,36,we,0.828627,0.737273,0.851765,0.745455,,,,,
+3,37,encourage,0.117647,0.753333,0.202353,0.764242,,,,,
+3,37,you,0.207451,0.753333,0.236471,0.764242,,,,,
+3,37,to,0.241569,0.751212,0.257255,0.761212,,,,,
+3,37,make,0.263529,0.750909,0.306667,0.761212,,,,,
+3,37,your,0.311373,0.753333,0.347451,0.764242,,,,,
+3,37,agreement,0.352549,0.751212,0.437647,0.764242,,,,,
+3,37,your,0.443137,0.753333,0.478824,0.763939,,,,,
+3,37,own,0.483529,0.753333,0.516471,0.761515,,,,,
+3,37,and,0.522353,0.750606,0.550588,0.761212,,,,,
+3,37,be,0.558039,0.750909,0.578039,0.761515,,,,,
+3,37,creative,0.583529,0.750606,0.64549,0.761515,,,,,
+3,37,with,0.65098,0.750606,0.683529,0.761212,,,,,
+3,37,what,0.68902,0.751212,0.727843,0.761515,,,,,
+3,37,you,0.732941,0.753333,0.761961,0.763939,,,,,
+3,37,produce.,0.768235,0.750909,0.837647,0.763939,,,,,
+3,37,If,0.845882,0.750909,0.855294,0.761515,,,,,
+3,38,you,0.117647,0.769091,0.146275,0.780303,,,,,
+3,38,are,0.152157,0.769697,0.177255,0.777879,,,,,
+3,38,unsure,0.182745,0.769394,0.236471,0.777879,,,,,
+3,38,about,0.241961,0.76697,0.287451,0.777576,,,,,
+3,38,your,0.292549,0.769394,0.328627,0.780303,,,,,
+3,38,agreement,0.333725,0.767273,0.418431,0.780303,,,,,
+3,38,or,0.423922,0.769697,0.440784,0.777576,,,,,
+3,38,want,0.445098,0.767273,0.483922,0.777273,,,,,
+3,38,advice,0.489412,0.76697,0.541176,0.777576,,,,,
+3,38,you,0.546275,0.769394,0.574902,0.780303,,,,,
+3,38,can,0.581176,0.769394,0.609412,0.777576,,,,,
+3,38,always,0.615686,0.766667,0.670196,0.780303,,,,,
+3,38,solicit,0.675294,0.766667,0.721176,0.777576,,,,,
+3,38,feedback,0.726275,0.766667,0.800392,0.777576,,,,,
+3,38,by,0.807059,0.766667,0.826275,0.78,,,,,
+3,39,sending,0.117647,0.783333,0.18,0.796364,,,,,
+3,39,it,0.186667,0.783333,0.195294,0.793333,,,,,
+3,39,to,0.200392,0.783636,0.215686,0.793333,,,,,
+3,39,our,0.221569,0.785758,0.248235,0.793939,,,,,
+3,39,Membership,0.253333,0.78303,0.352549,0.796364,,,,,
+3,39,Director,0.359216,0.782727,0.422745,0.793939,,,,,
+3,39,at,0.427843,0.783636,0.442353,0.793939,,,,,
+3,39,akaplan@sister-cities.org,0.447843,0.78303,0.648627,0.796667,,,,,
+3,39,or,0.654118,0.786061,0.670588,0.793939,,,,,
+3,39,contacting,0.675294,0.783333,0.758824,0.796364,,,,,
+3,39,us,0.765098,0.786061,0.783529,0.793939,,,,,
+3,39,at,0.789412,0.783939,0.803922,0.793636,,,,,
+3,39,(202),0.809804,0.78303,0.850196,0.796667,,,,,
+3,40,347-8630.,0.117647,0.799394,0.198431,0.809697,,,,,
+4,2,SisterCities,0.169412,0.033333,0.40902,0.061818,,,,,
+4,3,Partnership,0.516471,0.027879,0.732941,0.060606,,,,,
+4,3,Agreement,0.747843,0.028182,0.957255,0.060606,,,,,
+4,4,INTERNATIONAL,0.170196,0.066667,0.408627,0.075758,,,,,
+4,5,Connect,0.169412,0.08697,0.236078,0.097879,,,,,
+4,5,globally.,0.240784,0.087273,0.301961,0.100303,,,,,
+4,5,Thrive,0.307059,0.08697,0.35451,0.097879,,,,,
+4,5,locally.,0.358824,0.087273,0.409412,0.100303,,,,,
+4,6,Toolkit,0.830588,0.072727,0.958431,0.098485,,,,,
+4,7,jull,0.378039,0.292424,0.422745,0.310303,,,,,
+4,7,bubzig,0.427451,0.291818,0.512941,0.310303,,,,,
+4,7,2000,0.592941,0.291212,0.648235,0.310606,,,,,
+4,7,"3,312",0.654118,0.292121,0.703922,0.31,,,,,
+4,8,ABU,0.376471,0.316667,0.407451,0.332727,,,,,
+4,8,DHABI,0.412157,0.31697,0.456863,0.332727,,,,,
+4,8,MUNICIPALITY,0.461176,0.316667,0.563529,0.33303,,,,,
+4,8,&,0.567843,0.317273,0.579216,0.332727,,,,,
+4,8,TOWN,0.583137,0.31697,0.625882,0.333333,,,,,
+4,8,PLANNING,0.630196,0.31697,0.704314,0.333333,,,,,
+4,9,AN,0.260784,0.375152,0.292157,0.386364,,,,,
+4,9,AGREEMENT,0.299608,0.374545,0.444314,0.386364,,,,,
+4,9,FOR,0.45098,0.374242,0.496863,0.385758,,,,,
+4,9,THE,0.503137,0.374242,0.54902,0.385758,,,,,
+4,9,ESTABLISHMENT,0.556078,0.373636,0.749804,0.385455,,,,,
+4,9,OF,0.756471,0.373636,0.786667,0.385152,,,,,
+4,10,SISTER,0.337647,0.394545,0.421176,0.405758,,,,,
+4,10,CITIES,0.428235,0.394242,0.503922,0.405455,,,,,
+4,10,RELATIONSHIP,0.51098,0.393636,0.680784,0.405152,,,,,
+4,11,BETWEEN,0.454902,0.413636,0.56549,0.424848,,,,,
+4,12,THE,0.337255,0.433939,0.383922,0.444848,,,,,
+4,12,CITY,0.39098,0.433636,0.44549,0.444848,,,,,
+4,12,OF,0.452549,0.433333,0.482745,0.444545,,,,,
+4,12,ABU,0.489412,0.433333,0.536863,0.444545,,,,,
+4,12,DHABI,0.544314,0.43303,0.614902,0.444545,,,,,
+4,12,(,0.623137,0.43303,0.630588,0.446667,,,,,
+4,12,U.,0.637647,0.43303,0.660784,0.444545,,,,,
+4,12,A.E),0.667843,0.432727,0.712941,0.44697,,,,,
+4,13,AND,0.487843,0.452727,0.536078,0.463939,,,,,
+4,14,"HOUSTON,",0.385882,0.472424,0.511765,0.486364,,,,,
+4,14,TEXAS,0.518431,0.472121,0.593725,0.483939,,,,,
+4,14,( U.S.A),0.604706,0.471515,0.683922,0.486364,,,,,
+4,15,The,0.221961,0.52697,0.250196,0.536667,,,,,
+4,15,Sister,0.25451,0.52697,0.295686,0.536667,,,,,
+4,15,City,0.299608,0.526364,0.330588,0.538485,,,,,
+4,15,"Program,",0.336078,0.52697,0.404706,0.538788,,,,,
+4,15,administered,0.40902,0.526061,0.504314,0.535758,,,,,
+4,15,by,0.508627,0.526061,0.527059,0.537879,,,,,
+4,15,Sister,0.530588,0.525758,0.572549,0.535758,,,,,
+4,15,Cities,0.576471,0.525455,0.618039,0.535758,,,,,
+4,15,"International,",0.621961,0.525455,0.722745,0.536667,,,,,
+4,15,was,0.726667,0.528182,0.75451,0.535152,,,,,
+4,15,initiated,0.758824,0.525758,0.819216,0.535455,,,,,
+4,16,By,0.222745,0.540909,0.241569,0.552424,,,,,
+4,16,the,0.246275,0.540909,0.26902,0.550303,,,,,
+4,16,President,0.273725,0.540606,0.341569,0.55,,,,,
+4,16,of,0.345882,0.540303,0.361961,0.549697,,,,,
+4,16,the,0.364706,0.540303,0.388235,0.549697,,,,,
+4,16,United,0.392549,0.54,0.441569,0.549394,,,,,
+4,16,States,0.445882,0.54,0.489804,0.549394,,,,,
+4,16,of,0.494118,0.54,0.510196,0.549394,,,,,
+4,16,America,0.512549,0.539697,0.573725,0.549091,,,,,
+4,16,in,0.578039,0.539697,0.592549,0.549091,,,,,
+4,16,1956,0.598039,0.539394,0.631373,0.549091,,,,,
+4,16,to,0.635294,0.540606,0.649412,0.549091,,,,,
+4,16,encourage,0.653725,0.541818,0.728235,0.551212,,,,,
+4,16,greater,0.732157,0.540909,0.784706,0.551212,,,,,
+4,17,Friendship,0.222745,0.554545,0.3,0.566364,,,,,
+4,17,and,0.304314,0.554242,0.331765,0.563939,,,,,
+4,17,understanding,0.336863,0.553939,0.443922,0.565758,,,,,
+4,17,between,0.448235,0.553939,0.507059,0.563333,,,,,
+4,17,the,0.511765,0.553636,0.534902,0.56303,,,,,
+4,17,United,0.539216,0.553333,0.587843,0.563333,,,,,
+4,17,States,0.592157,0.553333,0.635686,0.56303,,,,,
+4,17,and,0.640392,0.553333,0.667843,0.562727,,,,,
+4,17,other,0.672157,0.553636,0.711765,0.562727,,,,,
+4,17,nations,0.715294,0.553333,0.768627,0.562727,,,,,
+4,17,through,0.772549,0.553333,0.83098,0.565455,,,,,
+4,18,Direct,0.222745,0.568485,0.266667,0.577879,,,,,
+4,18,personal,0.270588,0.568182,0.332157,0.580303,,,,,
+4,18,contact:,0.336471,0.568788,0.394902,0.578485,,,,,
+4,18,and,0.400392,0.567576,0.428235,0.57697,,,,,
+4,19,In,0.222353,0.595758,0.237255,0.605152,,,,,
+4,19,order,0.241961,0.595758,0.281961,0.605152,,,,,
+4,19,to,0.285882,0.596667,0.300392,0.604848,,,,,
+4,19,foster,0.304314,0.595152,0.345882,0.604848,,,,,
+4,19,those,0.349804,0.595152,0.388235,0.604545,,,,,
+4,19,"goals,",0.392157,0.595152,0.433725,0.60697,,,,,
+4,19,the,0.438039,0.595152,0.461569,0.604242,,,,,
+4,19,people,0.465098,0.594848,0.512549,0.60697,,,,,
+4,19,of,0.516471,0.594545,0.532941,0.604242,,,,,
+4,19,Abu,0.534902,0.594545,0.565882,0.603939,,,,,
+4,19,Dhabi,0.570588,0.594242,0.615294,0.603939,,,,,
+4,19,and,0.619216,0.594545,0.647059,0.604242,,,,,
+4,19,"Houston,",0.651373,0.594242,0.717255,0.605455,,,,,
+4,19,in,0.721961,0.594545,0.735686,0.603636,,,,,
+4,19,a,0.740392,0.59697,0.74902,0.603636,,,,,
+4,19,gesture,0.752941,0.595758,0.806275,0.606364,,,,,
+4,19,of,0.809804,0.594545,0.825882,0.603939,,,,,
+4,20,Friendship,0.222745,0.609394,0.3,0.621515,,,,,
+4,20,and,0.304314,0.609394,0.331765,0.618788,,,,,
+4,20,"goodwill,",0.336078,0.608788,0.402353,0.620909,,,,,
+4,20,agree,0.406667,0.611212,0.446667,0.620606,,,,,
+4,20,to,0.450588,0.61,0.465098,0.618182,,,,,
+4,20,collaborate,0.469412,0.608485,0.549804,0.618182,,,,,
+4,20,for,0.553333,0.608485,0.575294,0.617879,,,,,
+4,20,the,0.579216,0.608485,0.602353,0.617879,,,,,
+4,20,mutual,0.606275,0.608485,0.657647,0.617879,,,,,
+4,20,benefit,0.662745,0.608485,0.712941,0.617879,,,,,
+4,20,of,0.716863,0.608182,0.732941,0.617576,,,,,
+4,20,their,0.735686,0.608182,0.770588,0.617879,,,,,
+4,21,Communities,0.222353,0.62303,0.318039,0.632727,,,,,
+4,21,by,0.322353,0.622727,0.340392,0.634242,,,,,
+4,21,exploring,0.344706,0.622727,0.413725,0.634242,,,,,
+4,21,"education,",0.418039,0.622424,0.494118,0.633333,,,,,
+4,21,economic,0.498039,0.622424,0.565882,0.631818,,,,,
+4,21,and,0.569804,0.622121,0.597647,0.631818,,,,,
+4,21,cultural,0.601961,0.622121,0.658824,0.631818,,,,,
+4,21,opportunities.,0.663137,0.622121,0.764314,0.633939,,,,,
+4,22,Abu,0.221569,0.650303,0.252941,0.659697,,,,,
+4,22,Dhabi,0.257647,0.65,0.301176,0.659394,,,,,
+4,22,and,0.30549,0.65,0.332549,0.659394,,,,,
+4,22,"Houston,",0.337647,0.65,0.404314,0.660606,,,,,
+4,22,sharing,0.408235,0.649394,0.463922,0.661515,,,,,
+4,22,a,0.467843,0.652121,0.476078,0.659091,,,,,
+4,22,common,0.480784,0.652121,0.542353,0.659091,,,,,
+4,22,interest,0.546667,0.648788,0.601176,0.658485,,,,,
+4,22,in,0.60549,0.649394,0.619216,0.658485,,,,,
+4,22,"energy,",0.623922,0.651515,0.681176,0.660909,,,,,
+4,22,technology,0.68549,0.648788,0.764314,0.661212,,,,,
+4,22,and,0.768627,0.648788,0.796078,0.658485,,,,,
+4,23,"medicine,",0.222353,0.663939,0.290196,0.674545,,,,,
+4,23,and,0.29451,0.663939,0.321569,0.673333,,,,,
+4,23,the,0.326275,0.663636,0.349412,0.67303,,,,,
+4,23,desire,0.353333,0.663636,0.397647,0.672727,,,,,
+4,23,to,0.401569,0.664545,0.416078,0.67303,,,,,
+4,23,promote,0.420784,0.664545,0.481961,0.675152,,,,,
+4,23,mutual,0.485882,0.663333,0.537255,0.672424,,,,,
+4,23,understanding,0.542353,0.66303,0.647451,0.675152,,,,,
+4,23,among,0.651765,0.665455,0.701176,0.674545,,,,,
+4,23,our,0.70549,0.665758,0.731373,0.672727,,,,,
+4,23,citizens,0.734902,0.66303,0.788627,0.672424,,,,,
+4,23,do,0.792549,0.663333,0.81098,0.672424,,,,,
+4,24,hereby,0.221961,0.677879,0.270588,0.689394,,,,,
+4,24,proclaim,0.275294,0.677576,0.338431,0.689697,,,,,
+4,24,themselves,0.343137,0.677576,0.421961,0.68697,,,,,
+4,24,Sister,0.426275,0.67697,0.468235,0.686364,,,,,
+4,24,Cities,0.471765,0.676667,0.513725,0.686364,,,,,
+4,24,beginning,0.518039,0.67697,0.590196,0.689091,,,,,
+4,24,on,0.594118,0.679394,0.611373,0.686667,,,,,
+4,24,the,0.616078,0.676667,0.639608,0.686061,,,,,
+4,24,13th,0.643922,0.673636,0.670588,0.686364,,,,,
+4,24,day,0.674902,0.676364,0.701176,0.688485,,,,,
+4,24,of,0.705882,0.676364,0.721961,0.685758,,,,,
+4,24,March,0.724314,0.676667,0.772549,0.686364,,,,,
+4,24,"2001,",0.777255,0.67697,0.816471,0.687576,,,,,
+4,25,the,0.221961,0.692424,0.244314,0.701515,,,,,
+4,25,date,0.248235,0.692121,0.279608,0.701515,,,,,
+4,25,of,0.283529,0.691515,0.299216,0.701212,,,,,
+4,25,Houston,0.302353,0.691818,0.363137,0.700909,,,,,
+4,25,City,0.367843,0.690909,0.4,0.703333,,,,,
+4,25,Council,0.404314,0.690909,0.461176,0.700909,,,,,
+4,25,resolution,0.46549,0.690909,0.536863,0.700606,,,,,
+4,25,estatblishing,0.541569,0.690606,0.633333,0.702727,,,,,
+4,25,the,0.637255,0.690606,0.66,0.700303,,,,,
+4,25,Sister,0.664314,0.690606,0.706667,0.700303,,,,,
+4,25,City,0.710588,0.690303,0.741961,0.702727,,,,,
+4,26,relationship,0.221569,0.705455,0.306667,0.717576,,,,,
+4,26,became,0.31098,0.705758,0.366275,0.714545,,,,,
+4,26,effective.,0.374118,0.705152,0.439608,0.714545,,,,,
+4,27,Signed,0.221569,0.733333,0.269412,0.745455,,,,,
+4,27,on,0.273725,0.736061,0.291373,0.74303,,,,,
+4,27,this,0.296078,0.733333,0.322745,0.742727,,,,,
+4,27,26,0.327059,0.733333,0.344314,0.742424,,,,,
+4,27,of,0.348627,0.73303,0.365098,0.742727,,,,,
+4,27,October,0.371765,0.73303,0.432549,0.742424,,,,,
+4,27,"2002,",0.436471,0.732727,0.474902,0.743939,,,,,
+4,27,in,0.478824,0.732727,0.492941,0.741818,,,,,
+4,27,duplicate,0.497255,0.732424,0.564314,0.744545,,,,,
+4,27,in,0.568627,0.732424,0.582745,0.742121,,,,,
+4,27,the,0.587059,0.732424,0.610588,0.742121,,,,,
+4,27,Arabic,0.613333,0.732424,0.664314,0.741818,,,,,
+4,27,and,0.668627,0.732727,0.696078,0.741818,,,,,
+4,27,English,0.700784,0.732121,0.754902,0.744242,,,,,
+4,28,"Languages,",0.221961,0.747576,0.302745,0.759697,,,,,
+4,28,both,0.307059,0.74697,0.34,0.756364,,,,,
+4,28,text,0.345098,0.748182,0.372549,0.757273,,,,,
+4,28,being,0.376863,0.74697,0.417647,0.758788,,,,,
+4,28,equally,0.421569,0.746667,0.47451,0.758788,,,,,
+4,28,authentic.,0.478039,0.746667,0.550588,0.756061,,,,,
+4,29,A,0.344314,0.768485,0.42902,0.799091,,,,,
+4,30,Sheikh,0.245882,0.80697,0.310196,0.817576,,,,,
+4,30,Mohammed,0.316471,0.80697,0.426667,0.817273,,,,,
+4,30,bin,0.432157,0.80697,0.461176,0.81697,,,,,
+4,30,Butti,0.467843,0.806667,0.51451,0.81697,,,,,
+4,30,AI,0.52,0.806364,0.54,0.816667,,,,,
+4,30,Hamed,0.546667,0.806667,0.612549,0.81697,,,,,
+4,31,Lee,0.729412,0.806364,0.763529,0.816667,,,,,
+4,31,P.Brown,0.769804,0.806364,0.848235,0.816667,,,,,
+4,32,Chairman,0.24549,0.824545,0.336078,0.834545,,,,,
+4,32,of,0.342353,0.823939,0.362353,0.834545,,,,,
+4,32,Abu,0.366667,0.823939,0.404314,0.834242,,,,,
+4,32,Dhabi,0.41098,0.823939,0.465098,0.833939,,,,,
+4,32,Municipality,0.471373,0.823636,0.588235,0.836667,,,,,
+4,33,Mayor,0.704706,0.823333,0.763137,0.836061,,,,,
+4,33,of,0.768235,0.823333,0.788235,0.833636,,,,,
+4,33,Houston,0.793333,0.823636,0.871765,0.833939,,,,,
+4,34,&Town,0.324314,0.841515,0.391373,0.852121,,,,,
+4,34,Planning,0.398431,0.841212,0.480392,0.853939,,,,,
+5,2,SisterCities,0.169412,0.033333,0.40902,0.061818,,,,,
+5,3,Partnership,0.516078,0.027879,0.733333,0.060303,,,,,
+5,3,Agreement,0.747451,0.028182,0.957255,0.060606,,,,,
+5,4,INTERNATIONAL,0.17098,0.066667,0.408627,0.075758,,,,,
+5,5,Connect,0.169412,0.08697,0.236078,0.097879,,,,,
+5,5,globally.,0.240784,0.087273,0.301961,0.100303,,,,,
+5,5,Thrive,0.307059,0.08697,0.35451,0.097879,,,,,
+5,5,locally.,0.358824,0.087273,0.40902,0.100303,,,,,
+5,6,Toolkit,0.83098,0.072727,0.958039,0.098485,,,,,
+5,7,THE,0.438824,0.262121,0.476471,0.271818,,,,,
+5,7,CITY,0.488627,0.262121,0.531373,0.271818,,,,,
+5,7,OF,0.541961,0.263939,0.56549,0.271515,,,,,
+5,7,NEW,0.577647,0.262121,0.621569,0.271515,,,,,
+5,7,YORK,0.629804,0.262121,0.68,0.271515,,,,,
+5,8,OFFICE,0.450196,0.27697,0.516863,0.286667,,,,,
+5,8,OF,0.52902,0.278788,0.552157,0.286667,,,,,
+5,8,THE,0.562353,0.278788,0.596863,0.286667,,,,,
+5,8,MAYOR,0.609412,0.277273,0.67098,0.28697,,,,,
+5,9,NEW,0.461176,0.29303,0.500392,0.301818,,,,,
+5,9,"YORK,",0.506275,0.29303,0.555294,0.303333,,,,,
+5,9,N.Y.,0.563137,0.29303,0.595294,0.302121,,,,,
+5,9,10007,0.604314,0.294848,0.658039,0.302121,,,,,
+5,10,THE,0.267451,0.357273,0.30902,0.367576,,,,,
+5,10,NEW,0.314118,0.35697,0.361569,0.367576,,,,,
+5,10,YORK,0.366275,0.356667,0.42549,0.367273,,,,,
+5,10,CITY-LONDON,0.430196,0.355758,0.573333,0.366667,,,,,
+5,10,SISTER,0.578039,0.356061,0.648627,0.36697,,,,,
+5,10,CITY,0.652941,0.356061,0.702745,0.366667,,,,,
+5,10,PARTNERSHIP,0.707843,0.355758,0.850196,0.366667,,,,,
+5,11,Memorandum,0.420392,0.371818,0.543137,0.382424,,,,,
+5,11,of,0.547451,0.371212,0.566275,0.381818,,,,,
+5,11,Understanding,0.569412,0.371212,0.695686,0.384848,,,,,
+5,12,The,0.201176,0.403939,0.232941,0.414545,,,,,
+5,12,Sister,0.237647,0.403636,0.284706,0.414545,,,,,
+5,12,City,0.288235,0.403333,0.322745,0.416667,,,,,
+5,12,partnership,0.326667,0.402727,0.415294,0.416667,,,,,
+5,12,between,0.419608,0.402727,0.486667,0.41303,,,,,
+5,12,New,0.491373,0.402424,0.52902,0.412727,,,,,
+5,12,York,0.533725,0.402424,0.574118,0.41303,,,,,
+5,12,City,0.578824,0.402121,0.613333,0.415455,,,,,
+5,12,and,0.617647,0.402424,0.646275,0.412727,,,,,
+5,12,London,0.65098,0.402424,0.713333,0.41303,,,,,
+5,12,will,0.717647,0.402121,0.749412,0.412727,,,,,
+5,12,foster,0.75451,0.402121,0.8,0.41303,,,,,
+5,12,mutually,0.804314,0.402424,0.875686,0.415455,,,,,
+5,13,beneficial,0.201176,0.418788,0.28,0.429394,,,,,
+5,13,solutions,0.285098,0.418788,0.356471,0.429394,,,,,
+5,13,to,0.361176,0.420303,0.376471,0.428788,,,,,
+5,13,common,0.380784,0.420909,0.450196,0.428788,,,,,
+5,13,challenges,0.455294,0.417273,0.539216,0.430606,,,,,
+5,13,for,0.544706,0.417273,0.567451,0.427879,,,,,
+5,13,these,0.571373,0.417576,0.612157,0.428182,,,,,
+5,13,two,0.616471,0.419091,0.646275,0.428182,,,,,
+5,13,great,0.65098,0.419091,0.69098,0.430909,,,,,
+5,13,cosmopolitan,0.696078,0.417273,0.803529,0.430909,,,,,
+5,13,entities.,0.808627,0.417576,0.870588,0.428182,,,,,
+5,14,"Consequently,",0.201176,0.434242,0.316078,0.447879,,,,,
+5,14,the,0.320784,0.434242,0.345098,0.444545,,,,,
+5,14,Sister,0.350196,0.433939,0.395294,0.444545,,,,,
+5,14,City,0.399608,0.433333,0.433725,0.446667,,,,,
+5,14,relationship,0.438039,0.43303,0.532157,0.446364,,,,,
+5,14,between,0.536863,0.432727,0.602353,0.443636,,,,,
+5,14,the,0.606667,0.43303,0.631765,0.443939,,,,,
+5,14,two,0.635686,0.434545,0.66549,0.443636,,,,,
+5,14,will,0.670196,0.43303,0.701176,0.443636,,,,,
+5,14,be,0.706275,0.43303,0.72549,0.443636,,,,,
+5,14,one,0.730196,0.436061,0.759608,0.443333,,,,,
+5,14,of,0.763922,0.432727,0.783529,0.443636,,,,,
+5,14,the,0.78549,0.432727,0.810196,0.443333,,,,,
+5,14,most,0.81451,0.434242,0.854118,0.443333,,,,,
+5,15,important,0.201176,0.450303,0.28,0.463636,,,,,
+5,15,in,0.284314,0.449697,0.299608,0.460909,,,,,
+5,15,their,0.304314,0.449697,0.341176,0.460606,,,,,
+5,15,network,0.345098,0.449394,0.409804,0.46,,,,,
+5,15,of,0.414118,0.448788,0.433333,0.459697,,,,,
+5,15,global,0.435686,0.448485,0.48549,0.462121,,,,,
+5,15,"partnerships,",0.489804,0.448182,0.591373,0.461818,,,,,
+5,15,as,0.596471,0.451515,0.612941,0.459091,,,,,
+5,15,it,0.618039,0.448182,0.62902,0.459394,,,,,
+5,15,strives,0.633725,0.448788,0.684706,0.459091,,,,,
+5,15,to:,0.689804,0.450303,0.710196,0.459091,,,,,
+5,16,Encourage,0.230588,0.482727,0.316471,0.495758,,,,,
+5,16,and,0.320784,0.481818,0.349412,0.492727,,,,,
+5,16,publicize,0.353333,0.481212,0.426667,0.495152,,,,,
+5,16,existing,0.430588,0.480606,0.49451,0.494242,,,,,
+5,16,exchanges,0.499216,0.480606,0.581176,0.493939,,,,,
+5,16,between,0.58549,0.480606,0.651373,0.491515,,,,,
+5,16,London,0.655294,0.480909,0.718431,0.491515,,,,,
+5,16,and,0.723137,0.480606,0.751765,0.491515,,,,,
+5,16,New,0.756471,0.480606,0.79451,0.491212,,,,,
+5,16,York,0.799216,0.480606,0.84,0.491515,,,,,
+5,16,City,0.845098,0.480303,0.879608,0.493636,,,,,
+5,16,so,0.884314,0.483333,0.902353,0.491212,,,,,
+5,17,that,0.230588,0.497879,0.261176,0.508788,,,,,
+5,17,they,0.26549,0.498485,0.300392,0.511515,,,,,
+5,17,can,0.304314,0.500606,0.332549,0.508182,,,,,
+5,17,flourish,0.337255,0.497273,0.398039,0.508182,,,,,
+5,17,to,0.402353,0.498788,0.418039,0.507576,,,,,
+5,17,benefit,0.422353,0.496364,0.478824,0.507576,,,,,
+5,17,a,0.483529,0.499394,0.492941,0.506667,,,,,
+5,17,wider,0.496863,0.496061,0.542745,0.50697,,,,,
+5,17,cross-section,0.546667,0.496364,0.649804,0.507273,,,,,
+5,17,of,0.653725,0.496364,0.673725,0.507273,,,,,
+5,17,the,0.675294,0.496364,0.700392,0.50697,,,,,
+5,17,citizens,0.704706,0.496061,0.766667,0.50697,,,,,
+5,17,of,0.770588,0.496061,0.790588,0.506667,,,,,
+5,17,both;,0.792549,0.496061,0.832941,0.508485,,,,,
+5,18,Support,0.230196,0.514848,0.294118,0.528182,,,,,
+5,18,and,0.298431,0.514242,0.327451,0.524848,,,,,
+5,18,promote,0.331373,0.515152,0.397647,0.527879,,,,,
+5,18,the,0.401961,0.513636,0.426667,0.524242,,,,,
+5,18,development,0.43098,0.51303,0.53451,0.526364,,,,,
+5,18,of,0.538039,0.512727,0.557255,0.523333,,,,,
+5,18,new,0.559216,0.516061,0.592549,0.523333,,,,,
+5,18,"social,",0.597255,0.512727,0.647843,0.525758,,,,,
+5,18,"economic,",0.653333,0.512727,0.734902,0.525152,,,,,
+5,18,academic,0.740392,0.512424,0.815686,0.523333,,,,,
+5,18,and,0.820392,0.512727,0.84902,0.523333,,,,,
+5,19,community,0.229804,0.529697,0.321176,0.54303,,,,,
+5,19,programs,0.32549,0.532121,0.4,0.54303,,,,,
+5,19,to,0.404314,0.530606,0.42,0.539697,,,,,
+5,19,encourage,0.425098,0.531515,0.507059,0.541818,,,,,
+5,19,both,0.511373,0.528182,0.546667,0.538788,,,,,
+5,19,cities',0.55098,0.528485,0.598431,0.538788,,,,,
+5,19,citizens,0.603922,0.528485,0.664706,0.539091,,,,,
+5,19,to,0.66902,0.53,0.684706,0.538788,,,,,
+5,19,share,0.689412,0.528485,0.732549,0.538788,,,,,
+5,19,their,0.736863,0.528182,0.774118,0.538788,,,,,
+5,19,experiences,0.778824,0.527879,0.872549,0.541515,,,,,
+5,19,as,0.876863,0.531212,0.894118,0.538485,,,,,
+5,19,a,0.898824,0.531212,0.907843,0.538788,,,,,
+5,20,medium,0.229804,0.545152,0.295686,0.556061,,,,,
+5,20,for,0.300784,0.545152,0.323922,0.555758,,,,,
+5,20,learning,0.328235,0.544848,0.392941,0.557879,,,,,
+5,20,from,0.397647,0.544242,0.435686,0.555152,,,,,
+5,20,one,0.440392,0.54697,0.469804,0.554545,,,,,
+5,20,another;,0.47451,0.543636,0.539216,0.556364,,,,,
+5,21,Generate,0.229804,0.562121,0.301961,0.57303,,,,,
+5,21,an,0.306275,0.564545,0.32549,0.572424,,,,,
+5,21,improvement,0.330196,0.561515,0.434902,0.574848,,,,,
+5,21,of,0.439608,0.560606,0.459216,0.571515,,,,,
+5,21,the,0.461176,0.560606,0.486275,0.571515,,,,,
+5,21,operation,0.490588,0.560303,0.565882,0.573939,,,,,
+5,21,of,0.569804,0.560606,0.589412,0.571212,,,,,
+5,21,the,0.59098,0.560606,0.616078,0.571212,,,,,
+5,21,cities',0.62,0.560303,0.667451,0.571212,,,,,
+5,21,various,0.672941,0.560303,0.732157,0.571212,,,,,
+5,21,government,0.737647,0.561818,0.832549,0.574242,,,,,
+5,21,agencies,0.836863,0.56,0.905882,0.573636,,,,,
+5,22,by,0.22902,0.577576,0.250196,0.590606,,,,,
+5,22,serving,0.25451,0.577273,0.313333,0.590303,,,,,
+5,22,as,0.318039,0.58,0.33451,0.587273,,,,,
+5,22,a,0.339216,0.579697,0.348627,0.587273,,,,,
+5,22,conduit,0.352549,0.576364,0.412549,0.587273,,,,,
+5,22,of,0.416471,0.576061,0.436078,0.58697,,,,,
+5,22,information;,0.438039,0.575758,0.537255,0.588182,,,,,
+5,23,Identify,0.229412,0.593636,0.292941,0.60697,,,,,
+5,23,"themes,",0.297255,0.593636,0.358039,0.606061,,,,,
+5,23,common,0.362745,0.595758,0.432157,0.603939,,,,,
+5,23,to,0.436471,0.593939,0.452157,0.603333,,,,,
+5,23,"both,",0.456863,0.592424,0.497255,0.604848,,,,,
+5,23,that,0.502353,0.592121,0.532941,0.60303,,,,,
+5,23,can,0.537255,0.595152,0.564706,0.602727,,,,,
+5,23,generate,0.569412,0.593939,0.636078,0.605758,,,,,
+5,23,new,0.640392,0.595152,0.673725,0.60303,,,,,
+5,23,initiatives,0.678039,0.592121,0.757647,0.60303,,,,,
+5,23,to,0.762353,0.593939,0.777647,0.602727,,,,,
+5,23,further,0.783137,0.591818,0.837647,0.602727,,,,,
+5,23,and,0.841569,0.592121,0.870588,0.602727,,,,,
+5,24,nurture,0.22902,0.611212,0.287451,0.62,,,,,
+5,24,the,0.291765,0.609091,0.316471,0.619697,,,,,
+5,24,increasingly,0.320784,0.608182,0.417255,0.621818,,,,,
+5,24,powerful,0.421569,0.607879,0.49451,0.621818,,,,,
+5,24,"financial,",0.499216,0.607576,0.572549,0.620303,,,,,
+5,24,social,0.577255,0.607576,0.623137,0.618485,,,,,
+5,24,and,0.627843,0.607879,0.656471,0.618485,,,,,
+5,24,cultural,0.660784,0.607576,0.722353,0.618788,,,,,
+5,24,relationships,0.727059,0.607576,0.829412,0.621212,,,,,
+5,24,between,0.833725,0.607576,0.900392,0.618485,,,,,
+5,25,the,0.22902,0.625152,0.254118,0.635455,,,,,
+5,25,cities;,0.258431,0.624545,0.30549,0.637273,,,,,
+5,26,Promote,0.228627,0.641515,0.297255,0.652424,,,,,
+5,26,key,0.301176,0.641212,0.330588,0.654545,,,,,
+5,26,mayoral,0.33451,0.640606,0.399216,0.654242,,,,,
+5,26,priorities,0.403529,0.64,0.476078,0.654242,,,,,
+5,26,relevant,0.480392,0.64,0.544706,0.650303,,,,,
+5,26,to,0.548627,0.641212,0.564314,0.650606,,,,,
+5,26,both,0.568627,0.64,0.604706,0.650606,,,,,
+5,26,London,0.60902,0.64,0.67098,0.650606,,,,,
+5,26,and,0.674902,0.64,0.703922,0.650303,,,,,
+5,26,New,0.708627,0.64,0.747059,0.650303,,,,,
+5,26,York,0.751765,0.639697,0.792941,0.650303,,,,,
+5,26,City;,0.797647,0.639394,0.837255,0.652727,,,,,
+5,27,Provide,0.228627,0.657879,0.291373,0.668788,,,,,
+5,27,financial,0.296078,0.657273,0.364706,0.668182,,,,,
+5,27,or,0.369412,0.660303,0.386667,0.667879,,,,,
+5,27,in,0.390588,0.65697,0.406275,0.667576,,,,,
+5,27,kind,0.410588,0.65697,0.446275,0.667576,,,,,
+5,27,support,0.451373,0.657879,0.51098,0.67,,,,,
+5,27,to,0.515294,0.657879,0.531373,0.66697,,,,,
+5,27,community-led,0.535294,0.656364,0.655686,0.669697,,,,,
+5,27,programs,0.660392,0.658788,0.736863,0.67,,,,,
+5,27,that,0.740784,0.656364,0.771765,0.66697,,,,,
+5,27,advance,0.776078,0.656061,0.840784,0.66697,,,,,
+5,27,the,0.84549,0.656364,0.870196,0.666667,,,,,
+5,28,aims,0.22902,0.673636,0.267451,0.683636,,,,,
+5,28,of,0.271765,0.67303,0.291373,0.683939,,,,,
+5,28,the,0.292941,0.673333,0.317647,0.683636,,,,,
+5,28,Sister,0.322353,0.67303,0.367843,0.683636,,,,,
+5,28,City,0.371765,0.672424,0.406275,0.685758,,,,,
+5,28,partnership;,0.410588,0.672121,0.504706,0.686061,,,,,
+5,29,With,0.198824,0.704545,0.239608,0.715152,,,,,
+5,29,the,0.243529,0.704545,0.26902,0.715152,,,,,
+5,29,above,0.273725,0.704242,0.320784,0.714848,,,,,
+5,29,purposes,0.325098,0.706667,0.395294,0.717576,,,,,
+5,29,in,0.4,0.703333,0.415686,0.713939,,,,,
+5,29,"mind,",0.42,0.703333,0.465882,0.715758,,,,,
+5,29,the,0.470588,0.703333,0.495686,0.713333,,,,,
+5,29,Mayor,0.500392,0.70303,0.553333,0.716061,,,,,
+5,29,of,0.557255,0.702727,0.576471,0.713333,,,,,
+5,29,the,0.578039,0.70303,0.603137,0.713333,,,,,
+5,29,City,0.607451,0.702727,0.641961,0.716061,,,,,
+5,29,of,0.645882,0.702727,0.665098,0.713333,,,,,
+5,29,New,0.667059,0.70303,0.705098,0.713333,,,,,
+5,29,York,0.710196,0.702727,0.751373,0.713636,,,,,
+5,29,and,0.756078,0.702727,0.784706,0.713333,,,,,
+5,29,the,0.789412,0.702727,0.814118,0.71303,,,,,
+5,29,Mayor,0.818824,0.70303,0.873725,0.716061,,,,,
+5,29,of,0.876078,0.702424,0.896471,0.713333,,,,,
+5,30,London,0.198824,0.720303,0.261569,0.731212,,,,,
+5,30,solemnly,0.266275,0.72,0.338824,0.732727,,,,,
+5,30,confirm,0.343137,0.719091,0.405882,0.73,,,,,
+5,30,that,0.409804,0.718788,0.440392,0.729394,,,,,
+5,30,these,0.444314,0.718788,0.486275,0.728788,,,,,
+5,30,two,0.490196,0.720303,0.520392,0.729091,,,,,
+5,30,cities,0.524706,0.718182,0.566667,0.728788,,,,,
+5,30,are,0.571373,0.721515,0.596078,0.729091,,,,,
+5,30,united,0.6,0.718485,0.649412,0.728788,,,,,
+5,30,by,0.653725,0.718485,0.67451,0.731818,,,,,
+5,30,an,0.678824,0.721515,0.697647,0.729091,,,,,
+5,30,official,0.702353,0.718182,0.760784,0.729091,,,,,
+5,30,partnership,0.765098,0.718182,0.85451,0.731818,,,,,
+5,30,by,0.859216,0.718182,0.88,0.731515,,,,,
+5,30,the,0.884706,0.718182,0.90902,0.728485,,,,,
+5,31,protocol,0.198431,0.736061,0.26549,0.749394,,,,,
+5,31,of,0.270196,0.735455,0.289804,0.746061,,,,,
+5,31,this,0.291765,0.735152,0.320784,0.745758,,,,,
+5,31,Memorandum,0.32549,0.734242,0.436471,0.745758,,,,,
+5,31,of,0.441176,0.733939,0.460784,0.744848,,,,,
+5,31,Understanding.,0.462745,0.733939,0.583137,0.747273,,,,,
+5,32,This,0.310196,0.782424,0.34549,0.79303,,,,,
+5,32,agreement,0.350196,0.783333,0.431765,0.795455,,,,,
+5,32,will,0.436078,0.781212,0.467451,0.792121,,,,,
+5,32,go,0.472549,0.783939,0.492157,0.794242,,,,,
+5,32,into,0.497255,0.780909,0.527843,0.791515,,,,,
+5,32,effect,0.532941,0.780606,0.578039,0.791515,,,,,
+5,32,from,0.582745,0.780909,0.620784,0.791818,,,,,
+5,32,the,0.62549,0.780909,0.650196,0.791818,,,,,
+5,32,date,0.654118,0.781212,0.687843,0.791818,,,,,
+5,32,of,0.692157,0.780909,0.711765,0.791818,,,,,
+5,32,signatures.,0.713333,0.781212,0.798431,0.794242,,,,,
+5,33,Signed,0.455686,0.796667,0.511373,0.810303,,,,,
+5,33,in,0.516078,0.796364,0.531373,0.80697,,,,,
+5,33,March,0.536078,0.796667,0.587059,0.80697,,,,,
+5,33,of,0.591765,0.796667,0.610588,0.807273,,,,,
+5,33,2001,0.612941,0.79697,0.650196,0.807273,,,,,
+5,34,Thedder,0.178824,0.795455,0.281961,0.896364,,,,,
+5,34,Rudolph,0.258039,0.878788,0.327451,0.892121,,,,,
+5,34,W.,0.331765,0.878485,0.353725,0.888788,,,,,
+5,34,Giuliani,0.359608,0.877576,0.423529,0.889091,,,,,
+5,35,Mayor,0.311373,0.894848,0.365098,0.907576,,,,,
+5,36,Ken,0.672157,0.877879,0.705098,0.888182,,,,,
+5,36,Mayor,0.706667,0.893636,0.760392,0.906667,,,,,
+5,36,Livingstone,0.710196,0.877576,0.80549,0.891212,,,,,
+5,37,New,0.287843,0.91,0.324706,0.920303,,,,,
+5,37,York,0.329804,0.909394,0.369804,0.92,,,,,
+5,37,City,0.374902,0.909091,0.40902,0.922727,,,,,
+5,38,London,0.701961,0.909091,0.763922,0.919697,,,,,
+6,2,SisterCities,0.169412,0.03303,0.409412,0.061515,,,,,
+6,3,Partnership,0.515686,0.027576,0.732941,0.060909,,,,,
+6,3,Agreement,0.746667,0.027879,0.957647,0.060606,,,,,
+6,4,INTERNATIONAL,0.169804,0.066667,0.408235,0.075758,,,,,
+6,5,Connect,0.169412,0.08697,0.236471,0.097879,,,,,
+6,5,globally.,0.240392,0.087273,0.301961,0.100303,,,,,
+6,5,Thrive,0.306667,0.08697,0.35451,0.097879,,,,,
+6,5,locally.,0.359216,0.087273,0.409412,0.100303,,,,,
+6,6,Toolkit,0.83098,0.072727,0.958824,0.098788,,,,,
+6,7,CHIC,0.247451,0.205455,0.269412,0.226061,,,,,
+6,7,OF,0.275686,0.190606,0.293333,0.201212,,,,,
+6,7,STATE,0.356471,0.197576,0.389804,0.227273,,,,,
+6,8,City,0.388627,0.196667,0.497647,0.260909,,,,,
+6,8,of,0.505098,0.216061,0.557647,0.26303,,,,,
+6,8,Long,0.55451,0.203636,0.695294,0.257576,,,,,
+6,8,Beach,0.698431,0.203636,0.86549,0.258485,,,,,
+6,9,California,0.551373,0.257273,0.687843,0.290606,,,,,
+6,10,Sister,0.321961,0.306667,0.418824,0.331515,,,,,
+6,10,City,0.42902,0.305455,0.505882,0.340909,,,,,
+6,10,Agreement,0.513333,0.30697,0.700392,0.340606,,,,,
+6,11,between,0.464706,0.352727,0.521569,0.362121,,,,,
+6,11,the,0.526275,0.352727,0.54902,0.362727,,,,,
+6,12,City,0.38,0.378788,0.435294,0.395758,,,,,
+6,12,of,0.447059,0.378485,0.475294,0.395455,,,,,
+6,12,Long,0.483922,0.379394,0.54549,0.395758,,,,,
+6,12,Beach,0.556863,0.378788,0.632549,0.391818,,,,,
+6,13,"California,",0.4,0.397576,0.544706,0.413939,,,,,
+6,13,USA,0.558824,0.397879,0.611373,0.410303,,,,,
+6,14,and,0.48,0.415152,0.507059,0.424242,,,,,
+6,14,the,0.511765,0.415152,0.533333,0.424242,,,,,
+6,15,City,0.321569,0.429091,0.376863,0.446061,,,,,
+6,15,of,0.38902,0.428788,0.417255,0.445758,,,,,
+6,15,San,0.425882,0.429394,0.470588,0.441818,,,,,
+6,15,Pablo,0.483137,0.429091,0.556863,0.441818,,,,,
+6,15,de,0.56902,0.429091,0.596471,0.441818,,,,,
+6,15,Manta,0.607843,0.429091,0.691373,0.441818,,,,,
+6,16,"Ecuador,",0.347451,0.448182,0.460392,0.46303,,,,,
+6,16,South,0.473333,0.447879,0.546667,0.460909,,,,,
+6,16,America,0.558039,0.448182,0.665098,0.460909,,,,,
+6,17,In,0.261569,0.483333,0.276471,0.492424,,,,,
+6,17,accordance,0.279608,0.483333,0.347059,0.492121,,,,,
+6,17,with,0.350196,0.482727,0.378431,0.492424,,,,,
+6,17,the,0.381961,0.48303,0.401569,0.492121,,,,,
+6,17,authorization,0.403922,0.482727,0.485882,0.492121,,,,,
+6,17,and,0.48902,0.482424,0.513725,0.492121,,,,,
+6,17,approval,0.516078,0.482727,0.569804,0.494242,,,,,
+6,17,expressed,0.572941,0.482727,0.632157,0.493939,,,,,
+6,17,by,0.634902,0.482424,0.65098,0.493636,,,,,
+6,17,the,0.654118,0.482727,0.674118,0.492121,,,,,
+6,17,City,0.676863,0.482424,0.70549,0.494242,,,,,
+6,17,of,0.708235,0.482424,0.723137,0.494242,,,,,
+6,17,Long,0.724314,0.482424,0.755686,0.493939,,,,,
+6,17,"Beach,",0.76,0.482121,0.798431,0.492424,,,,,
+6,18,"California,",0.217647,0.493333,0.282353,0.504242,,,,,
+6,18,"USA,",0.287843,0.493333,0.321961,0.503636,,,,,
+6,18,and,0.325882,0.493333,0.350588,0.502424,,,,,
+6,18,the,0.35451,0.493333,0.373725,0.502424,,,,,
+6,18,City,0.377647,0.493333,0.405882,0.504545,,,,,
+6,18,of,0.408235,0.49303,0.423137,0.505758,,,,,
+6,18,San,0.425098,0.49303,0.44902,0.502727,,,,,
+6,18,Pablo,0.453333,0.49303,0.487843,0.502424,,,,,
+6,18,de,0.491765,0.493636,0.506275,0.502424,,,,,
+6,18,"Manta,",0.509804,0.493333,0.55451,0.50303,,,,,
+6,18,"Ecundor,",0.559608,0.49303,0.614118,0.50303,,,,,
+6,18,South,0.618039,0.49303,0.654902,0.502424,,,,,
+6,18,"America,",0.659216,0.493333,0.715294,0.50303,,,,,
+6,18,it,0.719608,0.493333,0.729412,0.501818,,,,,
+6,18,is,0.734118,0.493333,0.744706,0.501818,,,,,
+6,18,declared,0.748235,0.492727,0.799216,0.501818,,,,,
+6,19,that,0.217647,0.503939,0.243137,0.512727,,,,,
+6,19,a,0.246667,0.505758,0.254902,0.512424,,,,,
+6,19,"""Sister",0.261569,0.503333,0.303529,0.51303,,,,,
+6,19,City,0.308235,0.503636,0.336863,0.515152,,,,,
+6,19,Agreement,0.341176,0.503939,0.408235,0.515152,,,,,
+6,19,between,0.420392,0.503636,0.469412,0.512424,,,,,
+6,19,the,0.474118,0.503333,0.493725,0.512121,,,,,
+6,19,two,0.498431,0.504242,0.521569,0.512121,,,,,
+6,19,cities,0.526275,0.503636,0.558039,0.512424,,,,,
+6,19,is,0.563137,0.503636,0.574118,0.512121,,,,,
+6,19,hereby,0.578431,0.503333,0.62,0.514848,,,,,
+6,19,established,0.623529,0.50303,0.69098,0.512424,,,,,
+6,19,for,0.692549,0.502727,0.713333,0.514848,,,,,
+6,19,the,0.718039,0.50303,0.737647,0.512121,,,,,
+6,19,following,0.738824,0.502727,0.799216,0.514848,,,,,
+6,20,purposes:,0.216863,0.516061,0.275294,0.525455,,,,,
+6,21,(1),0.278824,0.533636,0.297647,0.544242,,,,,
+6,21,to,0.307451,0.534848,0.32,0.54303,,,,,
+6,21,promote,0.322745,0.534848,0.373725,0.545152,,,,,
+6,21,and,0.376471,0.533636,0.401176,0.542727,,,,,
+6,21,expand,0.403922,0.533636,0.44902,0.544848,,,,,
+6,21,the,0.451765,0.533939,0.471765,0.542727,,,,,
+6,21,effective,0.47451,0.533333,0.524314,0.545152,,,,,
+6,21,and,0.527451,0.533333,0.551373,0.542727,,,,,
+6,21,mutually,0.55451,0.533333,0.611373,0.545152,,,,,
+6,21,beneficial,0.614118,0.53303,0.672941,0.545152,,,,,
+6,21,cooperation,0.675294,0.533636,0.746275,0.544848,,,,,
+6,21,between,0.750196,0.532727,0.799216,0.542424,,,,,
+6,22,the,0.218039,0.544242,0.237647,0.553333,,,,,
+6,22,people,0.239216,0.544242,0.278431,0.555758,,,,,
+6,22,of,0.280392,0.543939,0.29451,0.555758,,,,,
+6,22,Long,0.295294,0.544242,0.326667,0.556061,,,,,
+6,22,Beach,0.330196,0.544242,0.36549,0.553333,,,,,
+6,22,and,0.368235,0.544242,0.392157,0.552424,,,,,
+6,22,the,0.39451,0.543939,0.414118,0.55303,,,,,
+6,22,people,0.416078,0.543939,0.45451,0.555455,,,,,
+6,22,of,0.456471,0.543939,0.470588,0.555455,,,,,
+6,22,San,0.471765,0.544242,0.494902,0.55303,,,,,
+6,22,Pablo,0.498431,0.543636,0.532549,0.552727,,,,,
+6,22,de,0.53451,0.543636,0.549412,0.55303,,,,,
+6,22,Manta;,0.551373,0.543939,0.596863,0.553939,,,,,
+6,22,and,0.599216,0.543939,0.623529,0.552727,,,,,
+6,23,(2),0.279216,0.563939,0.298039,0.574545,,,,,
+6,23,to,0.307451,0.565152,0.320392,0.573333,,,,,
+6,23,promote,0.32902,0.565152,0.380392,0.575152,,,,,
+6,23,international,0.389412,0.563939,0.469412,0.57303,,,,,
+6,23,"goodwill,",0.478039,0.563939,0.533725,0.575758,,,,,
+6,23,"understanding,",0.544314,0.563636,0.637255,0.575455,,,,,
+6,23,and,0.646275,0.563636,0.671373,0.57303,,,,,
+6,23,expanded,0.679608,0.563636,0.738431,0.575455,,,,,
+6,23,business,0.747451,0.56303,0.8,0.572727,,,,,
+6,24,relations,0.218039,0.574545,0.271373,0.583939,,,,,
+6,24,between,0.27451,0.574545,0.323529,0.583939,,,,,
+6,24,the,0.327059,0.574848,0.346275,0.583636,,,,,
+6,24,two,0.34902,0.575455,0.372157,0.583636,,,,,
+6,24,cities,0.37451,0.574848,0.407451,0.583636,,,,,
+6,24,and,0.409412,0.574545,0.434118,0.583333,,,,,
+6,24,their,0.436078,0.574242,0.46549,0.583636,,,,,
+6,24,respective,0.468235,0.574848,0.528235,0.585758,,,,,
+6,24,nations,0.53098,0.574545,0.575686,0.583636,,,,,
+6,24,by,0.579216,0.574242,0.594902,0.585152,,,,,
+6,24,the,0.597647,0.574545,0.617255,0.583333,,,,,
+6,24,exchange,0.619608,0.574545,0.676078,0.585758,,,,,
+6,24,of,0.678431,0.574242,0.692549,0.585758,,,,,
+6,24,"people,",0.693333,0.574242,0.735294,0.585455,,,,,
+6,24,"ideas,",0.738824,0.573939,0.772549,0.583939,,,,,
+6,24,and,0.775686,0.573636,0.8,0.582727,,,,,
+6,25,information,0.218039,0.584848,0.290196,0.596364,,,,,
+6,25,in,0.300392,0.585152,0.313333,0.593939,,,,,
+6,25,a,0.322353,0.58697,0.33098,0.593636,,,,,
+6,25,unide,0.340392,0.585152,0.370196,0.593939,,,,,
+6,25,variety,0.378824,0.585152,0.422745,0.596061,,,,,
+6,25,of,0.43098,0.584848,0.446275,0.594848,,,,,
+6,25,"economic,",0.452157,0.585152,0.512157,0.594848,,,,,
+6,25,"social,",0.521569,0.584545,0.559608,0.594545,,,,,
+6,25,"cultural,",0.568627,0.584242,0.622353,0.594545,,,,,
+6,25,"municipal,",0.631765,0.584848,0.697255,0.596061,,,,,
+6,25,"environmental,",0.707059,0.584242,0.799608,0.594242,,,,,
+6,26,"professional,",0.217647,0.595455,0.293333,0.607273,,,,,
+6,26,"technical,",0.297255,0.594848,0.355294,0.605455,,,,,
+6,26,"youth,",0.357647,0.594848,0.396863,0.60697,,,,,
+6,26,and,0.399608,0.595152,0.423922,0.603939,,,,,
+6,26,other,0.42549,0.595455,0.458039,0.603939,,,,,
+6,26,endeavors;,0.46,0.595152,0.523922,0.605152,,,,,
+6,26,and,0.526275,0.594848,0.55098,0.603636,,,,,
+6,27,(3),0.279608,0.615152,0.298824,0.625455,,,,,
+6,27,to,0.307843,0.616364,0.320784,0.624848,,,,,
+6,27,foster,0.322353,0.614848,0.36,0.626667,,,,,
+6,27,and,0.363922,0.615152,0.38902,0.624242,,,,,
+6,27,encourage,0.392549,0.617273,0.455294,0.626667,,,,,
+6,27,"charitable,",0.459608,0.614545,0.522353,0.625455,,,,,
+6,27,"scientific,",0.527059,0.614545,0.585882,0.626667,,,,,
+6,27,trade,0.590588,0.614848,0.623137,0.624242,,,,,
+6,27,and,0.627059,0.614545,0.651765,0.623636,,,,,
+6,27,"commerce,",0.654902,0.61697,0.721176,0.625152,,,,,
+6,27,literary,0.72549,0.614545,0.772157,0.626667,,,,,
+6,27,and,0.775686,0.613939,0.800392,0.623333,,,,,
+6,28,educational,0.218039,0.625455,0.288627,0.634848,,,,,
+6,28,activities,0.290588,0.626061,0.346667,0.634545,,,,,
+6,28,between,0.348627,0.625455,0.398431,0.634545,,,,,
+6,28,the,0.400784,0.625455,0.420784,0.634242,,,,,
+6,28,two,0.423137,0.626364,0.446275,0.634242,,,,,
+6,28,cities;,0.448235,0.625455,0.484314,0.635152,,,,,
+6,29,This,0.263137,0.645455,0.291373,0.654848,,,,,
+6,29,Sister,0.29451,0.645758,0.330196,0.654848,,,,,
+6,29,City,0.333333,0.645455,0.361569,0.657273,,,,,
+6,29,Agreement,0.364314,0.645758,0.431373,0.655758,,,,,
+6,29,shall,0.434118,0.644848,0.463922,0.654242,,,,,
+6,29,be,0.466667,0.645152,0.481176,0.654545,,,,,
+6,29,officially,0.482745,0.645152,0.536471,0.65697,,,,,
+6,29,established,0.538824,0.645152,0.605098,0.654242,,,,,
+6,29,and,0.607059,0.645152,0.631765,0.653939,,,,,
+6,29,shall,0.634118,0.644848,0.664314,0.654242,,,,,
+6,29,become,0.667059,0.645152,0.711765,0.654242,,,,,
+6,29,effective,0.713725,0.644545,0.764314,0.654545,,,,,
+6,29,when,0.766667,0.644545,0.8,0.653939,,,,,
+6,30,this,0.218824,0.655758,0.242353,0.664545,,,,,
+6,30,document,0.246275,0.656061,0.306667,0.665152,,,,,
+6,30,has,0.31098,0.655758,0.332157,0.664545,,,,,
+6,30,been,0.336471,0.656061,0.364314,0.664545,,,,,
+6,30,duly,0.368235,0.655758,0.398039,0.666667,,,,,
+6,30,executed,0.401176,0.655758,0.454118,0.664848,,,,,
+6,30,by,0.458039,0.655455,0.473725,0.666364,,,,,
+6,30,the,0.478039,0.656061,0.497647,0.664242,,,,,
+6,30,Mayor,0.501569,0.655758,0.542745,0.666667,,,,,
+6,30,of,0.546275,0.655455,0.561176,0.66697,,,,,
+6,30,Long,0.563137,0.655758,0.59451,0.667576,,,,,
+6,30,"Beach,",0.599216,0.655455,0.638824,0.665758,,,,,
+6,30,"California,",0.643137,0.655455,0.70902,0.667273,,,,,
+6,30,"USA,",0.714118,0.655152,0.748627,0.665455,,,,,
+6,30,and,0.752549,0.654848,0.777647,0.663939,,,,,
+6,30,the,0.781176,0.654848,0.800784,0.663939,,,,,
+6,31,Mayor,0.218431,0.666364,0.260784,0.677879,,,,,
+6,31,of,0.262745,0.666061,0.276863,0.677879,,,,,
+6,31,San,0.277647,0.666061,0.301176,0.675152,,,,,
+6,31,Pablo,0.304706,0.666061,0.338824,0.675152,,,,,
+6,31,de,0.341176,0.666364,0.356471,0.674848,,,,,
+6,31,"Manta,",0.358431,0.666061,0.403529,0.676061,,,,,
+6,31,"Ecundor,",0.407059,0.665758,0.460392,0.676061,,,,,
+6,31,South,0.463529,0.665758,0.499608,0.674545,,,,,
+6,31,America.,0.502353,0.666364,0.557255,0.674848,,,,,
+6,32,STATE,0.276471,0.739394,0.301176,0.762424,,,,,
+6,32,OFFICE,0.280392,0.713636,0.327451,0.737879,,,,,
+6,33,Beverly,0.587451,0.73697,0.647843,0.750303,,,,,
+6,33,0,0.651765,0.736667,0.66549,0.747273,,,,,
+6,33,Neill,0.667843,0.736667,0.709804,0.74697,,,,,
+6,34,"Mayor,",0.542353,0.751818,0.6,0.764848,,,,,
+6,34,City,0.604314,0.752121,0.639608,0.764545,,,,,
+6,34,of,0.643137,0.751515,0.661961,0.764848,,,,,
+6,34,Long,0.663137,0.751515,0.702745,0.764545,,,,,
+6,34,Beach,0.706275,0.751212,0.753333,0.761818,,,,,
+6,35,"California,",0.582745,0.765758,0.667843,0.779091,,,,,
+6,35,USA,0.672941,0.766061,0.708235,0.776667,,,,,
+6,36,10.2aulus,0.490588,0.771818,0.711373,0.834545,,,,,
+6,37,Ing.,0.527059,0.825152,0.556471,0.838485,,,,,
+6,37,Jorge,0.559608,0.825455,0.601176,0.838485,,,,,
+6,37,O.,0.604706,0.825152,0.624314,0.835455,,,,,
+6,37,Zambrano,0.627059,0.825152,0.709412,0.835455,,,,,
+6,37,Cedeño,0.713725,0.825152,0.769804,0.835152,,,,,
+6,38,"Mayor,",0.505098,0.840303,0.562353,0.85303,,,,,
+6,38,City,0.566275,0.839697,0.601176,0.853333,,,,,
+6,38,of,0.604314,0.839697,0.623922,0.85303,,,,,
+6,38,San,0.624314,0.839697,0.653725,0.850303,,,,,
+6,38,Pablo,0.658039,0.839697,0.704706,0.85,,,,,
+6,38,de,0.707843,0.839394,0.726667,0.85,,,,,
+6,38,Manta,0.729412,0.839697,0.782745,0.849394,,,,,
+6,39,"Ecuador,",0.551765,0.854545,0.620392,0.866061,,,,,
+6,39,South,0.624314,0.854242,0.67098,0.864545,,,,,
+6,39,America,0.673725,0.854242,0.74,0.864545,,,,,
+6,40,Dated:,0.544706,0.883333,0.597255,0.893939,,,,,
+6,40,September,0.600392,0.883636,0.682353,0.896667,,,,,
+6,40,"19,",0.68549,0.883636,0.707451,0.895455,,,,,
+6,40,2000,0.710588,0.883333,0.747451,0.893333,,,,,
+7,2,SisterCities,0.169412,0.03303,0.409804,0.061818,,,,,
+7,3,Partnership,0.516078,0.027879,0.733333,0.060606,,,,,
+7,3,Agreement,0.747843,0.027879,0.957647,0.060606,,,,,
+7,4,INTERNATIONAL,0.170196,0.066667,0.408235,0.075758,,,,,
+7,5,Connect,0.169412,0.08697,0.236078,0.097879,,,,,
+7,5,globally.,0.240784,0.087273,0.301569,0.100303,,,,,
+7,5,Thrive,0.307059,0.08697,0.354902,0.097879,,,,,
+7,5,locally.,0.358824,0.087273,0.40902,0.100303,,,,,
+7,6,Toolkit,0.83098,0.072727,0.958431,0.098788,,,,,
+7,7,REAFFIRMATION,0.324706,0.165152,0.490588,0.178182,,,,,
+7,7,OF,0.493725,0.16697,0.522353,0.178485,,,,,
+7,7,SISTER,0.525098,0.16697,0.594118,0.179091,,,,,
+7,7,CITIES,0.598431,0.167273,0.663137,0.179091,,,,,
+7,7,DECLARATION,0.667059,0.166061,0.808235,0.178788,,,,,
+7,8,adopted,0.2,0.213939,0.257255,0.227273,,,,,
+7,8,by,0.261961,0.213333,0.280784,0.22697,,,,,
+7,9,THE,0.396078,0.214242,0.433333,0.225455,,,,,
+7,9,HONORABLE,0.438824,0.214848,0.551373,0.226061,,,,,
+7,9,RICHARD,0.556471,0.215152,0.638431,0.226364,,,,,
+7,9,M.,0.643137,0.215455,0.664314,0.226667,,,,,
+7,9,DALEY,0.670588,0.215152,0.731765,0.226061,,,,,
+7,10,MAYOR,0.472549,0.231212,0.541569,0.242121,,,,,
+7,10,OF,0.545882,0.231212,0.570588,0.242121,,,,,
+7,10,CHICAGO,0.575294,0.231212,0.658431,0.242727,,,,,
+7,11,and,0.199608,0.260909,0.226275,0.271515,,,,,
+7,12,THE,0.401961,0.261212,0.44,0.271818,,,,,
+7,12,HONORABLE,0.445098,0.261212,0.558431,0.272121,,,,,
+7,12,ZHANG,0.563137,0.261515,0.626667,0.272424,,,,,
+7,12,RONGMAO,0.631765,0.261212,0.725098,0.272424,,,,,
+7,13,MAYOR,0.463529,0.273636,0.532941,0.284545,,,,,
+7,13,OF,0.537255,0.273636,0.561569,0.284545,,,,,
+7,13,SHENYANG,0.566275,0.273636,0.666275,0.285152,,,,,
+7,14,ON,0.551765,0.298182,0.578824,0.31,,,,,
+7,15,JUNE,0.500392,0.323636,0.558824,0.336061,,,,,
+7,15,"5,",0.563137,0.323939,0.578431,0.338485,,,,,
+7,15,1995,0.58549,0.323939,0.628627,0.336667,,,,,
+7,16,On,0.255686,0.363939,0.278824,0.374848,,,,,
+7,16,this,0.283529,0.363939,0.311765,0.374242,,,,,
+7,16,the,0.315686,0.363939,0.339608,0.374545,,,,,
+7,16,tenth,0.343922,0.364545,0.381569,0.374848,,,,,
+7,16,anniversary,0.386667,0.364848,0.476078,0.377879,,,,,
+7,16,of,0.480392,0.364848,0.499608,0.377576,,,,,
+7,16,the,0.501961,0.365152,0.525882,0.375152,,,,,
+7,16,signing,0.530588,0.364848,0.586275,0.378182,,,,,
+7,16,of,0.590588,0.364848,0.609412,0.377879,,,,,
+7,16,a,0.61098,0.367879,0.620784,0.375455,,,,,
+7,16,sister,0.625098,0.364848,0.664706,0.375758,,,,,
+7,16,city,0.668627,0.364848,0.69451,0.377576,,,,,
+7,16,"agreement,",0.699216,0.365455,0.780392,0.377576,,,,,
+7,16,in,0.787843,0.363333,0.803137,0.373636,,,,,
+7,16,order,0.808235,0.36303,0.849412,0.373636,,,,,
+7,16,to,0.853725,0.365152,0.867843,0.373636,,,,,
+7,16,further,0.870196,0.363636,0.927059,0.376364,,,,,
+7,17,the,0.198824,0.380909,0.222353,0.391212,,,,,
+7,17,traditional,0.228235,0.379697,0.307059,0.390909,,,,,
+7,17,links,0.311765,0.379394,0.348627,0.390909,,,,,
+7,17,of,0.353725,0.379697,0.372157,0.39303,,,,,
+7,17,friendship,0.372941,0.380303,0.452941,0.393939,,,,,
+7,17,between,0.458039,0.380606,0.52,0.391515,,,,,
+7,17,Chicago,0.526667,0.380303,0.588235,0.394242,,,,,
+7,17,and,0.593725,0.380909,0.623529,0.391818,,,,,
+7,17,Shenyang,0.628235,0.380606,0.702353,0.394242,,,,,
+7,17,and,0.707059,0.38,0.737647,0.391212,,,,,
+7,17,to,0.741961,0.381515,0.756078,0.390606,,,,,
+7,17,reaffirm,0.761569,0.379091,0.82549,0.392424,,,,,
+7,17,their,0.83098,0.378788,0.867451,0.389697,,,,,
+7,17,mutual,0.873725,0.38,0.926667,0.390606,,,,,
+7,18,aspiration,0.199608,0.396667,0.273333,0.409697,,,,,
+7,18,to,0.280392,0.397879,0.29451,0.406667,,,,,
+7,18,work,0.301961,0.396061,0.339216,0.406667,,,,,
+7,18,in,0.344706,0.396061,0.360392,0.406364,,,,,
+7,18,unison,0.367843,0.396667,0.419608,0.407273,,,,,
+7,18,for,0.423922,0.396667,0.451373,0.409697,,,,,
+7,18,the,0.456471,0.396667,0.481569,0.407576,,,,,
+7,18,benefit,0.488235,0.396667,0.541569,0.409697,,,,,
+7,18,of,0.547451,0.39697,0.566275,0.409697,,,,,
+7,18,their,0.57098,0.39697,0.606275,0.407879,,,,,
+7,18,cities,0.612549,0.396667,0.650588,0.407576,,,,,
+7,18,and,0.656863,0.396364,0.686667,0.407273,,,,,
+7,18,"nations,",0.691765,0.396364,0.750588,0.408485,,,,,
+7,18,the,0.759608,0.395455,0.783137,0.406061,,,,,
+7,18,Honorable,0.790196,0.394848,0.87098,0.405758,,,,,
+7,18,Mayor,0.876863,0.395455,0.927451,0.408485,,,,,
+7,19,Richard,0.199216,0.412121,0.260784,0.42303,,,,,
+7,19,M.,0.264314,0.412121,0.285098,0.422424,,,,,
+7,19,"Daley,",0.292941,0.411818,0.34,0.424848,,,,,
+7,19,Mayor,0.347059,0.411818,0.398039,0.424848,,,,,
+7,19,of,0.402745,0.412424,0.421961,0.425152,,,,,
+7,19,the,0.424706,0.412727,0.449412,0.42303,,,,,
+7,19,City,0.455686,0.412727,0.486275,0.425758,,,,,
+7,19,of,0.492157,0.412727,0.511373,0.425758,,,,,
+7,19,"Chicago,",0.514902,0.412424,0.580784,0.425758,,,,,
+7,19,and,0.588235,0.41303,0.617647,0.423636,,,,,
+7,19,the,0.622745,0.412727,0.646667,0.423636,,,,,
+7,19,Honorable,0.651373,0.412121,0.731765,0.423333,,,,,
+7,19,Zhang,0.736863,0.411818,0.786667,0.424545,,,,,
+7,19,"Rongmao,",0.792941,0.411515,0.868235,0.424242,,,,,
+7,19,Mayor,0.876078,0.411212,0.926667,0.424242,,,,,
+7,20,of,0.199216,0.428788,0.218039,0.441515,,,,,
+7,20,the,0.22,0.428788,0.243529,0.439091,,,,,
+7,20,City,0.248627,0.427879,0.278431,0.440909,,,,,
+7,20,of,0.282353,0.427879,0.300784,0.440909,,,,,
+7,20,"Shenyang,",0.302745,0.427576,0.380784,0.441212,,,,,
+7,20,on,0.386667,0.431212,0.406667,0.438485,,,,,
+7,20,this,0.411373,0.428485,0.439608,0.438788,,,,,
+7,20,fifth,0.440784,0.428788,0.477647,0.441515,,,,,
+7,20,day,0.481569,0.428788,0.509804,0.441818,,,,,
+7,20,of,0.513333,0.428788,0.532941,0.441515,,,,,
+7,20,June,0.534118,0.428788,0.573333,0.439091,,,,,
+7,20,"1995,",0.578431,0.428788,0.617647,0.441212,,,,,
+7,20,do,0.623137,0.428788,0.642745,0.439091,,,,,
+7,20,hereby,0.647059,0.428485,0.697647,0.441818,,,,,
+7,20,acknowledge,0.700784,0.427879,0.796471,0.440606,,,,,
+7,20,and,0.801176,0.42697,0.83098,0.437273,,,,,
+7,20,reaffirm,0.83451,0.42697,0.897255,0.439697,,,,,
+7,20,the,0.901569,0.427576,0.925098,0.437879,,,,,
+7,21,sister,0.199608,0.444242,0.239216,0.455152,,,,,
+7,21,cities,0.242745,0.443636,0.280392,0.454545,,,,,
+7,21,agreement,0.284314,0.445152,0.362353,0.45697,,,,,
+7,21,between,0.365882,0.443636,0.425882,0.454545,,,,,
+7,21,the,0.430588,0.444242,0.455294,0.454545,,,,,
+7,21,City,0.46,0.444242,0.490588,0.457273,,,,,
+7,21,of,0.49451,0.444545,0.513725,0.457576,,,,,
+7,21,Chicago,0.516078,0.444242,0.577647,0.457879,,,,,
+7,21,and,0.581569,0.444545,0.611765,0.455152,,,,,
+7,21,the,0.614902,0.444242,0.638431,0.455152,,,,,
+7,21,City,0.643137,0.443939,0.672157,0.457273,,,,,
+7,21,of,0.676471,0.444242,0.694902,0.457576,,,,,
+7,21,Shenyang.,0.696471,0.443939,0.774118,0.45697,,,,,
+7,22,The,0.256078,0.475152,0.285098,0.486364,,,,,
+7,22,City,0.292941,0.474242,0.322745,0.487879,,,,,
+7,22,of,0.329412,0.474848,0.348627,0.487879,,,,,
+7,22,Chicago,0.353725,0.474545,0.415686,0.489091,,,,,
+7,22,and,0.423529,0.475455,0.454118,0.486364,,,,,
+7,22,the,0.460392,0.475455,0.48549,0.486364,,,,,
+7,22,City,0.493725,0.475455,0.523922,0.488788,,,,,
+7,22,of,0.531765,0.475455,0.55098,0.488788,,,,,
+7,22,Shenyang,0.556078,0.475152,0.630588,0.489091,,,,,
+7,22,on,0.638039,0.478485,0.657647,0.486364,,,,,
+7,22,the,0.66549,0.475758,0.68902,0.486364,,,,,
+7,22,basis,0.696471,0.474848,0.734118,0.486061,,,,,
+7,22,of,0.740784,0.474545,0.76,0.487879,,,,,
+7,22,friendly,0.762353,0.473939,0.823529,0.487576,,,,,
+7,22,"cooperation,",0.831373,0.473939,0.921569,0.487273,,,,,
+7,23,equality,0.2,0.491515,0.258431,0.504545,,,,,
+7,23,and,0.263922,0.491212,0.294118,0.501515,,,,,
+7,23,mutual,0.299216,0.491212,0.353333,0.501212,,,,,
+7,23,benefit,0.358431,0.491212,0.412157,0.503939,,,,,
+7,23,will,0.417647,0.491515,0.446275,0.501818,,,,,
+7,23,continue,0.452157,0.491818,0.518431,0.502121,,,,,
+7,23,to,0.524706,0.493333,0.539608,0.502424,,,,,
+7,23,develop,0.545882,0.491818,0.602353,0.504848,,,,,
+7,23,a,0.608235,0.494545,0.618431,0.501818,,,,,
+7,23,sister,0.624314,0.491818,0.663922,0.502121,,,,,
+7,23,cities,0.66902,0.491212,0.706275,0.501818,,,,,
+7,23,relationship,0.711765,0.490303,0.800392,0.503333,,,,,
+7,23,to,0.806667,0.490909,0.821176,0.5,,,,,
+7,23,promote,0.825882,0.491212,0.889412,0.502424,,,,,
+7,23,and,0.895294,0.490909,0.924706,0.500606,,,,,
+7,24,broaden,0.199216,0.507273,0.26,0.518182,,,,,
+7,24,economic,0.264314,0.506364,0.335294,0.517576,,,,,
+7,24,cooperation,0.339608,0.507273,0.427059,0.519697,,,,,
+7,24,and,0.431373,0.507273,0.461961,0.517879,,,,,
+7,24,cultural,0.46549,0.50697,0.526667,0.518182,,,,,
+7,24,exchanges,0.530196,0.507273,0.607451,0.520606,,,,,
+7,24,between,0.611765,0.507273,0.670196,0.517879,,,,,
+7,24,the,0.675294,0.50697,0.698431,0.517879,,,,,
+7,24,two,0.702353,0.508485,0.728627,0.517576,,,,,
+7,24,cities.,0.732941,0.506061,0.773725,0.517273,,,,,
+7,25,The,0.255294,0.538788,0.284706,0.549394,,,,,
+7,25,two,0.289412,0.540303,0.315294,0.549091,,,,,
+7,25,cities,0.321176,0.538485,0.358824,0.549394,,,,,
+7,25,do,0.363137,0.538485,0.382353,0.549394,,,,,
+7,25,hereby,0.387843,0.538788,0.439216,0.552121,,,,,
+7,25,declare,0.443922,0.539394,0.499608,0.550303,,,,,
+7,25,their,0.50549,0.539091,0.541961,0.549697,,,,,
+7,25,interest,0.546667,0.539091,0.601961,0.55,,,,,
+7,25,in,0.607059,0.539394,0.621961,0.55,,,,,
+7,25,exploring,0.628235,0.539394,0.698431,0.552727,,,,,
+7,25,the,0.703529,0.538788,0.727059,0.549394,,,,,
+7,25,establishment,0.732157,0.537879,0.834118,0.549091,,,,,
+7,25,of,0.838431,0.537273,0.858039,0.550303,,,,,
+7,25,business,0.860784,0.537273,0.923922,0.548485,,,,,
+7,26,and,0.198824,0.555152,0.22902,0.566061,,,,,
+7,26,trade,0.232157,0.555152,0.270588,0.565455,,,,,
+7,26,relations,0.274118,0.554848,0.338824,0.564848,,,,,
+7,26,between,0.341961,0.554848,0.401961,0.565152,,,,,
+7,26,Chicago,0.407451,0.554545,0.469804,0.568182,,,,,
+7,26,and,0.47451,0.555152,0.505098,0.565455,,,,,
+7,26,Shenyang.,0.508235,0.555152,0.587059,0.568485,,,,,
+7,27,In,0.254118,0.586061,0.271765,0.59697,,,,,
+7,27,"addition,",0.277255,0.585758,0.341176,0.597576,,,,,
+7,27,exchanges,0.34902,0.586364,0.427843,0.599394,,,,,
+7,27,will,0.433725,0.586364,0.462353,0.596667,,,,,
+7,27,be,0.466667,0.586364,0.48549,0.59697,,,,,
+7,27,promoted,0.490196,0.586667,0.563137,0.599697,,,,,
+7,27,in,0.567843,0.586364,0.583137,0.596667,,,,,
+7,27,the,0.588627,0.586364,0.612941,0.59697,,,,,
+7,27,area,0.617647,0.589091,0.651765,0.59697,,,,,
+7,27,of,0.656471,0.586364,0.675686,0.599091,,,,,
+7,27,the,0.678431,0.586667,0.701569,0.596667,,,,,
+7,27,arts,0.706667,0.587273,0.735294,0.596364,,,,,
+7,27,such,0.74,0.585152,0.775686,0.595758,,,,,
+7,27,as,0.781176,0.587273,0.798431,0.595152,,,,,
+7,27,"exhibits,",0.803922,0.583939,0.86549,0.596667,,,,,
+7,27,"music,",0.872941,0.584848,0.920784,0.59697,,,,,
+7,28,dance,0.198431,0.602424,0.243137,0.612121,,,,,
+7,28,and,0.247059,0.601818,0.276863,0.611515,,,,,
+7,28,other,0.28,0.601515,0.320392,0.611515,,,,,
+7,28,cultural,0.323529,0.601212,0.382745,0.611515,,,,,
+7,28,activities.,0.385882,0.601515,0.454902,0.611818,,,,,
+7,29,In,0.254118,0.631818,0.27098,0.642727,,,,,
+7,29,"addition,",0.277255,0.631515,0.341176,0.644242,,,,,
+7,29,exchanges,0.349412,0.632121,0.427843,0.645152,,,,,
+7,29,will,0.433725,0.632121,0.462745,0.642727,,,,,
+7,29,be,0.467843,0.632727,0.485882,0.643333,,,,,
+7,29,promoted,0.489804,0.632424,0.563922,0.646061,,,,,
+7,29,in,0.56902,0.632424,0.585098,0.643333,,,,,
+7,29,education,0.591765,0.632727,0.663922,0.64303,,,,,
+7,29,and,0.66902,0.632121,0.698039,0.64303,,,,,
+7,29,the,0.703137,0.632121,0.727059,0.642727,,,,,
+7,29,establishment,0.732941,0.630606,0.83451,0.642424,,,,,
+7,29,of,0.839216,0.630303,0.858824,0.643636,,,,,
+7,29,contacts,0.861961,0.632121,0.922745,0.641515,,,,,
+7,30,within,0.198824,0.648182,0.24549,0.658788,,,,,
+7,30,educational,0.250196,0.647576,0.336471,0.658182,,,,,
+7,30,institutions,0.339608,0.647273,0.422745,0.658788,,,,,
+7,30,encouraged.,0.426667,0.648182,0.518824,0.661818,,,,,
+7,31,In,0.253725,0.679394,0.270588,0.69,,,,,
+7,31,"addition,",0.276078,0.678788,0.34,0.692121,,,,,
+7,31,we,0.347451,0.682121,0.367843,0.69,,,,,
+7,31,declare,0.372157,0.679394,0.427059,0.69,,,,,
+7,31,our,0.431765,0.68303,0.460392,0.691212,,,,,
+7,31,intention,0.464314,0.679697,0.532549,0.690303,,,,,
+7,31,to,0.537255,0.681515,0.551765,0.69,,,,,
+7,31,promote,0.555294,0.681212,0.618431,0.693333,,,,,
+7,31,exchanges,0.623922,0.680303,0.700784,0.69303,,,,,
+7,31,in,0.70549,0.68,0.720784,0.689697,,,,,
+7,31,such,0.725882,0.678788,0.761176,0.690303,,,,,
+7,31,fields,0.763922,0.678182,0.807059,0.691818,,,,,
+7,31,as,0.812549,0.681212,0.82902,0.688788,,,,,
+7,31,science,0.833725,0.678182,0.88902,0.688485,,,,,
+7,31,and,0.892941,0.678485,0.922745,0.688788,,,,,
+7,32,"technology,",0.198039,0.695758,0.281961,0.709091,,,,,
+7,32,"sports,",0.29098,0.696667,0.338039,0.708182,,,,,
+7,32,"health,",0.346275,0.695152,0.398039,0.708485,,,,,
+7,32,youth,0.40549,0.695758,0.450196,0.708788,,,,,
+7,32,and,0.457647,0.695455,0.48902,0.706667,,,,,
+7,32,any,0.494902,0.698788,0.523529,0.708788,,,,,
+7,32,areas,0.529804,0.698788,0.571373,0.706667,,,,,
+7,32,that,0.577647,0.696061,0.608235,0.706364,,,,,
+7,32,will,0.614902,0.696061,0.642353,0.706364,,,,,
+7,32,contribute,0.648627,0.695455,0.722745,0.706364,,,,,
+7,32,to,0.72902,0.696667,0.743529,0.705758,,,,,
+7,32,the,0.751373,0.695152,0.774902,0.705152,,,,,
+7,32,prosperity,0.780392,0.694242,0.85451,0.707879,,,,,
+7,32,and,0.861961,0.693636,0.891765,0.704545,,,,,
+7,32,the,0.897647,0.694545,0.920784,0.704545,,,,,
+7,33,further,0.194902,0.712121,0.252157,0.725152,,,,,
+7,33,development,0.254902,0.711515,0.347843,0.724545,,,,,
+7,33,of,0.35098,0.711515,0.369804,0.724545,,,,,
+7,33,friendship,0.368627,0.711515,0.448235,0.724545,,,,,
+7,33,between,0.452549,0.711818,0.513725,0.722727,,,,,
+7,33,the,0.518039,0.711818,0.542353,0.722424,,,,,
+7,33,people,0.544706,0.711818,0.594902,0.725152,,,,,
+7,33,of,0.598431,0.712121,0.616863,0.724545,,,,,
+7,33,our,0.618824,0.715455,0.646275,0.722727,,,,,
+7,33,two,0.64902,0.713333,0.675294,0.722424,,,,,
+7,33,cities.,0.679608,0.711818,0.72,0.722424,,,,,
+7,34,3h.5.,0.593725,0.750606,0.812157,0.813939,,,,,
+7,35,THE,0.197255,0.822727,0.231373,0.832121,,,,,
+7,35,HONORABLE,0.23451,0.821818,0.337255,0.831818,,,,,
+7,35,RICHARD,0.34,0.821515,0.414902,0.831515,,,,,
+7,35,M.,0.418039,0.822121,0.438431,0.831515,,,,,
+7,35,DALEY,0.444314,0.822121,0.501176,0.831515,,,,,
+7,36,THE,0.588627,0.821818,0.622353,0.831515,,,,,
+7,36,HONORABLE,0.62549,0.821818,0.727451,0.831515,,,,,
+7,36,ZHANG,0.730196,0.820606,0.788235,0.830606,,,,,
+7,36,RONGMAO,0.790588,0.819394,0.876471,0.829697,,,,,
+7,37,MAYOR,0.195686,0.83697,0.255686,0.846364,,,,,
+7,37,OF,0.259608,0.836364,0.283137,0.846364,,,,,
+7,37,CHICAGO,0.286275,0.835758,0.360392,0.845758,,,,,
+7,38,MAYOR,0.587451,0.836364,0.646667,0.845758,,,,,
+7,38,OF,0.650196,0.835455,0.673333,0.845758,,,,,
+7,38,SHENYANG,0.675686,0.835455,0.76549,0.845758,,,,,
diff --git a/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv b/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv
new file mode 100644
index 0000000000000000000000000000000000000000..6602870d9590d8574b49718fab7472d5f8aaf202
--- /dev/null
+++ b/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv
@@ -0,0 +1,923 @@
+page,text,left,top,width,height,line
+1,5-Point Networking Email,0.404314,0.050606,0.189804,0.012121,1
+1,"Steve Dalton, the author of The 2-Hour Job Search believes the perfect networking email is a ""5-Point E-mail"". The five",0.058824,0.086061,0.859608,0.012727,2
+1,points are as follows:,0.059216,0.10303,0.152941,0.012727,3
+1,1. 100 words or less,0.088627,0.136667,0.156078,0.010303,4
+1,2. No mention of jobs (in subject or body),0.088235,0.153333,0.31451,0.012727,5
+1,"3. Connection goes first (e.g., ND connection)",0.087843,0.170606,0.341569,0.01303,6
+1,4. Generalize your interest,0.087843,0.187879,0.205098,0.012424,7
+1,5. Maintain control of the follow up,0.088627,0.204545,0.27098,0.012727,8
+1,Here's an example of what a 5-Point email would look like:,0.059608,0.255455,0.42549,0.012727,9
+1,Subject: Notre Dame MBA Student Seeking Your Advice,0.117255,0.289394,0.414118,0.012424,10
+1,"Dear Mr. Jones,",0.118039,0.323939,0.112549,0.011515,11
+1,"My name is Brooke Franklin, and I'm a first-year Notre Dame MBA student who found your",0.118431,0.35697,0.661569,0.01303,12
+1,information in the ND alumni database. May I have 15 minutes of your time to ask you about,0.118039,0.374242,0.677255,0.012727,13
+1,your experience with IBM? I'm trying to learn more about marketing careers at technology,0.117255,0.391212,0.660784,0.01303,14
+1,companies and your insights would be very helpful.,0.117647,0.407879,0.373333,0.01303,15
+1,"I realize this may be a busy time for you, so if we're unable to connect this week, I'll try again",0.118039,0.442121,0.674902,0.012727,16
+1,next week to see whether that is more convenient.,0.118039,0.459091,0.370588,0.010303,17
+1,"Thank you for your time,",0.117255,0.492727,0.179216,0.012727,18
+1,Brooke,0.118431,0.51,0.050588,0.01,19
+1,The most important part of this email may be the follow-up; an email like this allows you to reach out again in a week if,0.058431,0.543333,0.872157,0.01303,20
+1,you haven't heard back without feeling like you're bothering the person at the other end. If you don't hear anything,0.058431,0.560606,0.843922,0.01303,21
+1,"after the second attempt, you can probably cross him/her off your list and move on to the next contact.",0.058824,0.577273,0.755686,0.01303,22
+2,36 Westmoreland Drive,0.705764,0.026796,0.209996,0.011403,1
+2,Newcastle upon Tyne,0.723499,0.04333,0.192664,0.013968,2
+2,NE1 8LT,0.836759,0.059863,0.079807,0.011117,3
+2,Mr Mark Wilson,0.083837,0.076112,0.138251,0.011403,4
+2,UK Health Trust,0.083837,0.09236,0.143087,0.011403,5
+2,18 Whitehall Square,0.084643,0.108609,0.179766,0.013968,6
+2,London,0.083837,0.125428,0.066102,0.011117,7
+2,SW1 9LT,0.083837,0.141391,0.083031,0.011403,8
+2,11th January 2015,0.755744,0.154789,0.161225,0.017389,9
+2,Dear Mr Wilson,0.083837,0.174173,0.137042,0.011403,10
+2,Re: Community Health Development Officer [HD/12/2014],0.083837,0.201539,0.544135,0.014253,11
+2,"I am writing to apply for the above post, as advertised on the Health UK recruitment site. I am",0.08424,0.228905,0.828295,0.014253,12
+2,a sociology graduate with a 2: 1from Newcastle University. I have relevant health awareness,0.083434,0.245439,0.822249,0.014253,13
+2,"experience, and I am looking for a position where I can employ my knowledge and skills in",0.083434,0.261973,0.802499,0.013968,14
+2,support of health and community development. I enclose my CV for your attention.,0.083434,0.277936,0.731963,0.014253,15
+2,I am eager to work for UK Health Trust because of your ground-breaking work within the field,0.08424,0.305302,0.825877,0.014253,16
+2,of community health. I became aware of the work of the Trust when carrying out my,0.083434,0.322121,0.744055,0.013968,17
+2,"dissertation, 'Generational Change in Local Health Awareness, where I researched health",0.083031,0.338084,0.798468,0.014253,18
+2,awareness of children and elderly people in a deprived location. I referred to a number of,0.083031,0.354618,0.792019,0.013968,19
+2,publications produced by UK Health Trust and was impressed by the innovative techniques,0.083837,0.371152,0.809351,0.013968,20
+2,your organisation uses to engage local community members in projects. The Community,0.083031,0.387685,0.788795,0.014253,21
+2,Health Development Officer position would further develop my existing abilities and my,0.08424,0.403934,0.771463,0.014253,22
+2,"understanding of community development, allowing me to contribute in a practical way to",0.083837,0.420468,0.789601,0.013968,23
+2,enhancing the health of disadvantaged people.,0.083434,0.436716,0.415961,0.013968,24
+2,The volunteer development aspect of the position particularly appeals to me. I have worked,0.083031,0.469213,0.811769,0.014538,25
+2,"in the voluntary sector, providing services tackling health inequalities and promoting healthy",0.083837,0.485747,0.814994,0.014253,26
+2,living in Newcastle. I promoted health awareness through one to one sessions and in large,0.083434,0.501995,0.805723,0.014253,27
+2,"groups and developed interpersonal skills, confidence and patience when engaging and",0.083031,0.518529,0.787183,0.014253,28
+2,"motivating participants. While raising the group's profile using social media, the local press",0.083434,0.534778,0.804917,0.013968,29
+2,"and at presentations to youth clubs, faith meetings and care homes I recognised the need to",0.083434,0.551596,0.820637,0.013968,30
+2,"change my delivery style to suit the audience. As a volunteer teacher in Ghana, I developed",0.083434,0.56756,0.8158,0.014253,31
+2,communication and team-building skills essential to your advertised role; liaising with,0.083434,0.584094,0.753325,0.013968,32
+2,colleagues and parents and a lively group of twenty-five 7-8 year olds to arrange a,0.083434,0.600627,0.731963,0.014253,33
+2,"community event. My retail experience, coupled with my extracurricular activities additionally",0.083434,0.617161,0.822249,0.013968,34
+2,"enhanced my ability to develop others, as I was responsible for inducting and training my",0.083434,0.633409,0.79081,0.014253,35
+2,peers.,0.083837,0.652509,0.05401,0.011117,36
+2,"In relation to the fundraising and budgeting aspect of the role, I have experience of raising",0.08424,0.68244,0.798065,0.014253,37
+2,"substantial amounts of money through several successful charity events, including a well -",0.083031,0.698404,0.802096,0.014538,38
+2,attended fashion show. I was also elected Treasurer of NU Sociology Society with,0.083434,0.715222,0.728335,0.014253,39
+2,responsibility for managing a budget of £3000.,0.083434,0.731471,0.411528,0.014538,40
+2,The necessity to travel to identify community issues only adds to the appeal of the position. I,0.083031,0.758837,0.82104,0.014253,41
+2,"enjoy driving, hold a full clean driving licence and I am very interested in relocating to London",0.083434,0.775086,0.828295,0.014538,42
+2,to work for UK Health Trust.,0.083031,0.791619,0.247481,0.011688,43
+2,Thank you for considering my application. I look forward to hearing from you.,0.083434,0.824401,0.68158,0.014253,44
+2,Yours sincerely,0.082628,0.857184,0.138251,0.014253,45
+2,Rachel Sullivan,0.083837,0.889966,0.137042,0.011403,46
+3,SisterCities,0.169804,0.033333,0.238431,0.028182,1
+3,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
+3,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
+3,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,4
+3,Toolkit,0.830588,0.07303,0.126667,0.025152,5
+3,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
+3,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
+3,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
+3,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
+3,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
+3,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
+3,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
+3,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
+3,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
+3,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
+3,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
+3,Friendship City,0.118039,0.372121,0.127059,0.013939,17
+3,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
+3,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
+3,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
+3,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
+3,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
+3,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
+3,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
+3,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
+3,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
+3,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
+3,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
+3,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
+3,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
+3,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
+3,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
+3,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
+3,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
+3,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
+3,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
+3,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
+3,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
+3,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
+3,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
+3,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
+3,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
+3,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
+3,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
+3,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
+4,SisterCities,0.169804,0.033333,0.238824,0.028182,1
+4,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
+4,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
+4,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
+4,Toolkit,0.83098,0.072727,0.127059,0.025455,5
+4,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
+4,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
+4,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
+4,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
+4,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
+4,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
+4,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
+4,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
+4,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
+4,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
+4,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
+4,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
+4,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
+4,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
+4,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
+4,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
+4,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
+4,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
+4,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
+4,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
+4,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
+4,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
+4,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
+4,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
+4,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
+4,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
+4,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
+4,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
+4,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
+4,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
+4,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
+4,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
+4,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
+4,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
+4,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
+4,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
+4,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
+4,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
+4,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
+4,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,45
+4,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,46
+5,SisterCities,0.169804,0.033333,0.239216,0.028182,1
+5,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,2
+5,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,3
+5,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
+5,Toolkit,0.83098,0.07303,0.126667,0.025152,5
+5,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,6
+5,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,7
+5,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,8
+5,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,9
+5,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,10
+5,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,11
+5,and cooperation.,0.176471,0.25697,0.13451,0.013333,12
+5,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,13
+5,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,14
+5,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,15
+5,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,16
+5,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,17
+5,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,18
+5,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,19
+5,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,20
+5,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,21
+5,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,22
+5,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,23
+5,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,24
+5,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,25
+5,for their records.,0.176078,0.550606,0.131373,0.010606,26
+5,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,27
+5,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,28
+5,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,29
+5,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,30
+5,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,31
+5,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,32
+5,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,33
+5,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,34
+5,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,35
+5,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,36
+5,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,37
+5,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,38
+5,347-8630.,0.117647,0.799394,0.080392,0.010303,39
+6,SisterCities,0.169412,0.033333,0.239608,0.028485,1
+6,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,2
+6,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
+6,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
+6,Toolkit,0.830588,0.072727,0.127843,0.025758,5
+6,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
+6,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
+6,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
+6,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
+6,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
+6,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
+6,AND,0.487843,0.452727,0.048235,0.011212,12
+6,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
+6,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
+6,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
+6,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
+6,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
+6,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
+6,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
+6,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
+6,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
+6,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
+6,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
+6,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
+6,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
+6,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
+6,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
+6,A,0.344314,0.768485,0.084706,0.030303,28
+6,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
+6,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
+6,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,31
+6,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,32
+6,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
+7,SisterCities,0.169412,0.033333,0.239608,0.028485,1
+7,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
+7,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
+7,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
+7,Toolkit,0.83098,0.072727,0.127059,0.025758,5
+7,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
+7,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
+7,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
+7,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
+7,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
+7,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
+7,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
+7,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
+7,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
+7,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
+7,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
+7,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
+7,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
+7,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
+7,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
+7,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
+7,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
+7,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
+7,the cities;,0.22902,0.624545,0.076471,0.012424,24
+7,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
+7,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
+7,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
+7,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
+7,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
+7,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
+7,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
+7,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,32
+7,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,33
+7,Mayor,0.311373,0.894848,0.053333,0.012727,34
+7,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,35
+7,New York City,0.287843,0.909091,0.121176,0.013333,36
+7,London,0.701961,0.909091,0.061569,0.010606,37
+8,SisterCities,0.169412,0.03303,0.24,0.028182,1
+8,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,2
+8,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
+8,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
+8,Toolkit,0.83098,0.072727,0.127451,0.025758,5
+8,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
+8,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
+8,California,0.551373,0.257273,0.136471,0.033333,8
+8,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
+8,between the,0.464706,0.352727,0.084314,0.009697,10
+8,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
+8,"California, USA",0.4,0.397576,0.21098,0.016061,12
+8,and the,0.48,0.415152,0.053333,0.009091,13
+8,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
+8,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
+8,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
+8,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
+8,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
+8,purposes:,0.216863,0.516061,0.058039,0.009394,19
+8,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
+8,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
+8,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
+8,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
+8,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
+8,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
+8,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
+8,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
+8,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
+8,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
+8,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
+8,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
+8,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
+8,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
+8,"California, USA",0.582745,0.765758,0.125098,0.01303,34
+8,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
+8,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
+8,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
+8,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
+8,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
+9,SisterCities,0.169412,0.03303,0.24,0.028485,1
+9,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
+9,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
+9,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
+9,Toolkit,0.83098,0.072727,0.127451,0.025758,5
+9,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
+9,adopted by,0.2,0.213333,0.080392,0.013636,7
+9,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
+9,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
+9,and,0.199608,0.260909,0.026275,0.010606,10
+9,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
+9,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
+9,ON,0.551765,0.298182,0.026667,0.011515,13
+9,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
+9,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
+9,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
+9,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
+9,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
+9,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
+9,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
+9,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
+9,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
+9,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
+9,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
+9,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
+9,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
+9,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
+9,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
+9,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
+9,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
+9,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
+9,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
+9,3h.5.,0.593725,0.750606,0.218039,0.06303,33
+9,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,34
+9,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,35
+9,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,36
+9,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,37
+10,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308796,0.008545,1
+10,agcas,0.726169,0.191722,0.053368,0.011749,2
+10,Example of a skills-based CV,0.3894,0.205874,0.224144,0.011482,3
+10,ASHLEY GILL,0.459698,0.246195,0.082812,0.008278,4
+10,3 Lappage Court,0.2212,0.259012,0.080972,0.008545,5
+10,Telephone: 01882 652349,0.592565,0.259012,0.129555,0.008278,6
+10,"Tyler Green, Bucks.",0.220464,0.269159,0.092381,0.008278,7
+10,Mobile: 07717 121824,0.593669,0.269159,0.112992,0.006676,8
+10,HP8 4JD,0.2212,0.279306,0.040486,0.006409,9
+10,Email: ashleygill2023@gotmail.com,0.594038,0.279039,0.178874,0.008545,10
+10,Personal Details,0.221568,0.299332,0.095326,0.007744,11
+10,Summary,0.220832,0.321495,0.048215,0.008278,12
+10,Business studies with Spanish undergraduate.,0.273463,0.340988,0.229297,0.008812,13
+10,Ability to speak French and Spanish.,0.272727,0.351135,0.179242,0.008545,14
+10,Extensive business experience including an internship with Top Choice Holidays.,0.273095,0.361015,0.398233,0.008812,15
+10,Education And Qualifications,0.2212,0.381041,0.144277,0.008278,16
+10,2008 present,0.220832,0.401602,0.074715,0.008011,17
+10,Buckinghamshire Edge University,0.386824,0.401068,0.167096,0.008545,18
+10,BA International Business Studies with Spanish (expected 2:1),0.386824,0.410681,0.308796,0.008812,19
+10,Relate your degree to,0.230033,0.420027,0.100847,0.008278,20
+10,Study semester at The University of Valloid (Spain).,0.399338,0.420828,0.252852,0.008812,21
+10,the job by listing your,0.229665,0.429105,0.101583,0.008278,22
+10,Six-month work placement in Madrid.,0.399338,0.431242,0.188811,0.008545,23
+10,relevant modules/,0.230033,0.438718,0.085388,0.007744,24
+10,Relevant modules included: Business Planning; Sales Promotion and,0.399338,0.441389,0.338241,0.008545,25
+10,dissertation.,0.230033,0.448064,0.057784,0.006676,26
+10,Marketing; and Business Operations Management.,0.398969,0.451268,0.25322,0.008812,27
+10,2000 2007,0.2212,0.467824,0.061833,0.006409,28
+10,Freebridge School,0.386824,0.46729,0.087965,0.008545,29
+10,"A-Levels: Business Studies (B), French (C)",0.386088,0.476903,0.200221,0.008812,30
+10,"8 GCSEs including Maths, English, Spanish and French",0.386824,0.487583,0.266838,0.008545,31
+10,Work History,0.220832,0.509212,0.065513,0.008278,32
+10,2008 2011,0.220832,0.529506,0.061833,0.006409,33
+10,Buckinghamshire Edge University Librarian/tour guide,0.386824,0.528972,0.277144,0.008812,34
+10,General administrative and customer service roles.,0.399338,0.539119,0.25138,0.006676,35
+10,Briefly list,0.707766,0.536716,0.045639,0.008011,36
+10,your relevant,0.70703,0.546061,0.061465,0.008011,37
+10,2011 (Feb-Aug),0.2212,0.55514,0.078027,0.008812,38
+10,Audigest S.A. (Madrid) - Audit Assistant,0.386456,0.554873,0.199485,0.009079,39
+10,duties.,0.707398,0.555674,0.030916,0.006409,40
+10,Six months' work experience in an international bank.,0.399338,0.565287,0.267575,0.008545,41
+10,Liaising with colleagues and clients in English and Spanish.,0.399338,0.575434,0.292602,0.008545,42
+10,2010 (June-Dec),0.220832,0.591188,0.082444,0.008278,43
+10,Finsbury's supermarket (Hazelbridge) — Supervisor,0.386824,0.591188,0.250644,0.008812,44
+10,Managing a small team.,0.398969,0.601602,0.121089,0.008545,45
+10,Customer service in a busy competitive environment.,0.398969,0.611215,0.264262,0.008545,46
+10,2010 (Jan-Aug),0.2212,0.627236,0.077291,0.008812,47
+10,Top Choice Holidays and Flights Ltd (Low Wycombe),0.386088,0.627503,0.257637,0.008812,48
+10,Financial Assistant/Supervisor,0.386824,0.637383,0.15127,0.008812,49
+10,Working in a range of teams to manage complex financial processes.,0.398969,0.64753,0.341921,0.008812,50
+10,2007 (Jul-Aug),0.220832,0.663284,0.074347,0.008812,51
+10,Dogs Protection League - General Assistant,0.386824,0.663818,0.216783,0.008812,52
+10,Dealing with enquiries and selling packages to a range of clients.,0.399706,0.673431,0.321678,0.009079,53
+10,2006 (Jan-Dec),0.220832,0.689453,0.076187,0.009079,54
+10,McHenry's Restaurant (Low Wycombe) - Supervisor,0.386456,0.68972,0.256533,0.009079,55
+10,Voluntary Experience,0.220464,0.708411,0.106367,0.008545,56
+10,2007/2011,0.220832,0.728438,0.055208,0.008011,57
+10,Teaching English in Mexico/Spain,0.386088,0.727904,0.167832,0.009079,58
+10,Interests,0.2212,0.748465,0.043062,0.006676,59
+10,Active member of University Business Club — Winner of the 'Bucks Best Business Pitch' award in 2010 Enterprise,0.220464,0.768224,0.556864,0.009079,60
+10,"week, judged by Michael Eavis.",0.220464,0.778104,0.15311,0.008812,61
+11,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308428,0.008545,1
+11,Make sure you carefully assess,0.468531,0.23498,0.142068,0.008011,2
+11,Skills And Achievements,0.220832,0.245394,0.121457,0.006676,3
+11,the job advert/job description,0.468163,0.244326,0.139124,0.008278,4
+11,and address all the skills they,0.468531,0.253672,0.13618,0.008278,5
+11,Effective communication,0.2212,0.265421,0.123298,0.006676,6
+11,require.,0.468531,0.263017,0.034965,0.008011,7
+11,"Able to communicate effectively with a wide range of clients and colleagues, by showing interest, carefully",0.233714,0.275567,0.530364,0.008545,8
+11,"listening to needs and appropriately adjusting my message, as demonstrated during my time at Finsbury's",0.23445,0.285447,0.528892,0.008812,9
+11,Supermarket.,0.234082,0.295861,0.066618,0.008278,10
+11,Strong presentation skills and confidence demonstrated by experience of delivering presentations in different,0.23445,0.305474,0.543614,0.008812,11
+11,languages to groups of five to fifty.,0.234082,0.315621,0.172617,0.008812,12
+11,Customer service,0.220832,0.335915,0.085388,0.006676,13
+11,Ability to quickly build rapport with customers and calmly deal with any problems as shown during my retail,0.233714,0.345527,0.541038,0.008812,14
+11,experience in high pressure environments.,0.234082,0.355941,0.210526,0.008278,15
+11,"Capacity to maintain professional relationships through email and other written correspondence, for example,",0.234082,0.365554,0.548767,0.008812,16
+11,"at Audigest in Madrid, where I built longstanding business relationships with customers and colleagues across",0.233714,0.375701,0.549871,0.008812,17
+11,the globe.,0.233714,0.385848,0.049687,0.008278,18
+11,Teamwork,0.220464,0.406142,0.052632,0.006409,19
+11,"At Top Choice Holidays demonstrated excellent teamwork skills in a busy financial environment, such as an",0.233346,0.415754,0.532573,0.008812,20
+11,"ability to listen to clients and managers, perform my role to a high level and support colleagues, resulting in",0.234082,0.425634,0.535885,0.008812,21
+11,promotion.,0.234082,0.436048,0.05484,0.008545,22
+11,Administration,0.220464,0.456075,0.075083,0.006409,23
+11,Prove you have each of the,0.639676,0.453672,0.123666,0.008278,24
+11,"Excellent ability to plan ahead and manage time effectively, for example,",0.23445,0.465688,0.360692,0.008812,25
+11,skills required by outlining,0.63894,0.463017,0.12293,0.008278,26
+11,managing complex roles during my internship at Top Choice Holidays.,0.23445,0.476101,0.346338,0.008545,27
+11,where you performed them,0.63894,0.472363,0.128082,0.008278,28
+11,Gathered data from a wide range of sources during my dissertation,0.234082,0.485714,0.334928,0.008812,29
+11,and how you performed,0.639308,0.481709,0.111888,0.008278,30
+11,them well.,0.63894,0.491055,0.048951,0.006409,31
+11,"whilst balancing my other studies and two jobs, resulting in a 73% grade.",0.233346,0.495861,0.365109,0.008812,32
+11,Experience of travellers' needs,0.2212,0.515888,0.150534,0.008545,33
+11,Recent travel consultancy experience gives me an in-depth understanding of the expectations of holiday,0.23445,0.525768,0.518955,0.008812,34
+11,customers and the competitive nature of the industry.,0.234082,0.535915,0.269047,0.008812,35
+11,International travel experience and language ability give me an empathy with travellers and a passion for,0.234082,0.545794,0.524107,0.008812,36
+11,helping them find a unique holiday experience.,0.234082,0.555941,0.23445,0.008812,37
+11,Initiative,0.2212,0.576235,0.044166,0.006676,38
+11,Self-funding an evening course in bookkeeping during my first accountancy role demonstrated my ability to,0.234082,0.585848,0.535149,0.008812,39
+11,plan ahead and take control of my career.,0.23445,0.595995,0.205006,0.008545,40
+11,Successful study and work in Spain and Mexico show that I can creatively develop my skills and experience and,0.234082,0.605874,0.551711,0.008545,41
+11,adapt to new and different environments.,0.234082,0.616288,0.208686,0.008278,42
+11,Sales knowledge,0.220464,0.636315,0.083916,0.008011,43
+11,Wide experience of financial roles gives me an awareness of the tight monetary pressures which drive UK,0.234082,0.645928,0.525212,0.009346,44
+11,service industries.,0.234082,0.656609,0.088333,0.006943,45
+11,Raised sales at The Dogs Protection League by 12% by up selling add-on packages to new and existing,0.23445,0.665955,0.505705,0.009079,46
+11,customers.,0.234082,0.67717,0.054472,0.006142,47
+11,Language ability,0.2212,0.696395,0.082444,0.008812,48
+11,"Spanish fluency obtained working overseas, French semi-fluent.",0.233714,0.706008,0.323151,0.009079,49
+11,Referees,0.2212,0.726569,0.041958,0.006676,50
+11,Include all your referee details including their email and,0.351859,0.722029,0.259109,0.008545,51
+11,phone number (but ask for their permission first).,0.352227,0.731108,0.230401,0.008545,52
+11,"Professional: Mr. Jose Andreas, Management Accountant, Audigest, Avenida de Concha Espina 2, Madrid, ES-",0.2212,0.746328,0.537725,0.008812,53
+11,"28036, +34 91 398 5476, j.andreas@audigest.es",0.2212,0.756475,0.238498,0.008278,54
+11,"Academic: Dr. Jane Luffle, Personal Tutor, Buckinghamshire Edge University, Due Road, Low Wycombe, Bucks,",0.220464,0.776502,0.536621,0.008812,55
+11,"HD15 3DL, 01628 435 6784, j.luffle@bedge.ac.uk",0.2212,0.786382,0.244755,0.008545,56
+12,5-Point Networking Email,0.404314,0.050606,0.189804,0.012121,1
+12,"Steve Dalton, the author of The 2-Hour Job Search believes the perfect networking email is a ""5-Point E-mail"". The five",0.058824,0.086061,0.859608,0.012727,2
+12,points are as follows:,0.059216,0.10303,0.152941,0.012727,3
+12,1. 100 words or less,0.088627,0.136667,0.156078,0.010303,4
+12,2. No mention of jobs (in subject or body),0.088235,0.153333,0.31451,0.012727,5
+12,"3. Connection goes first (e.g., ND connection)",0.087843,0.170606,0.341569,0.01303,6
+12,4. Generalize your interest,0.087843,0.187879,0.205098,0.012424,7
+12,5. Maintain control of the follow up,0.088627,0.204545,0.27098,0.012727,8
+12,Here's an example of what a 5-Point email would look like:,0.059608,0.255455,0.42549,0.012727,9
+12,Subject: Notre Dame MBA Student Seeking Your Advice,0.117255,0.289394,0.414118,0.012424,10
+12,"Dear Mr. Jones,",0.118039,0.323939,0.112549,0.011515,11
+12,"My name is Brooke Franklin, and I'm a first-year Notre Dame MBA student who found your",0.118431,0.35697,0.661569,0.01303,12
+12,information in the ND alumni database. May I have 15 minutes of your time to ask you about,0.118039,0.374242,0.677255,0.012727,13
+12,your experience with IBM? I'm trying to learn more about marketing careers at technology,0.117255,0.391212,0.660784,0.01303,14
+12,companies and your insights would be very helpful.,0.117647,0.407879,0.373333,0.01303,15
+12,"I realize this may be a busy time for you, so if we're unable to connect this week, I'll try again",0.118039,0.442121,0.674902,0.012727,16
+12,next week to see whether that is more convenient.,0.118039,0.459091,0.370588,0.010303,17
+12,"Thank you for your time,",0.117255,0.492727,0.179216,0.012727,18
+12,Brooke,0.118431,0.51,0.050588,0.01,19
+12,The most important part of this email may be the follow-up; an email like this allows you to reach out again in a week if,0.058431,0.543333,0.872157,0.01303,20
+12,you haven't heard back without feeling like you're bothering the person at the other end. If you don't hear anything,0.058431,0.560606,0.843922,0.01303,21
+12,"after the second attempt, you can probably cross him/her off your list and move on to the next contact.",0.058824,0.577273,0.755686,0.01303,22
+13,36 Westmoreland Drive,0.705764,0.026796,0.209996,0.011403,1
+13,Newcastle upon Tyne,0.723499,0.04333,0.192664,0.013968,2
+13,NE1 8LT,0.836759,0.059863,0.079807,0.011117,3
+13,Mr Mark Wilson,0.083837,0.076112,0.138251,0.011403,4
+13,UK Health Trust,0.083837,0.09236,0.143087,0.011403,5
+13,18 Whitehall Square,0.084643,0.108609,0.179766,0.013968,6
+13,London,0.083837,0.125428,0.066102,0.011117,7
+13,SW1 9LT,0.083837,0.141391,0.083031,0.011403,8
+13,11th January 2015,0.755744,0.154789,0.161225,0.017389,9
+13,Dear Mr Wilson,0.083837,0.174173,0.137042,0.011403,10
+13,Re: Community Health Development Officer [HD/12/2014],0.083837,0.201539,0.544135,0.014253,11
+13,"I am writing to apply for the above post, as advertised on the Health UK recruitment site. I am",0.08424,0.228905,0.828295,0.014253,12
+13,a sociology graduate with a 2: 1from Newcastle University. I have relevant health awareness,0.083434,0.245439,0.822249,0.014253,13
+13,"experience, and I am looking for a position where I can employ my knowledge and skills in",0.083434,0.261973,0.802499,0.013968,14
+13,support of health and community development. I enclose my CV for your attention.,0.083434,0.277936,0.731963,0.014253,15
+13,I am eager to work for UK Health Trust because of your ground-breaking work within the field,0.08424,0.305302,0.825877,0.014253,16
+13,of community health. I became aware of the work of the Trust when carrying out my,0.083434,0.322121,0.744055,0.013968,17
+13,"dissertation, 'Generational Change in Local Health Awareness, where I researched health",0.083031,0.338084,0.798468,0.014253,18
+13,awareness of children and elderly people in a deprived location. I referred to a number of,0.083031,0.354618,0.792019,0.013968,19
+13,publications produced by UK Health Trust and was impressed by the innovative techniques,0.083837,0.371152,0.809351,0.013968,20
+13,your organisation uses to engage local community members in projects. The Community,0.083031,0.387685,0.788795,0.014253,21
+13,Health Development Officer position would further develop my existing abilities and my,0.08424,0.403934,0.771463,0.014253,22
+13,"understanding of community development, allowing me to contribute in a practical way to",0.083837,0.420468,0.789601,0.013968,23
+13,enhancing the health of disadvantaged people.,0.083434,0.436716,0.415961,0.013968,24
+13,The volunteer development aspect of the position particularly appeals to me. I have worked,0.083031,0.469213,0.811769,0.014538,25
+13,"in the voluntary sector, providing services tackling health inequalities and promoting healthy",0.083837,0.485747,0.814994,0.014253,26
+13,living in Newcastle. I promoted health awareness through one to one sessions and in large,0.083434,0.501995,0.805723,0.014253,27
+13,"groups and developed interpersonal skills, confidence and patience when engaging and",0.083031,0.518529,0.787183,0.014253,28
+13,"motivating participants. While raising the group's profile using social media, the local press",0.083434,0.534778,0.804917,0.013968,29
+13,"and at presentations to youth clubs, faith meetings and care homes I recognised the need to",0.083434,0.551596,0.820637,0.013968,30
+13,"change my delivery style to suit the audience. As a volunteer teacher in Ghana, I developed",0.083434,0.56756,0.8158,0.014253,31
+13,communication and team-building skills essential to your advertised role; liaising with,0.083434,0.584094,0.753325,0.013968,32
+13,colleagues and parents and a lively group of twenty-five 7-8 year olds to arrange a,0.083434,0.600627,0.731963,0.014253,33
+13,"community event. My retail experience, coupled with my extracurricular activities additionally",0.083434,0.617161,0.822249,0.013968,34
+13,"enhanced my ability to develop others, as I was responsible for inducting and training my",0.083434,0.633409,0.79081,0.014253,35
+13,peers.,0.083837,0.652509,0.05401,0.011117,36
+13,"In relation to the fundraising and budgeting aspect of the role, I have experience of raising",0.08424,0.68244,0.798065,0.014253,37
+13,"substantial amounts of money through several successful charity events, including a well -",0.083031,0.698404,0.802096,0.014538,38
+13,attended fashion show. I was also elected Treasurer of NU Sociology Society with,0.083434,0.715222,0.728335,0.014253,39
+13,responsibility for managing a budget of £3000.,0.083434,0.731471,0.411528,0.014538,40
+13,The necessity to travel to identify community issues only adds to the appeal of the position. I,0.083031,0.758837,0.82104,0.014253,41
+13,"enjoy driving, hold a full clean driving licence and I am very interested in relocating to London",0.083434,0.775086,0.828295,0.014538,42
+13,to work for UK Health Trust.,0.083031,0.791619,0.247481,0.011688,43
+13,Thank you for considering my application. I look forward to hearing from you.,0.083434,0.824401,0.68158,0.014253,44
+13,Yours sincerely,0.082628,0.857184,0.138251,0.014253,45
+13,Rachel Sullivan,0.083837,0.889966,0.137042,0.011403,46
+14,SisterCities,0.169804,0.033333,0.238431,0.028182,1
+14,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
+14,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
+14,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,4
+14,Toolkit,0.830588,0.07303,0.126667,0.025152,5
+14,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
+14,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
+14,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
+14,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
+14,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
+14,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
+14,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
+14,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
+14,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
+14,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
+14,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
+14,Friendship City,0.118039,0.372121,0.127059,0.013939,17
+14,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
+14,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
+14,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
+14,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
+14,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
+14,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
+14,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
+14,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
+14,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
+14,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
+14,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
+14,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
+14,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
+14,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
+14,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
+14,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
+14,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
+14,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
+14,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
+14,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
+14,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
+14,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
+14,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
+14,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
+14,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
+14,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
+14,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
+14,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
+15,SisterCities,0.169804,0.033333,0.238824,0.028182,1
+15,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
+15,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
+15,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
+15,Toolkit,0.83098,0.072727,0.127059,0.025455,5
+15,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
+15,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
+15,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
+15,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
+15,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
+15,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
+15,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
+15,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
+15,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
+15,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
+15,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
+15,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
+15,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
+15,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
+15,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
+15,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
+15,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
+15,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
+15,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
+15,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
+15,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
+15,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
+15,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
+15,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
+15,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
+15,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
+15,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
+15,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
+15,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
+15,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
+15,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
+15,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
+15,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
+15,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
+15,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
+15,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
+15,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
+15,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
+15,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
+15,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,45
+15,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,46
+16,SisterCities,0.169804,0.033333,0.239216,0.028182,1
+16,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,2
+16,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,3
+16,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
+16,Toolkit,0.83098,0.07303,0.126667,0.025152,5
+16,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,6
+16,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,7
+16,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,8
+16,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,9
+16,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,10
+16,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,11
+16,and cooperation.,0.176471,0.25697,0.13451,0.013333,12
+16,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,13
+16,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,14
+16,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,15
+16,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,16
+16,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,17
+16,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,18
+16,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,19
+16,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,20
+16,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,21
+16,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,22
+16,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,23
+16,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,24
+16,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,25
+16,for their records.,0.176078,0.550606,0.131373,0.010606,26
+16,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,27
+16,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,28
+16,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,29
+16,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,30
+16,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,31
+16,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,32
+16,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,33
+16,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,34
+16,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,35
+16,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,36
+16,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,37
+16,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,38
+16,347-8630.,0.117647,0.799394,0.080392,0.010303,39
+17,SisterCities,0.169412,0.033333,0.239608,0.028485,1
+17,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,2
+17,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
+17,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
+17,Toolkit,0.830588,0.072727,0.127843,0.025758,5
+17,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
+17,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
+17,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
+17,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
+17,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
+17,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
+17,AND,0.487843,0.452727,0.048235,0.011212,12
+17,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
+17,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
+17,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
+17,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
+17,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
+17,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
+17,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
+17,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
+17,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
+17,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
+17,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
+17,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
+17,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
+17,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
+17,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
+17,A,0.344314,0.768485,0.084706,0.030303,28
+17,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
+17,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
+17,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,31
+17,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,32
+17,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
+18,SisterCities,0.169412,0.033333,0.239608,0.028485,1
+18,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
+18,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
+18,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
+18,Toolkit,0.83098,0.072727,0.127059,0.025758,5
+18,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
+18,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
+18,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
+18,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
+18,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
+18,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
+18,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
+18,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
+18,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
+18,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
+18,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
+18,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
+18,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
+18,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
+18,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
+18,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
+18,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
+18,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
+18,the cities;,0.22902,0.624545,0.076471,0.012424,24
+18,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
+18,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
+18,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
+18,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
+18,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
+18,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
+18,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
+18,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,32
+18,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,33
+18,Mayor,0.311373,0.894848,0.053333,0.012727,34
+18,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,35
+18,New York City,0.287843,0.909091,0.121176,0.013333,36
+18,London,0.701961,0.909091,0.061569,0.010606,37
+19,SisterCities,0.169412,0.03303,0.24,0.028182,1
+19,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,2
+19,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
+19,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
+19,Toolkit,0.83098,0.072727,0.127451,0.025758,5
+19,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
+19,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
+19,California,0.551373,0.257273,0.136471,0.033333,8
+19,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
+19,between the,0.464706,0.352727,0.084314,0.009697,10
+19,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
+19,"California, USA",0.4,0.397576,0.21098,0.016061,12
+19,and the,0.48,0.415152,0.053333,0.009091,13
+19,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
+19,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
+19,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
+19,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
+19,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
+19,purposes:,0.216863,0.516061,0.058039,0.009394,19
+19,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
+19,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
+19,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
+19,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
+19,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
+19,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
+19,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
+19,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
+19,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
+19,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
+19,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
+19,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
+19,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
+19,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
+19,"California, USA",0.582745,0.765758,0.125098,0.01303,34
+19,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
+19,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
+19,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
+19,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
+19,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
+20,SisterCities,0.169412,0.03303,0.24,0.028485,1
+20,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
+20,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
+20,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
+20,Toolkit,0.83098,0.072727,0.127451,0.025758,5
+20,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
+20,adopted by,0.2,0.213333,0.080392,0.013636,7
+20,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
+20,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
+20,and,0.199608,0.260909,0.026275,0.010606,10
+20,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
+20,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
+20,ON,0.551765,0.298182,0.026667,0.011515,13
+20,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
+20,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
+20,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
+20,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
+20,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
+20,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
+20,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
+20,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
+20,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
+20,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
+20,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
+20,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
+20,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
+20,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
+20,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
+20,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
+20,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
+20,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
+20,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
+20,3h.5.,0.593725,0.750606,0.218039,0.06303,33
+20,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,34
+20,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,35
+20,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,36
+20,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,37
+21,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308796,0.008545,1
+21,agcas,0.726169,0.191722,0.053368,0.011749,2
+21,Example of a skills-based CV,0.3894,0.205874,0.224144,0.011482,3
+21,ASHLEY GILL,0.459698,0.246195,0.082812,0.008278,4
+21,3 Lappage Court,0.2212,0.259012,0.080972,0.008545,5
+21,Telephone: 01882 652349,0.592565,0.259012,0.129555,0.008278,6
+21,"Tyler Green, Bucks.",0.220464,0.269159,0.092381,0.008278,7
+21,Mobile: 07717 121824,0.593669,0.269159,0.112992,0.006676,8
+21,HP8 4JD,0.2212,0.279306,0.040486,0.006409,9
+21,Email: ashleygill2023@gotmail.com,0.594038,0.279039,0.178874,0.008545,10
+21,Personal Details,0.221568,0.299332,0.095326,0.007744,11
+21,Summary,0.220832,0.321495,0.048215,0.008278,12
+21,Business studies with Spanish undergraduate.,0.273463,0.340988,0.229297,0.008812,13
+21,Ability to speak French and Spanish.,0.272727,0.351135,0.179242,0.008545,14
+21,Extensive business experience including an internship with Top Choice Holidays.,0.273095,0.361015,0.398233,0.008812,15
+21,Education And Qualifications,0.2212,0.381041,0.144277,0.008278,16
+21,2008 present,0.220832,0.401602,0.074715,0.008011,17
+21,Buckinghamshire Edge University,0.386824,0.401068,0.167096,0.008545,18
+21,BA International Business Studies with Spanish (expected 2:1),0.386824,0.410681,0.308796,0.008812,19
+21,Relate your degree to,0.230033,0.420027,0.100847,0.008278,20
+21,Study semester at The University of Valloid (Spain).,0.399338,0.420828,0.252852,0.008812,21
+21,the job by listing your,0.229665,0.429105,0.101583,0.008278,22
+21,Six-month work placement in Madrid.,0.399338,0.431242,0.188811,0.008545,23
+21,relevant modules/,0.230033,0.438718,0.085388,0.007744,24
+21,Relevant modules included: Business Planning; Sales Promotion and,0.399338,0.441389,0.338241,0.008545,25
+21,dissertation.,0.230033,0.448064,0.057784,0.006676,26
+21,Marketing; and Business Operations Management.,0.398969,0.451268,0.25322,0.008812,27
+21,2000 2007,0.2212,0.467824,0.061833,0.006409,28
+21,Freebridge School,0.386824,0.46729,0.087965,0.008545,29
+21,"A-Levels: Business Studies (B), French (C)",0.386088,0.476903,0.200221,0.008812,30
+21,"8 GCSEs including Maths, English, Spanish and French",0.386824,0.487583,0.266838,0.008545,31
+21,Work History,0.220832,0.509212,0.065513,0.008278,32
+21,2008 2011,0.220832,0.529506,0.061833,0.006409,33
+21,Buckinghamshire Edge University Librarian/tour guide,0.386824,0.528972,0.277144,0.008812,34
+21,General administrative and customer service roles.,0.399338,0.539119,0.25138,0.006676,35
+21,Briefly list,0.707766,0.536716,0.045639,0.008011,36
+21,your relevant,0.70703,0.546061,0.061465,0.008011,37
+21,2011 (Feb-Aug),0.2212,0.55514,0.078027,0.008812,38
+21,Audigest S.A. (Madrid) - Audit Assistant,0.386456,0.554873,0.199485,0.009079,39
+21,duties.,0.707398,0.555674,0.030916,0.006409,40
+21,Six months' work experience in an international bank.,0.399338,0.565287,0.267575,0.008545,41
+21,Liaising with colleagues and clients in English and Spanish.,0.399338,0.575434,0.292602,0.008545,42
+21,2010 (June-Dec),0.220832,0.591188,0.082444,0.008278,43
+21,Finsbury's supermarket (Hazelbridge) — Supervisor,0.386824,0.591188,0.250644,0.008812,44
+21,Managing a small team.,0.398969,0.601602,0.121089,0.008545,45
+21,Customer service in a busy competitive environment.,0.398969,0.611215,0.264262,0.008545,46
+21,2010 (Jan-Aug),0.2212,0.627236,0.077291,0.008812,47
+21,Top Choice Holidays and Flights Ltd (Low Wycombe),0.386088,0.627503,0.257637,0.008812,48
+21,Financial Assistant/Supervisor,0.386824,0.637383,0.15127,0.008812,49
+21,Working in a range of teams to manage complex financial processes.,0.398969,0.64753,0.341921,0.008812,50
+21,2007 (Jul-Aug),0.220832,0.663284,0.074347,0.008812,51
+21,Dogs Protection League - General Assistant,0.386824,0.663818,0.216783,0.008812,52
+21,Dealing with enquiries and selling packages to a range of clients.,0.399706,0.673431,0.321678,0.009079,53
+21,2006 (Jan-Dec),0.220832,0.689453,0.076187,0.009079,54
+21,McHenry's Restaurant (Low Wycombe) - Supervisor,0.386456,0.68972,0.256533,0.009079,55
+21,Voluntary Experience,0.220464,0.708411,0.106367,0.008545,56
+21,2007/2011,0.220832,0.728438,0.055208,0.008011,57
+21,Teaching English in Mexico/Spain,0.386088,0.727904,0.167832,0.009079,58
+21,Interests,0.2212,0.748465,0.043062,0.006676,59
+21,Active member of University Business Club — Winner of the 'Bucks Best Business Pitch' award in 2010 Enterprise,0.220464,0.768224,0.556864,0.009079,60
+21,"week, judged by Michael Eavis.",0.220464,0.778104,0.15311,0.008812,61
+22,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308428,0.008545,1
+22,Make sure you carefully assess,0.468531,0.23498,0.142068,0.008011,2
+22,Skills And Achievements,0.220832,0.245394,0.121457,0.006676,3
+22,the job advert/job description,0.468163,0.244326,0.139124,0.008278,4
+22,and address all the skills they,0.468531,0.253672,0.13618,0.008278,5
+22,Effective communication,0.2212,0.265421,0.123298,0.006676,6
+22,require.,0.468531,0.263017,0.034965,0.008011,7
+22,"Able to communicate effectively with a wide range of clients and colleagues, by showing interest, carefully",0.233714,0.275567,0.530364,0.008545,8
+22,"listening to needs and appropriately adjusting my message, as demonstrated during my time at Finsbury's",0.23445,0.285447,0.528892,0.008812,9
+22,Supermarket.,0.234082,0.295861,0.066618,0.008278,10
+22,Strong presentation skills and confidence demonstrated by experience of delivering presentations in different,0.23445,0.305474,0.543614,0.008812,11
+22,languages to groups of five to fifty.,0.234082,0.315621,0.172617,0.008812,12
+22,Customer service,0.220832,0.335915,0.085388,0.006676,13
+22,Ability to quickly build rapport with customers and calmly deal with any problems as shown during my retail,0.233714,0.345527,0.541038,0.008812,14
+22,experience in high pressure environments.,0.234082,0.355941,0.210526,0.008278,15
+22,"Capacity to maintain professional relationships through email and other written correspondence, for example,",0.234082,0.365554,0.548767,0.008812,16
+22,"at Audigest in Madrid, where I built longstanding business relationships with customers and colleagues across",0.233714,0.375701,0.549871,0.008812,17
+22,the globe.,0.233714,0.385848,0.049687,0.008278,18
+22,Teamwork,0.220464,0.406142,0.052632,0.006409,19
+22,"At Top Choice Holidays demonstrated excellent teamwork skills in a busy financial environment, such as an",0.233346,0.415754,0.532573,0.008812,20
+22,"ability to listen to clients and managers, perform my role to a high level and support colleagues, resulting in",0.234082,0.425634,0.535885,0.008812,21
+22,promotion.,0.234082,0.436048,0.05484,0.008545,22
+22,Administration,0.220464,0.456075,0.075083,0.006409,23
+22,Prove you have each of the,0.639676,0.453672,0.123666,0.008278,24
+22,"Excellent ability to plan ahead and manage time effectively, for example,",0.23445,0.465688,0.360692,0.008812,25
+22,skills required by outlining,0.63894,0.463017,0.12293,0.008278,26
+22,managing complex roles during my internship at Top Choice Holidays.,0.23445,0.476101,0.346338,0.008545,27
+22,where you performed them,0.63894,0.472363,0.128082,0.008278,28
+22,Gathered data from a wide range of sources during my dissertation,0.234082,0.485714,0.334928,0.008812,29
+22,and how you performed,0.639308,0.481709,0.111888,0.008278,30
+22,them well.,0.63894,0.491055,0.048951,0.006409,31
+22,"whilst balancing my other studies and two jobs, resulting in a 73% grade.",0.233346,0.495861,0.365109,0.008812,32
+22,Experience of travellers' needs,0.2212,0.515888,0.150534,0.008545,33
+22,Recent travel consultancy experience gives me an in-depth understanding of the expectations of holiday,0.23445,0.525768,0.518955,0.008812,34
+22,customers and the competitive nature of the industry.,0.234082,0.535915,0.269047,0.008812,35
+22,International travel experience and language ability give me an empathy with travellers and a passion for,0.234082,0.545794,0.524107,0.008812,36
+22,helping them find a unique holiday experience.,0.234082,0.555941,0.23445,0.008812,37
+22,Initiative,0.2212,0.576235,0.044166,0.006676,38
+22,Self-funding an evening course in bookkeeping during my first accountancy role demonstrated my ability to,0.234082,0.585848,0.535149,0.008812,39
+22,plan ahead and take control of my career.,0.23445,0.595995,0.205006,0.008545,40
+22,Successful study and work in Spain and Mexico show that I can creatively develop my skills and experience and,0.234082,0.605874,0.551711,0.008545,41
+22,adapt to new and different environments.,0.234082,0.616288,0.208686,0.008278,42
+22,Sales knowledge,0.220464,0.636315,0.083916,0.008011,43
+22,Wide experience of financial roles gives me an awareness of the tight monetary pressures which drive UK,0.234082,0.645928,0.525212,0.009346,44
+22,service industries.,0.234082,0.656609,0.088333,0.006943,45
+22,Raised sales at The Dogs Protection League by 12% by up selling add-on packages to new and existing,0.23445,0.665955,0.505705,0.009079,46
+22,customers.,0.234082,0.67717,0.054472,0.006142,47
+22,Language ability,0.2212,0.696395,0.082444,0.008812,48
+22,"Spanish fluency obtained working overseas, French semi-fluent.",0.233714,0.706008,0.323151,0.009079,49
+22,Referees,0.2212,0.726569,0.041958,0.006676,50
+22,Include all your referee details including their email and,0.351859,0.722029,0.259109,0.008545,51
+22,phone number (but ask for their permission first).,0.352227,0.731108,0.230401,0.008545,52
+22,"Professional: Mr. Jose Andreas, Management Accountant, Audigest, Avenida de Concha Espina 2, Madrid, ES-",0.2212,0.746328,0.537725,0.008812,53
+22,"28036, +34 91 398 5476, j.andreas@audigest.es",0.2212,0.756475,0.238498,0.008278,54
+22,"Academic: Dr. Jane Luffle, Personal Tutor, Buckinghamshire Edge University, Due Road, Low Wycombe, Bucks,",0.220464,0.776502,0.536621,0.008812,55
+22,"HD15 3DL, 01628 435 6784, j.luffle@bedge.ac.uk",0.2212,0.786382,0.244755,0.008545,56
diff --git a/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv b/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv
new file mode 100644
index 0000000000000000000000000000000000000000..9fbc84a3e9f6a72e039153800eaabe55e4749c23
--- /dev/null
+++ b/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv
@@ -0,0 +1,40 @@
+page,text,left,top,width,height,line
+1,Example of emails sent to a professor before applying:,0.147059,0.093434,0.426471,0.013889,1
+1,Fwd: Prospective Graduate Student,0.145425,0.128788,0.277778,0.013889,2
+1,"Dr. Kornbluth,",0.147059,0.162879,0.114379,0.012626,3
+1,I am a senior biology major at the University of Notre Dame. I am applying to the CMB,0.147059,0.198232,0.689542,0.013889,4
+1,program and am very interested in your work. After glancing at a few of your recent,0.145425,0.214646,0.660131,0.013889,5
+1,papers and your research summary I find your work with apoptosis very interesting. Will,0.145425,0.232323,0.697712,0.013889,6
+1,"you be taking on new students next year? If I am invited to interview, is there any way",0.145425,0.25,0.683007,0.013889,7
+1,you will be able to meet with me?,0.145425,0.267677,0.264706,0.013889,8
+1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.147059,0.30303,0.69281,0.013889,9
+1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.147059,0.320707,0.697712,0.013889,10
+1,initiate Muller glia division post-light damage. My first research project was,0.147059,0.338384,0.598039,0.013889,11
+1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.147059,0.354798,0.637255,0.013889,12
+1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.372475,0.604575,0.013889,13
+1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.390152,0.689542,0.013889,14
+1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.147059,0.407828,0.635621,0.013889,15
+1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.425505,0.673203,0.013889,16
+1,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.443182,0.661765,0.013889,17
+1,transgenic line during retinal development and regeneration.,0.145425,0.459596,0.472222,0.013889,18
+1,Please find my CV attached.,0.145425,0.496212,0.222222,0.013889,19
+1,"Thank you for your time,",0.145425,0.531566,0.196078,0.013889,20
+1,--Lauren Lilley,0.147059,0.566919,0.119281,0.013889,21
+1,"Dr. Poss,",0.145425,0.637626,0.070261,0.012626,22
+1,I am a senior biology major at the University of Notre Dame. I am applying to your,0.145425,0.671717,0.655229,0.013889,23
+1,graduate program and am very interested in your work. After glancing at a few of your,0.145425,0.689394,0.679739,0.013889,24
+1,recent papers and your research summary I find your research greatly coincides with my,0.145425,0.707071,0.69281,0.013889,25
+1,research experiences and interests. Will you be taking on new students next year?,0.145425,0.723485,0.643791,0.015152,26
+1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.145425,0.760101,0.69281,0.013889,27
+1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.145425,0.777778,0.699346,0.013889,28
+1,initiate Muller glia division post-light damage. My first research project was,0.145425,0.795455,0.598039,0.013889,29
+1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.145425,0.811869,0.638889,0.013889,30
+1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.829545,0.604575,0.013889,31
+1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.847222,0.691176,0.013889,32
+1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.145425,0.864899,0.635621,0.013889,33
+1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.881313,0.673203,0.013889,34
+2,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.093434,0.661765,0.013889,1
+2,transgenic line during retinal development and regeneration.,0.145425,0.111111,0.472222,0.013889,2
+2,Please find my CV attached.,0.145425,0.146465,0.222222,0.013889,3
+2,"Thank you for your time,",0.145425,0.181818,0.196078,0.013889,4
+2,--Lauren Lilley,0.147059,0.218434,0.119281,0.013889,5
diff --git a/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv b/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv
new file mode 100644
index 0000000000000000000000000000000000000000..8eacdf5297ccf7a2a4ce5ce89371b9203c8ffb6c
--- /dev/null
+++ b/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv
@@ -0,0 +1,432 @@
+page,line,word_text,word_x0,word_y0,word_x1,word_y1,line_text,line_x0,line_y0,line_x1,line_y1
+1,1,Example,0.147059,0.093434,0.215686,0.107323,,,,,
+1,1,of,0.220588,0.093434,0.240196,0.104798,,,,,
+1,1,emails,0.24183,0.093434,0.292484,0.104798,,,,,
+1,1,sent,0.297386,0.094697,0.330065,0.104798,,,,,
+1,1,to,0.334967,0.094697,0.349673,0.104798,,,,,
+1,1,a,0.354575,0.097222,0.362745,0.104798,,,,,
+1,1,professor,0.367647,0.093434,0.441176,0.108586,,,,,
+1,1,before,0.446078,0.093434,0.496732,0.104798,,,,,
+1,1,applying:,0.501634,0.093434,0.573529,0.107323,,,,,
+1,2,Fwd:,0.145425,0.128788,0.184641,0.140152,,,,,
+1,2,Prospective,0.191176,0.128788,0.28268,0.142677,,,,,
+1,2,Graduate,0.287582,0.128788,0.359477,0.140152,,,,,
+1,2,Student,0.364379,0.128788,0.424837,0.140152,,,,,
+1,3,Dr.,0.147059,0.162879,0.171569,0.174242,,,,,
+1,3,"Kornbluth,",0.176471,0.162879,0.261438,0.176768,,,,,
+1,4,I,0.147059,0.198232,0.153595,0.209596,,,,,
+1,4,am,0.158497,0.200758,0.181373,0.209596,,,,,
+1,4,a,0.186275,0.20202,0.194444,0.209596,,,,,
+1,4,senior,0.199346,0.198232,0.248366,0.209596,,,,,
+1,4,biology,0.253268,0.198232,0.312092,0.212121,,,,,
+1,4,major,0.316993,0.198232,0.364379,0.212121,,,,,
+1,4,at,0.367647,0.199495,0.382353,0.209596,,,,,
+1,4,the,0.387255,0.198232,0.411765,0.209596,,,,,
+1,4,University,0.416667,0.198232,0.5,0.212121,,,,,
+1,4,of,0.504902,0.198232,0.522876,0.209596,,,,,
+1,4,Notre,0.52451,0.198232,0.570261,0.209596,,,,,
+1,4,Dame.,0.575163,0.198232,0.625817,0.209596,,,,,
+1,4,I,0.632353,0.198232,0.637255,0.209596,,,,,
+1,4,am,0.643791,0.200758,0.666667,0.209596,,,,,
+1,4,applying,0.671569,0.198232,0.740196,0.212121,,,,,
+1,4,to,0.745098,0.199495,0.759804,0.209596,,,,,
+1,4,the,0.764706,0.198232,0.789216,0.209596,,,,,
+1,4,CMB,0.794118,0.198232,0.836601,0.209596,,,,,
+1,5,program,0.145425,0.218434,0.212418,0.229798,,,,,
+1,5,and,0.21732,0.215909,0.245098,0.227273,,,,,
+1,5,am,0.25,0.218434,0.27451,0.227273,,,,,
+1,5,very,0.279412,0.218434,0.313725,0.229798,,,,,
+1,5,interested,0.320261,0.214646,0.395425,0.22601,,,,,
+1,5,in,0.400327,0.214646,0.416667,0.22601,,,,,
+1,5,your,0.419935,0.218434,0.457516,0.229798,,,,,
+1,5,work.,0.460784,0.214646,0.506536,0.227273,,,,,
+1,5,After,0.511438,0.214646,0.553922,0.227273,,,,,
+1,5,glancing,0.55719,0.215909,0.625817,0.229798,,,,,
+1,5,at,0.630719,0.217172,0.645425,0.227273,,,,,
+1,5,a,0.650327,0.218434,0.658497,0.227273,,,,,
+1,5,few,0.663399,0.214646,0.69281,0.22601,,,,,
+1,5,of,0.697712,0.214646,0.715686,0.227273,,,,,
+1,5,your,0.718954,0.218434,0.754902,0.229798,,,,,
+1,5,recent,0.759804,0.217172,0.80719,0.22601,,,,,
+1,6,papers,0.145425,0.236111,0.197712,0.247475,,,,,
+1,6,and,0.202614,0.232323,0.230392,0.243687,,,,,
+1,6,your,0.235294,0.236111,0.271242,0.247475,,,,,
+1,6,research,0.276144,0.232323,0.341503,0.243687,,,,,
+1,6,summary,0.346405,0.236111,0.419935,0.247475,,,,,
+1,6,I,0.424837,0.232323,0.431373,0.243687,,,,,
+1,6,find,0.436275,0.232323,0.46732,0.243687,,,,,
+1,6,your,0.472222,0.236111,0.50817,0.247475,,,,,
+1,6,work,0.513072,0.232323,0.553922,0.243687,,,,,
+1,6,with,0.558824,0.232323,0.593137,0.243687,,,,,
+1,6,apoptosis,0.598039,0.233586,0.671569,0.247475,,,,,
+1,6,very,0.678105,0.236111,0.712418,0.247475,,,,,
+1,6,interesting.,0.71732,0.232323,0.803922,0.247475,,,,,
+1,6,Will,0.810458,0.232323,0.844771,0.243687,,,,,
+1,7,you,0.145425,0.253788,0.174837,0.263889,,,,,
+1,7,be,0.179739,0.25,0.199346,0.261364,,,,,
+1,7,taking,0.204248,0.25,0.253268,0.265152,,,,,
+1,7,on,0.25817,0.253788,0.277778,0.261364,,,,,
+1,7,new,0.28268,0.253788,0.315359,0.261364,,,,,
+1,7,students,0.320261,0.25,0.383987,0.261364,,,,,
+1,7,next,0.388889,0.251263,0.423203,0.261364,,,,,
+1,7,year?,0.428105,0.25,0.470588,0.263889,,,,,
+1,7,If,0.480392,0.25,0.495098,0.261364,,,,,
+1,7,I,0.498366,0.25,0.504902,0.261364,,,,,
+1,7,am,0.509804,0.253788,0.534314,0.261364,,,,,
+1,7,invited,0.539216,0.25,0.593137,0.261364,,,,,
+1,7,to,0.598039,0.251263,0.612745,0.261364,,,,,
+1,7,"interview,",0.617647,0.25,0.696078,0.263889,,,,,
+1,7,is,0.702614,0.25,0.714052,0.261364,,,,,
+1,7,there,0.718954,0.25,0.759804,0.261364,,,,,
+1,7,any,0.763072,0.253788,0.792484,0.263889,,,,,
+1,7,way,0.797386,0.253788,0.830065,0.263889,,,,,
+1,8,you,0.145425,0.271465,0.176471,0.281566,,,,,
+1,8,will,0.179739,0.267677,0.210784,0.27904,,,,,
+1,8,be,0.215686,0.267677,0.235294,0.27904,,,,,
+1,8,able,0.238562,0.267677,0.272876,0.27904,,,,,
+1,8,to,0.276144,0.268939,0.292484,0.27904,,,,,
+1,8,meet,0.297386,0.268939,0.334967,0.27904,,,,,
+1,8,with,0.339869,0.267677,0.375817,0.27904,,,,,
+1,8,me?,0.380719,0.267677,0.411765,0.27904,,,,,
+1,9,I,0.147059,0.30303,0.151961,0.314394,,,,,
+1,9,have,0.156863,0.30303,0.194444,0.314394,,,,,
+1,9,worked,0.199346,0.30303,0.25817,0.314394,,,,,
+1,9,on,0.263072,0.306818,0.28268,0.314394,,,,,
+1,9,several,0.287582,0.30303,0.343137,0.314394,,,,,
+1,9,different,0.348039,0.30303,0.416667,0.314394,,,,,
+1,9,research,0.419935,0.30303,0.485294,0.314394,,,,,
+1,9,projects,0.490196,0.30303,0.552288,0.318182,,,,,
+1,9,as,0.558824,0.306818,0.573529,0.314394,,,,,
+1,9,an,0.580065,0.306818,0.598039,0.314394,,,,,
+1,9,undergraduate,0.602941,0.30303,0.714052,0.318182,,,,,
+1,9,in,0.718954,0.30303,0.735294,0.314394,,,,,
+1,9,Dr.,0.740196,0.30303,0.764706,0.314394,,,,,
+1,9,David,0.769608,0.30303,0.816993,0.314394,,,,,
+1,9,R.,0.823529,0.30303,0.839869,0.314394,,,,,
+1,10,Hyde's,0.147059,0.320707,0.199346,0.334596,,,,,
+1,10,lab,0.204248,0.320707,0.228758,0.332071,,,,,
+1,10,at,0.23366,0.32197,0.248366,0.332071,,,,,
+1,10,the,0.251634,0.320707,0.276144,0.332071,,,,,
+1,10,University,0.281046,0.320707,0.364379,0.334596,,,,,
+1,10,of,0.369281,0.320707,0.387255,0.332071,,,,,
+1,10,Notre,0.390523,0.320707,0.434641,0.332071,,,,,
+1,10,Dame.,0.439542,0.320707,0.490196,0.332071,,,,,
+1,10,The,0.496732,0.320707,0.527778,0.332071,,,,,
+1,10,Hyde,0.53268,0.320707,0.573529,0.334596,,,,,
+1,10,lab,0.580065,0.320707,0.602941,0.332071,,,,,
+1,10,is,0.607843,0.320707,0.620915,0.332071,,,,,
+1,10,interested,0.625817,0.320707,0.702614,0.332071,,,,,
+1,10,in,0.707516,0.320707,0.722222,0.332071,,,,,
+1,10,the,0.727124,0.320707,0.751634,0.332071,,,,,
+1,10,signals,0.756536,0.320707,0.810458,0.334596,,,,,
+1,10,that,0.815359,0.320707,0.844771,0.332071,,,,,
+1,11,initiate,0.147059,0.338384,0.20098,0.349747,,,,,
+1,11,Muller,0.205882,0.338384,0.259804,0.349747,,,,,
+1,11,glia,0.264706,0.338384,0.292484,0.352273,,,,,
+1,11,division,0.297386,0.338384,0.361111,0.349747,,,,,
+1,11,post-light,0.366013,0.338384,0.44281,0.352273,,,,,
+1,11,damage.,0.446078,0.338384,0.511438,0.352273,,,,,
+1,11,My,0.51634,0.338384,0.544118,0.352273,,,,,
+1,11,first,0.54902,0.338384,0.581699,0.349747,,,,,
+1,11,research,0.584967,0.338384,0.650327,0.349747,,,,,
+1,11,project,0.655229,0.338384,0.710784,0.353535,,,,,
+1,11,was,0.715686,0.340909,0.745098,0.349747,,,,,
+1,12,characterizing,0.147059,0.354798,0.256536,0.369949,,,,,
+1,12,the,0.261438,0.356061,0.285948,0.367424,,,,,
+1,12,role,0.29085,0.356061,0.321895,0.367424,,,,,
+1,12,of,0.326797,0.356061,0.344771,0.367424,,,,,
+1,12,leukemia,0.348039,0.356061,0.419935,0.367424,,,,,
+1,12,inhibitory,0.424837,0.354798,0.501634,0.369949,,,,,
+1,12,factor,0.506536,0.356061,0.553922,0.367424,,,,,
+1,12,(LIF),0.55719,0.354798,0.599673,0.369949,,,,,
+1,12,in,0.604575,0.356061,0.620915,0.367424,,,,,
+1,12,the,0.624183,0.356061,0.648693,0.366162,,,,,
+1,12,activation,0.653595,0.356061,0.732026,0.367424,,,,,
+1,12,of,0.735294,0.354798,0.754902,0.367424,,,,,
+1,12,cell,0.756536,0.356061,0.785948,0.367424,,,,,
+1,13,proliferation,0.145425,0.372475,0.243464,0.387626,,,,,
+1,13,in,0.25,0.373737,0.264706,0.383838,,,,,
+1,13,the,0.269608,0.373737,0.292484,0.383838,,,,,
+1,13,undamaged,0.297386,0.372475,0.388889,0.387626,,,,,
+1,13,zebrafish,0.393791,0.372475,0.465686,0.383838,,,,,
+1,13,retina.,0.470588,0.373737,0.519608,0.383838,,,,,
+1,13,I,0.52451,0.373737,0.531046,0.383838,,,,,
+1,13,am,0.535948,0.376263,0.560458,0.383838,,,,,
+1,13,also,0.565359,0.372475,0.596405,0.383838,,,,,
+1,13,working,0.601307,0.372475,0.666667,0.387626,,,,,
+1,13,on,0.671569,0.376263,0.691176,0.385101,,,,,
+1,13,several,0.696078,0.373737,0.751634,0.383838,,,,,
+1,14,experiments,0.145425,0.390152,0.24183,0.405303,,,,,
+1,14,that,0.246732,0.390152,0.276144,0.401515,,,,,
+1,14,are,0.281046,0.393939,0.305556,0.401515,,,,,
+1,14,related,0.308824,0.390152,0.362745,0.401515,,,,,
+1,14,to,0.367647,0.392677,0.383987,0.401515,,,,,
+1,14,a,0.388889,0.393939,0.397059,0.401515,,,,,
+1,14,genetic,0.401961,0.390152,0.45915,0.405303,,,,,
+1,14,screen,0.464052,0.393939,0.514706,0.401515,,,,,
+1,14,that,0.517974,0.390152,0.547386,0.401515,,,,,
+1,14,the,0.552288,0.390152,0.576797,0.401515,,,,,
+1,14,Hyde,0.581699,0.390152,0.624183,0.405303,,,,,
+1,14,lab,0.629085,0.390152,0.653595,0.401515,,,,,
+1,14,plans,0.658497,0.390152,0.699346,0.405303,,,,,
+1,14,on,0.704248,0.393939,0.723856,0.401515,,,,,
+1,14,performing,0.728758,0.390152,0.816993,0.405303,,,,,
+1,14,to,0.821895,0.391414,0.836601,0.401515,,,,,
+1,15,identify,0.147059,0.407828,0.207516,0.421717,,,,,
+1,15,mutants,0.212418,0.409091,0.272876,0.419192,,,,,
+1,15,in,0.279412,0.407828,0.294118,0.419192,,,,,
+1,15,the,0.29902,0.407828,0.323529,0.419192,,,,,
+1,15,regeneration,0.328431,0.407828,0.426471,0.42298,,,,,
+1,15,pathway--I,0.429739,0.407828,0.51634,0.42298,,,,,
+1,15,am,0.522876,0.411616,0.545752,0.419192,,,,,
+1,15,developing,0.550654,0.407828,0.638889,0.42298,,,,,
+1,15,a,0.643791,0.411616,0.651961,0.419192,,,,,
+1,15,neuroD4:EGFP,0.656863,0.407828,0.78268,0.419192,,,,,
+1,16,transgenic,0.145425,0.425505,0.227124,0.439394,,,,,
+1,16,line,0.232026,0.425505,0.261438,0.436869,,,,,
+1,16,for,0.26634,0.425505,0.289216,0.436869,,,,,
+1,16,use,0.294118,0.42803,0.320261,0.436869,,,,,
+1,16,in,0.325163,0.425505,0.339869,0.436869,,,,,
+1,16,this,0.344771,0.425505,0.372549,0.436869,,,,,
+1,16,screen,0.377451,0.42803,0.428105,0.436869,,,,,
+1,16,and,0.433007,0.425505,0.460784,0.436869,,,,,
+1,16,I,0.46732,0.425505,0.472222,0.436869,,,,,
+1,16,am,0.477124,0.42803,0.501634,0.436869,,,,,
+1,16,characterizing,0.506536,0.425505,0.617647,0.439394,,,,,
+1,16,the,0.622549,0.425505,0.647059,0.436869,,,,,
+1,16,extent,0.651961,0.426768,0.70098,0.436869,,,,,
+1,16,of,0.704248,0.425505,0.722222,0.436869,,,,,
+1,16,damage,0.72549,0.425505,0.787582,0.439394,,,,,
+1,16,and,0.79085,0.425505,0.820261,0.436869,,,,,
+1,17,regeneration,0.145425,0.443182,0.243464,0.457071,,,,,
+1,17,in,0.25,0.443182,0.264706,0.454545,,,,,
+1,17,sheer,0.267974,0.443182,0.312092,0.454545,,,,,
+1,17,zebrafish,0.316993,0.443182,0.388889,0.454545,,,,,
+1,17,retinas.,0.393791,0.443182,0.449346,0.454545,,,,,
+1,17,"Finally,",0.455882,0.443182,0.51634,0.457071,,,,,
+1,17,I,0.521242,0.443182,0.527778,0.454545,,,,,
+1,17,am,0.53268,0.445707,0.55719,0.454545,,,,,
+1,17,characterizing,0.560458,0.443182,0.671569,0.457071,,,,,
+1,17,the,0.676471,0.443182,0.70098,0.454545,,,,,
+1,17,chx10:EGFP,0.705882,0.443182,0.808824,0.454545,,,,,
+1,18,transgenic,0.145425,0.459596,0.227124,0.474747,,,,,
+1,18,line,0.232026,0.459596,0.261438,0.47096,,,,,
+1,18,during,0.26634,0.459596,0.316993,0.474747,,,,,
+1,18,retinal,0.321895,0.459596,0.372549,0.47096,,,,,
+1,18,development,0.377451,0.459596,0.478758,0.474747,,,,,
+1,18,and,0.48366,0.460859,0.511438,0.47096,,,,,
+1,18,regeneration.,0.51634,0.459596,0.619281,0.474747,,,,,
+1,19,Please,0.145425,0.496212,0.196078,0.507576,,,,,
+1,19,find,0.20098,0.496212,0.232026,0.507576,,,,,
+1,19,my,0.236928,0.5,0.263072,0.510101,,,,,
+1,19,CV,0.267974,0.496212,0.295752,0.507576,,,,,
+1,19,attached.,0.29902,0.496212,0.369281,0.507576,,,,,
+1,20,Thank,0.145425,0.531566,0.196078,0.542929,,,,,
+1,20,you,0.20098,0.535354,0.230392,0.546717,,,,,
+1,20,for,0.235294,0.531566,0.25817,0.542929,,,,,
+1,20,your,0.263072,0.535354,0.29902,0.546717,,,,,
+1,20,"time,",0.303922,0.531566,0.343137,0.545455,,,,,
+1,21,--Lauren,0.147059,0.568182,0.215686,0.579545,,,,,
+1,21,Lilley,0.218954,0.566919,0.26634,0.582071,,,,,
+1,22,Dr.,0.145425,0.637626,0.171569,0.64899,,,,,
+1,22,"Poss,",0.176471,0.637626,0.21732,0.651515,,,,,
+1,23,I,0.145425,0.671717,0.151961,0.683081,,,,,
+1,23,am,0.158497,0.675505,0.181373,0.684343,,,,,
+1,23,a,0.186275,0.675505,0.194444,0.684343,,,,,
+1,23,senior,0.199346,0.671717,0.248366,0.683081,,,,,
+1,23,biology,0.253268,0.671717,0.312092,0.686869,,,,,
+1,23,major,0.316993,0.671717,0.364379,0.686869,,,,,
+1,23,at,0.369281,0.674242,0.382353,0.683081,,,,,
+1,23,the,0.387255,0.671717,0.411765,0.684343,,,,,
+1,23,University,0.416667,0.671717,0.498366,0.686869,,,,,
+1,23,of,0.504902,0.671717,0.522876,0.683081,,,,,
+1,23,Notre,0.52451,0.671717,0.570261,0.684343,,,,,
+1,23,Dame.,0.575163,0.671717,0.625817,0.684343,,,,,
+1,23,I,0.630719,0.671717,0.637255,0.683081,,,,,
+1,23,am,0.643791,0.675505,0.666667,0.684343,,,,,
+1,23,applying,0.671569,0.67298,0.740196,0.686869,,,,,
+1,23,to,0.745098,0.67298,0.759804,0.683081,,,,,
+1,23,your,0.764706,0.675505,0.802288,0.686869,,,,,
+1,24,graduate,0.145425,0.689394,0.214052,0.704545,,,,,
+1,24,program,0.218954,0.693182,0.284314,0.703283,,,,,
+1,24,and,0.289216,0.689394,0.318627,0.700758,,,,,
+1,24,am,0.323529,0.693182,0.348039,0.700758,,,,,
+1,24,very,0.351307,0.693182,0.387255,0.703283,,,,,
+1,24,interested,0.392157,0.689394,0.46732,0.700758,,,,,
+1,24,in,0.473856,0.689394,0.488562,0.700758,,,,,
+1,24,your,0.493464,0.693182,0.529412,0.703283,,,,,
+1,24,work.,0.534314,0.689394,0.578431,0.700758,,,,,
+1,24,After,0.583333,0.689394,0.625817,0.700758,,,,,
+1,24,glancing,0.630719,0.689394,0.697712,0.703283,,,,,
+1,24,at,0.702614,0.690657,0.71732,0.700758,,,,,
+1,24,a,0.722222,0.693182,0.730392,0.700758,,,,,
+1,24,few,0.735294,0.689394,0.764706,0.700758,,,,,
+1,24,of,0.769608,0.689394,0.787582,0.700758,,,,,
+1,24,your,0.79085,0.693182,0.826797,0.703283,,,,,
+1,25,recent,0.145425,0.708333,0.194444,0.718434,,,,,
+1,25,papers,0.199346,0.710859,0.25,0.72096,,,,,
+1,25,and,0.254902,0.707071,0.28268,0.718434,,,,,
+1,25,your,0.287582,0.710859,0.325163,0.72096,,,,,
+1,25,research,0.328431,0.707071,0.393791,0.718434,,,,,
+1,25,summary,0.398693,0.709596,0.472222,0.72096,,,,,
+1,25,I,0.477124,0.707071,0.48366,0.718434,,,,,
+1,25,find,0.488562,0.707071,0.519608,0.718434,,,,,
+1,25,your,0.52451,0.710859,0.562092,0.72096,,,,,
+1,25,research,0.565359,0.707071,0.632353,0.718434,,,,,
+1,25,greatly,0.637255,0.707071,0.691176,0.72096,,,,,
+1,25,coincides,0.696078,0.707071,0.769608,0.718434,,,,,
+1,25,with,0.77451,0.707071,0.810458,0.718434,,,,,
+1,25,my,0.813725,0.710859,0.839869,0.72096,,,,,
+1,26,research,0.145425,0.724747,0.210784,0.736111,,,,,
+1,26,experiences,0.21732,0.724747,0.308824,0.738636,,,,,
+1,26,and,0.313725,0.723485,0.341503,0.736111,,,,,
+1,26,interests.,0.346405,0.723485,0.416667,0.736111,,,,,
+1,26,Will,0.426471,0.723485,0.462418,0.736111,,,,,
+1,26,you,0.465686,0.727273,0.496732,0.738636,,,,,
+1,26,be,0.5,0.723485,0.519608,0.736111,,,,,
+1,26,taking,0.52451,0.724747,0.573529,0.738636,,,,,
+1,26,on,0.578431,0.727273,0.598039,0.736111,,,,,
+1,26,new,0.602941,0.727273,0.635621,0.736111,,,,,
+1,26,students,0.640523,0.724747,0.704248,0.736111,,,,,
+1,26,next,0.70915,0.72601,0.745098,0.734848,,,,,
+1,26,year?,0.748366,0.724747,0.79085,0.738636,,,,,
+1,27,I,0.145425,0.760101,0.151961,0.771465,,,,,
+1,27,have,0.156863,0.760101,0.194444,0.771465,,,,,
+1,27,worked,0.199346,0.760101,0.25817,0.771465,,,,,
+1,27,on,0.263072,0.763889,0.28268,0.771465,,,,,
+1,27,several,0.287582,0.760101,0.343137,0.771465,,,,,
+1,27,different,0.348039,0.760101,0.416667,0.771465,,,,,
+1,27,research,0.419935,0.760101,0.485294,0.771465,,,,,
+1,27,projects,0.490196,0.760101,0.552288,0.775253,,,,,
+1,27,as,0.55719,0.763889,0.573529,0.771465,,,,,
+1,27,an,0.578431,0.763889,0.598039,0.771465,,,,,
+1,27,undergraduate,0.602941,0.760101,0.714052,0.775253,,,,,
+1,27,in,0.718954,0.760101,0.735294,0.771465,,,,,
+1,27,Dr.,0.740196,0.760101,0.764706,0.771465,,,,,
+1,27,David,0.769608,0.760101,0.818627,0.771465,,,,,
+1,27,R.,0.823529,0.760101,0.839869,0.771465,,,,,
+1,28,Hyde's,0.145425,0.777778,0.199346,0.791667,,,,,
+1,28,lab,0.204248,0.777778,0.228758,0.789141,,,,,
+1,28,at,0.23366,0.77904,0.248366,0.789141,,,,,
+1,28,the,0.251634,0.777778,0.276144,0.789141,,,,,
+1,28,University,0.281046,0.777778,0.364379,0.791667,,,,,
+1,28,of,0.369281,0.777778,0.387255,0.789141,,,,,
+1,28,Notre,0.390523,0.777778,0.434641,0.789141,,,,,
+1,28,Dame.,0.439542,0.777778,0.490196,0.789141,,,,,
+1,28,The,0.496732,0.777778,0.527778,0.789141,,,,,
+1,28,Hyde,0.53268,0.777778,0.573529,0.791667,,,,,
+1,28,lab,0.580065,0.777778,0.602941,0.789141,,,,,
+1,28,is,0.607843,0.777778,0.620915,0.789141,,,,,
+1,28,interested,0.625817,0.777778,0.702614,0.789141,,,,,
+1,28,in,0.707516,0.777778,0.722222,0.789141,,,,,
+1,28,the,0.727124,0.777778,0.751634,0.789141,,,,,
+1,28,signals,0.756536,0.777778,0.810458,0.791667,,,,,
+1,28,that,0.815359,0.777778,0.846405,0.789141,,,,,
+1,29,initiate,0.145425,0.795455,0.20098,0.806818,,,,,
+1,29,Muller,0.205882,0.795455,0.259804,0.806818,,,,,
+1,29,glia,0.264706,0.795455,0.292484,0.809343,,,,,
+1,29,division,0.297386,0.795455,0.361111,0.806818,,,,,
+1,29,post-light,0.366013,0.795455,0.44281,0.809343,,,,,
+1,29,damage.,0.446078,0.795455,0.511438,0.809343,,,,,
+1,29,My,0.51634,0.795455,0.544118,0.809343,,,,,
+1,29,first,0.54902,0.795455,0.581699,0.806818,,,,,
+1,29,research,0.584967,0.795455,0.651961,0.806818,,,,,
+1,29,project,0.655229,0.795455,0.710784,0.809343,,,,,
+1,29,was,0.715686,0.799242,0.745098,0.806818,,,,,
+1,30,characterizing,0.145425,0.811869,0.25817,0.82702,,,,,
+1,30,the,0.261438,0.811869,0.285948,0.823232,,,,,
+1,30,role,0.29085,0.813131,0.321895,0.823232,,,,,
+1,30,of,0.326797,0.811869,0.344771,0.824495,,,,,
+1,30,leukemia,0.348039,0.811869,0.419935,0.823232,,,,,
+1,30,inhibitory,0.424837,0.811869,0.501634,0.82702,,,,,
+1,30,factor,0.506536,0.811869,0.553922,0.823232,,,,,
+1,30,(LIF),0.55719,0.813131,0.599673,0.82702,,,,,
+1,30,in,0.604575,0.811869,0.620915,0.824495,,,,,
+1,30,the,0.624183,0.811869,0.648693,0.824495,,,,,
+1,30,activation,0.653595,0.813131,0.732026,0.824495,,,,,
+1,30,of,0.735294,0.811869,0.754902,0.824495,,,,,
+1,30,cell,0.756536,0.811869,0.785948,0.824495,,,,,
+1,31,proliferation,0.145425,0.829545,0.245098,0.844697,,,,,
+1,31,in,0.25,0.829545,0.264706,0.840909,,,,,
+1,31,the,0.267974,0.829545,0.292484,0.840909,,,,,
+1,31,undamaged,0.297386,0.830808,0.388889,0.844697,,,,,
+1,31,zebrafish,0.393791,0.829545,0.465686,0.842172,,,,,
+1,31,retina.,0.470588,0.830808,0.519608,0.842172,,,,,
+1,31,I,0.52451,0.830808,0.531046,0.840909,,,,,
+1,31,am,0.535948,0.833333,0.560458,0.842172,,,,,
+1,31,also,0.565359,0.829545,0.596405,0.840909,,,,,
+1,31,working,0.601307,0.830808,0.666667,0.844697,,,,,
+1,31,on,0.671569,0.833333,0.691176,0.840909,,,,,
+1,31,several,0.696078,0.829545,0.751634,0.840909,,,,,
+1,32,experiments,0.145425,0.847222,0.24183,0.862374,,,,,
+1,32,that,0.246732,0.847222,0.276144,0.858586,,,,,
+1,32,are,0.281046,0.85101,0.305556,0.858586,,,,,
+1,32,related,0.308824,0.847222,0.362745,0.858586,,,,,
+1,32,to,0.367647,0.848485,0.383987,0.858586,,,,,
+1,32,a,0.388889,0.85101,0.397059,0.858586,,,,,
+1,32,genetic,0.401961,0.847222,0.45915,0.861111,,,,,
+1,32,screen,0.464052,0.85101,0.514706,0.858586,,,,,
+1,32,that,0.517974,0.847222,0.54902,0.858586,,,,,
+1,32,the,0.552288,0.847222,0.576797,0.858586,,,,,
+1,32,Hyde,0.581699,0.847222,0.624183,0.861111,,,,,
+1,32,lab,0.629085,0.847222,0.653595,0.858586,,,,,
+1,32,plans,0.656863,0.847222,0.699346,0.861111,,,,,
+1,32,on,0.704248,0.85101,0.723856,0.858586,,,,,
+1,32,performing,0.728758,0.847222,0.816993,0.862374,,,,,
+1,32,to,0.821895,0.848485,0.836601,0.858586,,,,,
+1,33,identify,0.145425,0.864899,0.207516,0.878788,,,,,
+1,33,mutants,0.212418,0.866162,0.272876,0.876263,,,,,
+1,33,in,0.279412,0.864899,0.294118,0.876263,,,,,
+1,33,the,0.29902,0.864899,0.323529,0.876263,,,,,
+1,33,regeneration,0.328431,0.864899,0.426471,0.878788,,,,,
+1,33,pathway--I,0.431373,0.864899,0.51634,0.878788,,,,,
+1,33,am,0.522876,0.868687,0.545752,0.876263,,,,,
+1,33,developing,0.550654,0.864899,0.638889,0.878788,,,,,
+1,33,a,0.643791,0.868687,0.651961,0.876263,,,,,
+1,33,neuroD4:EGFP,0.655229,0.864899,0.78268,0.876263,,,,,
+1,34,transgenic,0.145425,0.882576,0.227124,0.896465,,,,,
+1,34,line,0.232026,0.882576,0.261438,0.893939,,,,,
+1,34,for,0.26634,0.881313,0.289216,0.893939,,,,,
+1,34,use,0.294118,0.885101,0.320261,0.893939,,,,,
+1,34,in,0.325163,0.882576,0.339869,0.893939,,,,,
+1,34,this,0.344771,0.882576,0.372549,0.893939,,,,,
+1,34,screen,0.379085,0.885101,0.428105,0.893939,,,,,
+1,34,and,0.433007,0.882576,0.460784,0.893939,,,,,
+1,34,I,0.46732,0.882576,0.472222,0.893939,,,,,
+1,34,am,0.478758,0.885101,0.501634,0.893939,,,,,
+1,34,characterizing,0.506536,0.882576,0.617647,0.896465,,,,,
+1,34,the,0.622549,0.882576,0.647059,0.893939,,,,,
+1,34,extent,0.651961,0.883838,0.699346,0.892677,,,,,
+1,34,of,0.704248,0.882576,0.722222,0.893939,,,,,
+1,34,damage,0.72549,0.882576,0.785948,0.896465,,,,,
+1,34,and,0.79085,0.882576,0.820261,0.893939,,,,,
+2,1,regeneration,0.145425,0.093434,0.243464,0.107323,,,,,
+2,1,in,0.248366,0.093434,0.264706,0.104798,,,,,
+2,1,sheer,0.267974,0.093434,0.312092,0.104798,,,,,
+2,1,zebrafish,0.316993,0.093434,0.387255,0.104798,,,,,
+2,1,retinas.,0.392157,0.093434,0.449346,0.104798,,,,,
+2,1,"Finally,",0.455882,0.093434,0.514706,0.107323,,,,,
+2,1,I,0.521242,0.093434,0.527778,0.104798,,,,,
+2,1,am,0.53268,0.097222,0.555556,0.104798,,,,,
+2,1,characterizing,0.560458,0.093434,0.671569,0.107323,,,,,
+2,1,the,0.676471,0.093434,0.70098,0.104798,,,,,
+2,1,chx10:EGFP,0.705882,0.093434,0.808824,0.104798,,,,,
+2,2,transgenic,0.145425,0.111111,0.227124,0.125,,,,,
+2,2,line,0.232026,0.111111,0.261438,0.122475,,,,,
+2,2,during,0.26634,0.111111,0.316993,0.125,,,,,
+2,2,retinal,0.321895,0.111111,0.372549,0.122475,,,,,
+2,2,development,0.377451,0.111111,0.478758,0.125,,,,,
+2,2,and,0.48366,0.111111,0.511438,0.122475,,,,,
+2,2,regeneration.,0.51634,0.111111,0.617647,0.125,,,,,
+2,3,Please,0.145425,0.146465,0.196078,0.157828,,,,,
+2,3,find,0.20098,0.146465,0.232026,0.157828,,,,,
+2,3,my,0.236928,0.150253,0.263072,0.160354,,,,,
+2,3,CV,0.267974,0.146465,0.295752,0.157828,,,,,
+2,3,attached.,0.29902,0.146465,0.369281,0.157828,,,,,
+2,4,Thank,0.145425,0.183081,0.196078,0.193182,,,,,
+2,4,you,0.20098,0.185606,0.230392,0.19697,,,,,
+2,4,for,0.235294,0.181818,0.25817,0.193182,,,,,
+2,4,your,0.263072,0.185606,0.29902,0.19697,,,,,
+2,4,"time,",0.303922,0.181818,0.343137,0.195707,,,,,
+2,5,--Lauren,0.147059,0.218434,0.215686,0.229798,,,,,
+2,5,Lilley,0.218954,0.218434,0.26634,0.232323,,,,,
diff --git a/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv b/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv
new file mode 100644
index 0000000000000000000000000000000000000000..1fa38f603fe68e30cec63478e138a5499560b702
--- /dev/null
+++ b/example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv
@@ -0,0 +1,15 @@
+image,page,label,color,xmin,ymin,xmax,ymax,id,text
+placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.147059,0.162879,0.171569,0.174242,oJIosRHGyCRn,Dr
+placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.147059,0.162879,0.261438,0.176768,5C5tA6mfeL7T,Dr Kornbluth
+placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.162879,0.261438,0.176768,UoYN48bc2ry5,Kornbluth
+placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.30303,0.764706,0.314394,cAsjVETPEisV,Dr
+placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.30303,0.839869,0.314394,yQ5HKn4tfT7L,Dr David R.
+placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.30303,0.839869,0.314394,LR8phiOYnLWi,David R.
+placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.218954,0.566919,0.26634,0.582071,X8iObIauqZ9k,Lauren Lilley
+placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.145425,0.637626,0.171569,0.64899,SvWjK2F7R3un,Dr
+placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.145425,0.637626,0.21732,0.651515,zKJFVAOszwdM,Dr Poss
+placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.637626,0.21732,0.651515,Iqda7ixkzcmg,Poss
+placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.760101,0.764706,0.771465,TWQD93bGI3B3,Dr
+placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.760101,0.839869,0.771465,vQuQQwqWjSES,Dr David R.
+placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.760101,0.839869,0.771465,f8xf6ORJUSnG,David R.
+placeholder_image_1.png,2,NAME,"(0, 0, 0)",0.218954,0.218434,0.26634,0.232323,N0nje9UiCzZK,Lauren Lilley
diff --git a/example_data/graduate-job-example-cover-letter.pdf b/example_data/graduate-job-example-cover-letter.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..1137c80bc4a463513879a64d2ce29f1a0de1ef8f
--- /dev/null
+++ b/example_data/graduate-job-example-cover-letter.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71cc851d41f80dd8b045af32657b76bf85dd8f72d39ae08fa43dc7a78256fe35
+size 77045
diff --git a/example_data/partnership_toolkit_redact_custom_deny_list.csv b/example_data/partnership_toolkit_redact_custom_deny_list.csv
new file mode 100644
index 0000000000000000000000000000000000000000..9f86e677beef24a9176464d43ecd4c6a126c876a
--- /dev/null
+++ b/example_data/partnership_toolkit_redact_custom_deny_list.csv
@@ -0,0 +1,2 @@
+Friendship City
+United States
diff --git a/example_data/partnership_toolkit_redact_some_pages.csv b/example_data/partnership_toolkit_redact_some_pages.csv
new file mode 100644
index 0000000000000000000000000000000000000000..43266aeb85729796ed189b7aa48528894579bf3d
--- /dev/null
+++ b/example_data/partnership_toolkit_redact_some_pages.csv
@@ -0,0 +1,2 @@
+2
+5
diff --git a/example_data/test_allow_list_graduate.csv b/example_data/test_allow_list_graduate.csv
new file mode 100644
index 0000000000000000000000000000000000000000..3e538c018fb417db10053db8cd944f0194c88c47
--- /dev/null
+++ b/example_data/test_allow_list_graduate.csv
@@ -0,0 +1 @@
+Wilson
diff --git a/example_data/test_allow_list_partnership.csv b/example_data/test_allow_list_partnership.csv
new file mode 100644
index 0000000000000000000000000000000000000000..0c14e43e0fa59cbbd4692c837705bf5e21d493a2
--- /dev/null
+++ b/example_data/test_allow_list_partnership.csv
@@ -0,0 +1 @@
+akaplan@sister-cities.org
diff --git a/favicon.png b/favicon.png
new file mode 100644
index 0000000000000000000000000000000000000000..81f48b2346e28bf09aecec8c5d82d36e47f5c9c3
--- /dev/null
+++ b/favicon.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49b53f802a66a482b87a21d4bf11891e2822eb0abe4aa4d69d917c0d8e36c1d8
+size 2508
diff --git a/index.qmd b/index.qmd
new file mode 100644
index 0000000000000000000000000000000000000000..7eda56c7b567ebf2e6842a54187cc5deacb5009d
--- /dev/null
+++ b/index.qmd
@@ -0,0 +1,23 @@
+---
+title: "Home"
+---
+
+version: 2.0.1
+
+Welcome to the Document Redaction App. This site provides comprehensive documentation for the application.
+
+Navigate through the sections to learn how to install, use, and manage the application. Below is a brief introduction to the app.
+
+## Document redaction
+
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](src/user_guide.qmd) for a full walkthrough of all the features in the app.
+
+
+
+To identify text in documents, the 'Local' options use PikePDF for PDFs with selectable text and Tesseract for OCR of images; these work well only for documents with typed text or scanned PDFs with clear text. Use AWS Textract to extract more complex elements, e.g. handwriting, signatures, or unclear text. For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
+
+Additional options include choosing the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole-page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
+
+NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before final use.
+
+
diff --git a/intros/long_intro.txt b/intros/long_intro.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e154cd3862d3f540fc6d18550ac62beb5109d903
--- /dev/null
+++ b/intros/long_intro.txt
@@ -0,0 +1,9 @@
+# Document redaction
+
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features in the app.
+
+To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements, e.g. handwriting, signatures, or unclear text. For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
+
+Additional options under 'Redaction settings' include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole-page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
+
+NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before final use.
\ No newline at end of file
diff --git a/intros/short_intro.txt b/intros/short_intro.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6867e892bdbb669271d30b777a9678722af97a15
--- /dev/null
+++ b/intros/short_intro.txt
@@ -0,0 +1,7 @@
+# Document redaction
+
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features and settings.
+
+To start, upload a document below (or click on an example), then click 'Extract text and redact document' to redact the document. Then, view and modify suggested redactions on the 'Review redactions' tab.
+
+NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before final use.
\ No newline at end of file
diff --git a/intros/short_intro_responsible.txt b/intros/short_intro_responsible.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ba8f3c496f3799d226448ef9e8257b4e8a387119
--- /dev/null
+++ b/intros/short_intro_responsible.txt
@@ -0,0 +1,7 @@
+# Document redaction
+
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features and settings.
+
+To start, upload a document below (or click on an example), then click 'Extract text and redact document' to redact the document. Then, view and modify suggested redactions on the 'Review redactions' tab.
+
+**NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all redacted files are reviewed and verified before final use, as you are responsible for the outputs.**
\ No newline at end of file
diff --git a/lambda_entrypoint.py b/lambda_entrypoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f9d9065a81f60118e417cafaa534590db054f34
--- /dev/null
+++ b/lambda_entrypoint.py
@@ -0,0 +1,794 @@
+import json
+import os
+
+import boto3
+from dotenv import load_dotenv
+
+# Import the main function from your CLI script
+from cli_redact import main as cli_main
+from tools.config import (
+ AWS_LLM_PII_OPTION,
+ AWS_REGION,
+ AZURE_OPENAI_API_KEY,
+ AZURE_OPENAI_INFERENCE_ENDPOINT,
+ CHOSEN_LLM_ENTITIES,
+ CHOSEN_LLM_PII_INFERENCE_METHOD,
+ CLOUD_LLM_PII_MODEL_CHOICE,
+ CLOUD_VLM_MODEL_CHOICE,
+ DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
+ DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
+ DEFAULT_INFERENCE_SERVER_PII_MODEL,
+ DEFAULT_INFERENCE_SERVER_VLM_MODEL,
+ DEFAULT_MIN_CONSECUTIVE_PAGES,
+ DEFAULT_MIN_WORD_COUNT,
+ DEFAULT_PAGE_MAX,
+ DEFAULT_PAGE_MIN,
+ EFFICIENT_OCR,
+ EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION,
+ EFFICIENT_OCR_MIN_WORDS,
+ GEMINI_API_KEY,
+ HYBRID_TEXTRACT_BEDROCK_VLM,
+ IMAGES_DPI,
+ INFERENCE_SERVER_API_URL,
+ LAMBDA_DEFAULT_USERNAME,
+ LAMBDA_EXTRACT_SIGNATURES,
+ LAMBDA_MAX_POLL_ATTEMPTS,
+ LAMBDA_POLL_INTERVAL,
+ LAMBDA_PREPARE_IMAGES,
+ LLM_MAX_NEW_TOKENS,
+ LLM_TEMPERATURE,
+ OCR_FIRST_PASS_MAX_WORKERS,
+ SUMMARY_PAGE_GROUP_MAX_WORKERS,
+)
+
+
+def _get_env_list(env_var_name: str | list[str] | None) -> list[str]:
+ """Parses a comma-separated environment variable into a list of strings."""
+ if isinstance(env_var_name, list):
+ return env_var_name
+ if env_var_name is None:
+ return []
+
+ # Handle string input
+ value = str(env_var_name).strip()
+ if not value or value == "[]":
+ return []
+
+ # Remove brackets if present (e.g., "[item1, item2]" -> "item1, item2")
+ if value.startswith("[") and value.endswith("]"):
+ value = value[1:-1]
+
+ # Remove quotes and split by comma
+ value = value.replace('"', "").replace("'", "")
+ if not value:
+ return []
+
+ # Split by comma and filter out any empty strings
+ return [s.strip() for s in value.split(",") if s.strip()]
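+
+
+# Illustrative examples of _get_env_list behaviour (entity names are example values, not real config):
+#   _get_env_list('["NAME", "EMAIL_ADDRESS"]') -> ["NAME", "EMAIL_ADDRESS"]
+#   _get_env_list("NAME, EMAIL_ADDRESS") -> ["NAME", "EMAIL_ADDRESS"]
+#   _get_env_list(None) -> []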
+
+
+def convert_string_to_boolean(value: str) -> bool:
+ """Convert string to boolean, handling various formats."""
+ if isinstance(value, bool):
+ return value
+ elif value in ["True", "1", "true", "TRUE"]:
+ return True
+ elif value in ["False", "0", "false", "FALSE"]:
+ return False
+ else:
+ raise ValueError(f"Invalid boolean value: {value}")
+
+
+print("Lambda entrypoint loading...")
+
+# Initialize S3 client outside the handler for connection reuse
+s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION", AWS_REGION))
+print("S3 client initialised")
+
+# Lambda's only writable directory is /tmp. Ensure that all temporary files are stored in this directory.
+TMP_DIR = "/tmp"
+INPUT_DIR = os.path.join(TMP_DIR, "input")
+OUTPUT_DIR = os.path.join(TMP_DIR, "output")
+os.environ["TESSERACT_DATA_FOLDER"] = os.path.join(TMP_DIR, "share/tessdata")
+os.environ["TLDEXTRACT_CACHE"] = os.path.join(TMP_DIR, "tld")
+os.environ["MPLCONFIGDIR"] = os.path.join(TMP_DIR, "matplotlib_cache")
+os.environ["GRADIO_TEMP_DIR"] = os.path.join(TMP_DIR, "gradio_tmp")
+os.environ["FEEDBACK_LOGS_FOLDER"] = os.path.join(TMP_DIR, "feedback")
+os.environ["ACCESS_LOGS_FOLDER"] = os.path.join(TMP_DIR, "logs")
+os.environ["USAGE_LOGS_FOLDER"] = os.path.join(TMP_DIR, "usage")
+os.environ["PADDLE_MODEL_PATH"] = os.path.join(TMP_DIR, "paddle_models")
+os.environ["SPACY_MODEL_PATH"] = os.path.join(TMP_DIR, "spacy_models")
+
+# Define compatible file types for processing
+COMPATIBLE_FILE_TYPES = {
+ ".pdf",
+ ".xlsx",
+ ".xls",
+ ".png",
+ ".jpeg",
+ ".csv",
+ ".parquet",
+ ".txt",
+ ".jpg",
+}
+
+
+def download_file_from_s3(bucket_name, key, download_path):
+ """Download a file from S3 to the local filesystem."""
+ try:
+ s3_client.download_file(bucket_name, key, download_path)
+ print("Successfully downloaded file from S3")
+ except Exception as e:
+ print(f"Error downloading from S3: {e}")
+ raise
+
+
+def upload_directory_to_s3(local_directory, bucket_name, s3_prefix):
+ """Upload all files from a local directory to an S3 prefix."""
+ for root, _, files in os.walk(local_directory):
+ for file_name in files:
+ local_file_path = os.path.join(root, file_name)
+ # Create a relative path to maintain directory structure if needed
+ relative_path = os.path.relpath(local_file_path, local_directory)
+ output_key = os.path.join(s3_prefix, relative_path)
+
+ try:
+ s3_client.upload_file(local_file_path, bucket_name, output_key)
+ print(
+ f"Successfully uploaded {local_file_path} to s3://{bucket_name}/{output_key}"
+ )
+ except Exception as e:
+ print(f"Error uploading to S3: {e}")
+ raise
+
+
+def lambda_handler(event, context):
+ print(f"Received event: {json.dumps(event)}")
+
+ # 1. Setup temporary directories
+ os.makedirs(INPUT_DIR, exist_ok=True)
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # 2. Extract information from the event
+ # Assumes the event is triggered by S3 and may contain an 'arguments' payload
+ try:
+ record = event["Records"][0]
+ bucket_name = record["s3"]["bucket"]["name"]
+ input_key = record["s3"]["object"]["key"]
+
+ # The user metadata can be used to pass arguments
+ # This is more robust than embedding them in the main event body
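+        # Illustrative caller-side sketch (not executed here): how a client could attach
+        # arguments when uploading the trigger file, assuming a hypothetical bucket
+        # "my-bucket" and key "input/document.pdf":
+        #   s3_client.upload_file(
+        #       "document.pdf",
+        #       "my-bucket",
+        #       "input/document.pdf",
+        #       ExtraArgs={
+        #           "Metadata": {
+        #               "arguments": json.dumps({"task": "redact", "ocr_method": "Local OCR"})
+        #           }
+        #       },
+        #   )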
+ try:
+ response = s3_client.head_object(Bucket=bucket_name, Key=input_key)
+ metadata = response.get("Metadata", dict())
+ print(f"S3 object metadata: {metadata}")
+
+ # Arguments can be passed as a JSON string in metadata
+ arguments_str = metadata.get("arguments", "{}")
+ print(f"Arguments string from metadata: '{arguments_str}'")
+
+ if arguments_str and arguments_str != "{}":
+ arguments = json.loads(arguments_str)
+ print(f"Successfully parsed arguments from metadata: {arguments}")
+ else:
+ arguments = dict()
+ print("No arguments found in metadata, using empty dictionary")
+ except Exception as e:
+ print(f"Warning: Could not parse metadata arguments: {e}")
+ print("Using empty arguments dictionary")
+ arguments = dict()
+
+ except (KeyError, IndexError) as e:
+ print(
+ f"Could not parse S3 event record: {e}. Checking for direct invocation payload."
+ )
+ # Fallback for direct invocation (e.g., from Step Functions or manual test)
+ bucket_name = event.get("bucket_name")
+ input_key = event.get("input_key")
+ arguments = event.get("arguments", dict())
+ if not all([bucket_name, input_key]):
+ raise ValueError(
+ "Missing 'bucket_name' or 'input_key' in direct invocation event."
+ )
+
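+    # Illustrative direct-invocation payload (example values only), matching the
+    # fallback parsing above:
+    #   {
+    #       "bucket_name": "my-bucket",
+    #       "input_key": "input/document.pdf",
+    #       "arguments": {"task": "redact", "pii_detector": "Local"}
+    #   }
+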
+ # print(f"Processing s3://{bucket_name}/{input_key}")
+ # print(f"With arguments: {arguments}")
+ # print(f"Arguments type: {type(arguments)}")
+
+ # Log file type information
+ file_extension = os.path.splitext(input_key)[1].lower()
+ print(f"Detected file extension: '{file_extension}'")
+
+ # 3. Download the main input file
+ input_file_path = os.path.join(INPUT_DIR, os.path.basename(input_key))
+ download_file_from_s3(bucket_name, input_key, input_file_path)
+
+ # 3.1. Validate file type compatibility
+ is_env_file = input_key.lower().endswith(".env")
+
+ if not is_env_file and file_extension not in COMPATIBLE_FILE_TYPES:
+ error_message = f"File type '{file_extension}' is not supported for processing. Compatible file types are: {', '.join(sorted(COMPATIBLE_FILE_TYPES))}"
+ print(f"ERROR: {error_message}")
+ print(f"File was not processed due to unsupported file type: {file_extension}")
+ return {
+ "statusCode": 400,
+ "body": json.dumps(
+ {
+ "error": "Unsupported file type",
+ "message": error_message,
+ "supported_types": list(COMPATIBLE_FILE_TYPES),
+ "received_type": file_extension,
+ "file_processed": False,
+ }
+ ),
+ }
+
+ print(f"File type '{file_extension}' is compatible for processing")
+ if is_env_file:
+ print("Processing .env file for configuration")
+ else:
+ print(f"Processing {file_extension} file for redaction/anonymization")
+
+ # 3.5. Check if the downloaded file is a .env file and handle accordingly
+ actual_input_file_path = input_file_path
+ if input_key.lower().endswith(".env"):
+ print("Detected .env file, loading environment variables...")
+
+ # Load environment variables from the .env file
+ print(f"Loading .env file from: {input_file_path}")
+
+ # Check if file exists and is readable
+ if os.path.exists(input_file_path):
+ print(".env file exists and is readable")
+ with open(input_file_path, "r") as f:
+ content = f.read()
+ print(f".env file content preview: {content[:200]}...")
+ else:
+ print(f"ERROR: .env file does not exist at {input_file_path}")
+
+ load_dotenv(input_file_path, override=True)
+ print("Environment variables loaded from .env file")
+
+ # Extract the actual input file path from environment variables
+ # Look for common environment variable names that might contain the input file path
+ env_input_file = os.getenv(
+ "INPUT_FILE"
+        )  # This needs to be the full S3 path to the input file, e.g. INPUT_FILE=s3://my-processing-bucket/documents/sensitive-data.pdf
+
+ if env_input_file:
+ print(f"Found input file path in environment: {env_input_file}")
+
+ # If the path is an S3 path, download it
+ if env_input_file.startswith("s3://"):
+ # Parse S3 path: s3://bucket/key
+ s3_path_parts = env_input_file[5:].split("/", 1)
+ if len(s3_path_parts) == 2:
+ env_bucket = s3_path_parts[0]
+ env_key = s3_path_parts[1]
+ actual_input_file_path = os.path.join(
+ INPUT_DIR, os.path.basename(env_key)
+ )
+ print(
+ f"Downloading actual input file from s3://{env_bucket}/{env_key}"
+ )
+ download_file_from_s3(env_bucket, env_key, actual_input_file_path)
+ else:
+ print("Warning: Invalid S3 path format in environment variable")
+ actual_input_file_path = input_file_path
+ else:
+ # Assume it's a local path or relative path
+ actual_input_file_path = env_input_file
+ print(
+ f"Using input file path from environment: {actual_input_file_path}"
+ )
+ else:
+ print("Warning: No input file path found in environment variables")
+ print(
+ "Available environment variables:",
+ [
+ k
+ for k in os.environ.keys()
+ if k.startswith(("INPUT", "FILE", "DOCUMENT", "DIRECT"))
+ ],
+ )
+ # Fall back to using the .env file itself (though this might not be what we want)
+ actual_input_file_path = input_file_path
+ else:
+ print("File is not a .env file, proceeding with normal processing")
+
+ # 4. Prepare arguments for the CLI function
+ # This dictionary should mirror the one in your app.py's "direct mode"
+ # If we loaded a .env file, use environment variables as defaults
+
+ # Note: For task "combine_review_pdfs" the CLI expects multiple input file paths.
+ # Lambda currently passes a single file (the S3-triggered object). To support
+ # combine_review_pdfs here, the event would need to supply multiple keys or
+ # arguments["input_files"] (list of S3 URIs) and download each before calling cli_main.
+ # For task "summarise", PDF input is supported: the CLI extracts text via ocr_method
+ # then summarises (same as direct mode and CLI --task summarise --input_file file.pdf).
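+    # Illustrative "arguments" metadata for a summarise run (example values only):
+    #   {"task": "summarise", "ocr_method": "Local OCR", "summarisation_format": "detailed"}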
+ cli_args = {
+ # Task Selection
+ "task": arguments.get("task", os.getenv("DIRECT_MODE_TASK", "redact")),
+ # General Arguments (apply to all file types)
+ "input_file": actual_input_file_path,
+ "output_dir": OUTPUT_DIR,
+ "input_dir": INPUT_DIR,
+ "language": arguments.get("language", os.getenv("DEFAULT_LANGUAGE", "en")),
+ "allow_list": arguments.get("allow_list", os.getenv("ALLOW_LIST_PATH", "")),
+ "pii_detector": arguments.get(
+ "pii_detector", os.getenv("LOCAL_PII_OPTION", "Local")
+ ),
+ "username": arguments.get(
+ "username", os.getenv("DIRECT_MODE_DEFAULT_USER", LAMBDA_DEFAULT_USERNAME)
+ ),
+ "save_to_user_folders": convert_string_to_boolean(
+ arguments.get(
+ "save_to_user_folders", os.getenv("SESSION_OUTPUT_FOLDER", "False")
+ )
+ ),
+ "local_redact_entities": _get_env_list(
+ arguments.get(
+ "local_redact_entities", os.getenv("CHOSEN_REDACT_ENTITIES", list())
+ )
+ ),
+ "aws_redact_entities": _get_env_list(
+ arguments.get(
+ "aws_redact_entities", os.getenv("CHOSEN_COMPREHEND_ENTITIES", list())
+ )
+ ),
+ "aws_access_key": None, # Use IAM Role instead of keys
+ "aws_secret_key": None, # Use IAM Role instead of keys
+ "cost_code": arguments.get("cost_code", os.getenv("DEFAULT_COST_CODE", "")),
+ "aws_region": os.getenv("AWS_REGION", ""),
+ "s3_bucket": bucket_name,
+ "do_initial_clean": arguments.get(
+ "do_initial_clean",
+ convert_string_to_boolean(
+ os.getenv("DO_INITIAL_TABULAR_DATA_CLEAN", "False")
+ ),
+ ),
+ "save_logs_to_csv": convert_string_to_boolean(
+ arguments.get("save_logs_to_csv", os.getenv("SAVE_LOGS_TO_CSV", "True"))
+ ),
+ "save_logs_to_dynamodb": arguments.get(
+ "save_logs_to_dynamodb",
+ convert_string_to_boolean(os.getenv("SAVE_LOGS_TO_DYNAMODB", "False")),
+ ),
+ "display_file_names_in_logs": convert_string_to_boolean(
+ arguments.get(
+ "display_file_names_in_logs",
+ os.getenv("DISPLAY_FILE_NAMES_IN_LOGS", "True"),
+ )
+ ),
+ "upload_logs_to_s3": convert_string_to_boolean(
+ arguments.get("upload_logs_to_s3", os.getenv("RUN_AWS_FUNCTIONS", "False"))
+ ),
+ "s3_logs_prefix": arguments.get(
+ "s3_logs_prefix", os.getenv("S3_USAGE_LOGS_FOLDER", "")
+ ),
+ "feedback_logs_folder": arguments.get(
+ "feedback_logs_folder",
+ os.getenv("FEEDBACK_LOGS_FOLDER", os.environ["FEEDBACK_LOGS_FOLDER"]),
+ ),
+ "access_logs_folder": arguments.get(
+ "access_logs_folder",
+ os.getenv("ACCESS_LOGS_FOLDER", os.environ["ACCESS_LOGS_FOLDER"]),
+ ),
+ "usage_logs_folder": arguments.get(
+ "usage_logs_folder",
+ os.getenv("USAGE_LOGS_FOLDER", os.environ["USAGE_LOGS_FOLDER"]),
+ ),
+ "paddle_model_path": arguments.get(
+ "paddle_model_path",
+ os.getenv("PADDLE_MODEL_PATH", os.environ["PADDLE_MODEL_PATH"]),
+ ),
+ "spacy_model_path": arguments.get(
+ "spacy_model_path",
+ os.getenv("SPACY_MODEL_PATH", os.environ["SPACY_MODEL_PATH"]),
+ ),
+ # PDF/Image Redaction Arguments
+ "ocr_method": arguments.get("ocr_method", os.getenv("OCR_METHOD", "Local OCR")),
+ "page_min": int(
+ arguments.get("page_min", os.getenv("DEFAULT_PAGE_MIN", DEFAULT_PAGE_MIN))
+ ),
+ "page_max": int(
+ arguments.get("page_max", os.getenv("DEFAULT_PAGE_MAX", DEFAULT_PAGE_MAX))
+ ),
+ "images_dpi": float(
+ arguments.get("images_dpi", os.getenv("IMAGES_DPI", IMAGES_DPI))
+ ),
+ "chosen_local_ocr_model": arguments.get(
+ "chosen_local_ocr_model", os.getenv("DEFAULT_LOCAL_OCR_MODEL", "tesseract")
+ ),
+ "preprocess_local_ocr_images": convert_string_to_boolean(
+ arguments.get(
+ "preprocess_local_ocr_images",
+ os.getenv("PREPROCESS_LOCAL_OCR_IMAGES", "True"),
+ )
+ ),
+ "compress_redacted_pdf": convert_string_to_boolean(
+ arguments.get(
+ "compress_redacted_pdf", os.getenv("COMPRESS_REDACTED_PDF", "True")
+ )
+ ),
+ "return_pdf_end_of_redaction": convert_string_to_boolean(
+ arguments.get(
+ "return_pdf_end_of_redaction", os.getenv("RETURN_REDACTED_PDF", "True")
+ )
+ ),
+ "deny_list_file": arguments.get(
+ "deny_list_file", os.getenv("DENY_LIST_PATH", "")
+ ),
+ "allow_list_file": arguments.get(
+ "allow_list_file", os.getenv("ALLOW_LIST_PATH", "")
+ ),
+ "redact_whole_page_file": arguments.get(
+ "redact_whole_page_file", os.getenv("WHOLE_PAGE_REDACTION_LIST_PATH", "")
+ ),
+ "handwrite_signature_extraction": _get_env_list(
+ arguments.get(
+ "handwrite_signature_extraction",
+ os.getenv(
+ "DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX",
+ ["Extract handwriting", "Extract signatures"],
+ ),
+ )
+ ),
+ "extract_forms": convert_string_to_boolean(
+ arguments.get(
+ "extract_forms",
+ os.getenv("INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION", "False"),
+ )
+ ),
+ "extract_tables": convert_string_to_boolean(
+ arguments.get(
+ "extract_tables",
+ os.getenv("INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION", "False"),
+ )
+ ),
+ "extract_layout": convert_string_to_boolean(
+ arguments.get(
+ "extract_layout",
+ os.getenv("INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION", "False"),
+ )
+ ),
+ # VLM OCR Arguments
+ "vlm_model_choice": arguments.get(
+ "vlm_model_choice",
+ os.getenv("CLOUD_VLM_MODEL_CHOICE", CLOUD_VLM_MODEL_CHOICE),
+ ),
+ "inference_server_vlm_model": arguments.get(
+ "inference_server_vlm_model",
+ os.getenv(
+ "DEFAULT_INFERENCE_SERVER_VLM_MODEL", DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ ),
+ ),
+ "inference_server_api_url": arguments.get(
+ "inference_server_api_url",
+ os.getenv("INFERENCE_SERVER_API_URL", INFERENCE_SERVER_API_URL),
+ ),
+ "gemini_api_key": arguments.get(
+ "gemini_api_key", os.getenv("GEMINI_API_KEY", GEMINI_API_KEY)
+ ),
+ "azure_openai_api_key": arguments.get(
+ "azure_openai_api_key",
+ os.getenv("AZURE_OPENAI_API_KEY", AZURE_OPENAI_API_KEY),
+ ),
+ "azure_openai_endpoint": arguments.get(
+ "azure_openai_endpoint",
+ os.getenv(
+ "AZURE_OPENAI_INFERENCE_ENDPOINT", AZURE_OPENAI_INFERENCE_ENDPOINT
+ ),
+ ),
+ "ocr_first_pass_max_workers": int(
+ arguments.get(
+ "ocr_first_pass_max_workers",
+ os.getenv(
+ "OCR_FIRST_PASS_MAX_WORKERS", str(OCR_FIRST_PASS_MAX_WORKERS)
+ ),
+ )
+ ),
+ "efficient_ocr": convert_string_to_boolean(
+ arguments.get(
+ "efficient_ocr", os.getenv("EFFICIENT_OCR", str(EFFICIENT_OCR))
+ )
+ ),
+ "efficient_ocr_min_words": int(
+ arguments.get(
+ "efficient_ocr_min_words",
+ os.getenv("EFFICIENT_OCR_MIN_WORDS", str(EFFICIENT_OCR_MIN_WORDS)),
+ )
+ ),
+ "efficient_ocr_min_image_coverage_fraction": float(
+ arguments.get(
+ "efficient_ocr_min_image_coverage_fraction",
+ os.getenv(
+ "EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION",
+ str(EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION),
+ ),
+ )
+ ),
+ "hybrid_textract_bedrock_vlm": convert_string_to_boolean(
+ arguments.get(
+ "hybrid_textract_bedrock_vlm",
+ os.getenv(
+ "HYBRID_TEXTRACT_BEDROCK_VLM", str(HYBRID_TEXTRACT_BEDROCK_VLM)
+ ),
+ )
+ ),
+ # LLM PII Detection Arguments
+ # Note: The actual model used is determined by pii_identification_method in the downstream code
+ # This is just the default - it will be overridden based on the selected PII method
+ "llm_model_choice": arguments.get(
+ "llm_model_choice",
+ os.getenv("CLOUD_LLM_PII_MODEL_CHOICE", CLOUD_LLM_PII_MODEL_CHOICE),
+ ),
+ "llm_inference_method": arguments.get(
+ "llm_inference_method",
+ os.getenv(
+ "CHOSEN_LLM_PII_INFERENCE_METHOD", CHOSEN_LLM_PII_INFERENCE_METHOD
+ ),
+ ),
+ "inference_server_pii_model": arguments.get(
+ "inference_server_pii_model",
+ os.getenv(
+ "DEFAULT_INFERENCE_SERVER_PII_MODEL", DEFAULT_INFERENCE_SERVER_PII_MODEL
+ ),
+ ),
+ "llm_temperature": float(
+ arguments.get(
+ "llm_temperature",
+ os.getenv("LLM_TEMPERATURE", LLM_TEMPERATURE),
+ )
+ ),
+ "llm_max_tokens": int(
+ arguments.get(
+ "llm_max_tokens",
+ os.getenv("LLM_MAX_NEW_TOKENS", LLM_MAX_NEW_TOKENS),
+ )
+ ),
+ "llm_redact_entities": _get_env_list(
+ arguments.get(
+ "llm_redact_entities",
+ os.getenv("CHOSEN_LLM_ENTITIES", CHOSEN_LLM_ENTITIES),
+ )
+ ),
+ "custom_llm_instructions": arguments.get(
+ "custom_llm_instructions", os.getenv("CUSTOM_LLM_INSTRUCTIONS", "")
+ ),
+ # Document Summarisation Arguments (used when task is summarise)
+ "summarisation_inference_method": arguments.get(
+ "summarisation_inference_method",
+ os.getenv("SUMMARISATION_INFERENCE_METHOD", AWS_LLM_PII_OPTION),
+ ),
+ "summarisation_temperature": float(
+ arguments.get(
+ "summarisation_temperature",
+ os.getenv("SUMMARISATION_TEMPERATURE", "0.6"),
+ )
+ ),
+ "summarisation_max_pages_per_group": int(
+ arguments.get(
+ "summarisation_max_pages_per_group",
+ os.getenv("SUMMARISATION_MAX_PAGES_PER_GROUP", "30"),
+ )
+ ),
+ "summary_page_group_max_workers": int(
+ arguments.get(
+ "summary_page_group_max_workers",
+ os.getenv(
+ "SUMMARY_PAGE_GROUP_MAX_WORKERS",
+ str(SUMMARY_PAGE_GROUP_MAX_WORKERS),
+ ),
+ )
+ ),
+ "summarisation_api_key": arguments.get(
+ "summarisation_api_key", os.getenv("SUMMARISATION_API_KEY", "")
+ ),
+ "summarisation_context": arguments.get(
+ "summarisation_context", os.getenv("SUMMARISATION_CONTEXT", "")
+ ),
+ "summarisation_format": arguments.get(
+ "summarisation_format", os.getenv("SUMMARISATION_FORMAT", "detailed")
+ ),
+ "summarisation_additional_instructions": arguments.get(
+ "summarisation_additional_instructions",
+ os.getenv("SUMMARISATION_ADDITIONAL_INSTRUCTIONS", ""),
+ ),
+ # Word/Tabular Anonymisation Arguments
+ "anon_strategy": arguments.get(
+ "anon_strategy",
+ os.getenv("DEFAULT_TABULAR_ANONYMISATION_STRATEGY", "redact completely"),
+ ),
+ "text_columns": arguments.get(
+ "text_columns", _get_env_list(os.getenv("DEFAULT_TEXT_COLUMNS", list()))
+ ),
+ "excel_sheets": arguments.get(
+ "excel_sheets", _get_env_list(os.getenv("DEFAULT_EXCEL_SHEETS", list()))
+ ),
+ "fuzzy_mistakes": int(
+ arguments.get(
+ "fuzzy_mistakes",
+ os.getenv(
+ "DEFAULT_FUZZY_SPELLING_MISTAKES_NUM",
+ DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
+ ),
+ )
+ ),
+ "match_fuzzy_whole_phrase_bool": convert_string_to_boolean(
+ arguments.get(
+ "match_fuzzy_whole_phrase_bool",
+ os.getenv("MATCH_FUZZY_WHOLE_PHRASE_BOOL", "True"),
+ )
+ ),
+ # Duplicate Detection Arguments
+ "duplicate_type": arguments.get(
+ "duplicate_type", os.getenv("DIRECT_MODE_DUPLICATE_TYPE", "pages")
+ ),
+ "similarity_threshold": float(
+ arguments.get(
+ "similarity_threshold",
+ os.getenv(
+ "DEFAULT_DUPLICATE_DETECTION_THRESHOLD",
+ DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
+ ),
+ )
+ ),
+ "min_word_count": int(
+ arguments.get(
+ "min_word_count",
+ os.getenv("DEFAULT_MIN_WORD_COUNT", DEFAULT_MIN_WORD_COUNT),
+ )
+ ),
+ "min_consecutive_pages": int(
+ arguments.get(
+ "min_consecutive_pages",
+ os.getenv(
+ "DEFAULT_MIN_CONSECUTIVE_PAGES", DEFAULT_MIN_CONSECUTIVE_PAGES
+ ),
+ )
+ ),
+ "greedy_match": convert_string_to_boolean(
+ arguments.get(
+ "greedy_match", os.getenv("USE_GREEDY_DUPLICATE_DETECTION", "False")
+ )
+ ),
+ "combine_pages": convert_string_to_boolean(
+ arguments.get("combine_pages", os.getenv("DEFAULT_COMBINE_PAGES", "True"))
+ ),
+ "remove_duplicate_rows": convert_string_to_boolean(
+ arguments.get(
+ "remove_duplicate_rows", os.getenv("REMOVE_DUPLICATE_ROWS", "False")
+ )
+ ),
+ # Textract Batch Operations Arguments
+ "textract_action": arguments.get("textract_action", ""),
+ "job_id": arguments.get("job_id", ""),
+ "extract_signatures": convert_string_to_boolean(
+ arguments.get("extract_signatures", str(LAMBDA_EXTRACT_SIGNATURES))
+ ),
+ "textract_bucket": arguments.get(
+ "textract_bucket", os.getenv("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", "")
+ ),
+ "textract_input_prefix": arguments.get(
+ "textract_input_prefix",
+ os.getenv("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER", ""),
+ ),
+ "textract_output_prefix": arguments.get(
+ "textract_output_prefix",
+ os.getenv("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", ""),
+ ),
+ "s3_textract_document_logs_subfolder": arguments.get(
+ "s3_textract_document_logs_subfolder", os.getenv("TEXTRACT_JOBS_S3_LOC", "")
+ ),
+ "local_textract_document_logs_subfolder": arguments.get(
+ "local_textract_document_logs_subfolder",
+ os.getenv("TEXTRACT_JOBS_LOCAL_LOC", ""),
+ ),
+ "poll_interval": int(arguments.get("poll_interval", LAMBDA_POLL_INTERVAL)),
+ "max_poll_attempts": int(
+ arguments.get("max_poll_attempts", LAMBDA_MAX_POLL_ATTEMPTS)
+ ),
+ # Additional arguments that were missing
+ "search_query": arguments.get(
+ "search_query", os.getenv("DEFAULT_SEARCH_QUERY", "")
+ ),
+ "prepare_images": convert_string_to_boolean(
+ arguments.get("prepare_images", str(LAMBDA_PREPARE_IMAGES))
+ ),
+ }
+
+ # Combine extraction options
+ extraction_options = (
+ _get_env_list(cli_args["handwrite_signature_extraction"])
+ if cli_args["handwrite_signature_extraction"]
+ else list()
+ )
+ if cli_args["extract_forms"]:
+ extraction_options.append("Extract forms")
+ if cli_args["extract_tables"]:
+ extraction_options.append("Extract tables")
+ if cli_args["extract_layout"]:
+ extraction_options.append("Extract layout")
+ cli_args["handwrite_signature_extraction"] = extraction_options
+
+ # Download optional files if they are specified
+ # Note: These can be S3 keys (relative to bucket) or full s3:// paths
+ # If they're full s3:// paths, the CLI will handle them automatically
+ # If they're S3 keys (not starting with s3:// and not existing locally), download them here
+ allow_list_file = arguments.get("allow_list_file") or cli_args.get(
+ "allow_list_file"
+ )
+ if allow_list_file:
+ # Check if it's a full S3 path (s3://bucket/key)
+ if allow_list_file.startswith("s3://"):
+ # Let the CLI handle it - don't download here
+ cli_args["allow_list_file"] = allow_list_file
+ elif os.path.exists(allow_list_file) or os.path.isabs(allow_list_file):
+ # It's already a local absolute path or exists - use it as-is
+ cli_args["allow_list_file"] = allow_list_file
+ else:
+ # Assume it's an S3 key (relative to bucket) - download it
+ allow_list_path = os.path.join(INPUT_DIR, "allow_list.csv")
+ download_file_from_s3(bucket_name, allow_list_file, allow_list_path)
+ cli_args["allow_list_file"] = allow_list_path
+
+ deny_list_file = arguments.get("deny_list_file") or cli_args.get("deny_list_file")
+ if deny_list_file:
+ # Check if it's a full S3 path (s3://bucket/key)
+ if deny_list_file.startswith("s3://"):
+ # Let the CLI handle it - don't download here
+ cli_args["deny_list_file"] = deny_list_file
+ elif os.path.exists(deny_list_file) or os.path.isabs(deny_list_file):
+ # It's already a local absolute path or exists - use it as-is
+ cli_args["deny_list_file"] = deny_list_file
+ else:
+ # Assume it's an S3 key (relative to bucket) - download it
+ deny_list_path = os.path.join(INPUT_DIR, "deny_list.csv")
+ download_file_from_s3(bucket_name, deny_list_file, deny_list_path)
+ cli_args["deny_list_file"] = deny_list_path
+
+ redact_whole_page_file = arguments.get("redact_whole_page_file") or cli_args.get(
+ "redact_whole_page_file"
+ )
+ if redact_whole_page_file:
+ # Check if it's a full S3 path (s3://bucket/key)
+ if redact_whole_page_file.startswith("s3://"):
+ # Let the CLI handle it - don't download here
+ cli_args["redact_whole_page_file"] = redact_whole_page_file
+ elif os.path.exists(redact_whole_page_file) or os.path.isabs(
+ redact_whole_page_file
+ ):
+ # It's already a local absolute path or exists - use it as-is
+ cli_args["redact_whole_page_file"] = redact_whole_page_file
+ else:
+ # Assume it's an S3 key (relative to bucket) - download it
+ redact_whole_page_path = os.path.join(INPUT_DIR, "redact_whole_page.csv")
+ download_file_from_s3(
+ bucket_name, redact_whole_page_file, redact_whole_page_path
+ )
+ cli_args["redact_whole_page_file"] = redact_whole_page_path
+
+ # 5. Execute the main application logic
+ try:
+ print("--- Starting CLI Redact Main Function ---")
+ cli_main(direct_mode_args=cli_args)
+ print("--- CLI Redact Main Function Finished ---")
+ except Exception as e:
+ print(f"An error occurred during CLI execution: {e}")
+ # Optionally, re-raise the exception to make the Lambda fail
+ raise
+
+ # 6. Upload results back to S3
+ output_s3_prefix = f"output/{os.path.splitext(os.path.basename(input_key))[0]}"
+ print(
+ f"Uploading contents of {OUTPUT_DIR} to s3://{bucket_name}/{output_s3_prefix}/"
+ )
+ upload_directory_to_s3(OUTPUT_DIR, bucket_name, output_s3_prefix)
+
+ return {
+ "statusCode": 200,
+ "body": json.dumps(
+ f"Processing complete for {input_key}. Output saved to s3://{bucket_name}/{output_s3_prefix}/"
+ ),
+ }
diff --git a/load_dynamo_logs.py b/load_dynamo_logs.py
new file mode 100644
index 0000000000000000000000000000000000000000..271a79afc407ca96f4158569604fc36f8dd3e07e
--- /dev/null
+++ b/load_dynamo_logs.py
@@ -0,0 +1,278 @@
+import argparse
+import csv
+import datetime
+import os
+from decimal import Decimal
+
+import boto3
+
+from tools.config import (
+ AWS_ACCESS_KEY,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ OUTPUT_FOLDER,
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
+)
+
+
+def parse_args():
+ """Parse command-line arguments; config values are used as defaults."""
+ parser = argparse.ArgumentParser(
+ description="Export DynamoDB usage log table to CSV."
+ )
+ parser.add_argument(
+ "--table",
+ default=USAGE_LOG_DYNAMODB_TABLE_NAME,
+ help=f"DynamoDB table name (default from config: {USAGE_LOG_DYNAMODB_TABLE_NAME!r})",
+ )
+ parser.add_argument(
+ "--region",
+ default=AWS_REGION,
+ help=f"AWS region (default from config: {AWS_REGION!r})",
+ )
+ parser.add_argument(
+ "--output",
+ default=None,
+ metavar="PATH",
+ help="Full output CSV path (overrides --output-folder and --output-filename if set)",
+ )
+ parser.add_argument(
+ "--output-folder",
+ default=OUTPUT_FOLDER,
+ metavar="DIR",
+ help=f"Output folder for the CSV (default from config: {OUTPUT_FOLDER!r})",
+ )
+ parser.add_argument(
+ "--output-filename",
+ default="dynamodb_logs_export.csv",
+ metavar="NAME",
+ help="Output CSV file name (default: dynamodb_logs_export.csv)",
+ )
+ parser.add_argument(
+ "--from-date",
+ dest="from_date",
+ default=None,
+ metavar="YYYY-MM-DD",
+ help="Only include entries on or after this date (optional)",
+ )
+ parser.add_argument(
+ "--to-date",
+ dest="to_date",
+ default=None,
+ metavar="YYYY-MM-DD",
+ help="Only include entries on or before this date (optional)",
+ )
+ parser.add_argument(
+ "--date-attribute",
+ default="timestamp",
+ help="DynamoDB attribute name used for date filtering (default: timestamp)",
+ )
+ parser.add_argument(
+ "--s3-output-bucket",
+ default=None,
+ metavar="BUCKET",
+ help="If set (with --s3-output-key), upload the output CSV to this S3 bucket",
+ )
+ parser.add_argument(
+ "--s3-output-key",
+ default=None,
+ metavar="KEY",
+ help="S3 object key (path) for the output CSV when using --s3-output-bucket",
+ )
+ return parser.parse_args()
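+
+
+# Illustrative usage (table name, region, and dates are example values, not real config):
+#   python load_dynamo_logs.py --table redaction-usage-logs --region us-east-1 \
+#       --from-date 2024-01-01 --to-date 2024-12-31 --output-filename usage_2024.csv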
+
+
+# Helper function to convert Decimal to float or int
+def convert_types(item):
+ new_item = {}
+ for key, value in item.items():
+ # Handle Decimals first
+ if isinstance(value, Decimal):
+ new_item[key] = int(value) if value % 1 == 0 else float(value)
+ # Handle Strings that might be dates
+ elif isinstance(value, str):
+ try:
+ # Attempt to parse a common ISO 8601 format.
+ # The .replace() handles the 'Z' for Zulu/UTC time.
+ dt_obj = datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
+ # Now that we have a datetime object, format it as desired
+ new_item[key] = dt_obj.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+ except (ValueError, TypeError):
+ # If it fails to parse, it's just a regular string
+ new_item[key] = value
+ # Handle all other types
+ else:
+ new_item[key] = value
+ return new_item
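+
+
+# Illustrative example of convert_types (keys and values are hypothetical):
+#   {"number_of_pages": Decimal("3"), "timestamp": "2024-05-01T12:00:00Z"}
+#   -> {"number_of_pages": 3, "timestamp": "2024-05-01 12:00:00.000"}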
+
+
+def _parse_item_date(value):
+ """Parse a DynamoDB attribute value to datetime for comparison. Returns None if unparseable."""
+ if value is None:
+ return None
+ # Decimal (DynamoDB number type, e.g. Unix timestamp)
+ if isinstance(value, Decimal):
+ try:
+ ts = float(value)
+ return datetime.datetime.utcfromtimestamp(ts)
+ except (ValueError, OSError):
+ return None
+ if isinstance(value, (int, float)):
+ try:
+ return datetime.datetime.utcfromtimestamp(float(value))
+ except (ValueError, OSError):
+ return None
+ # String: try ISO and common formats
+ if isinstance(value, str):
+ for fmt in (
+ "%Y-%m-%d %H:%M:%S.%f",
+ "%Y-%m-%d %H:%M:%S",
+ "%Y-%m-%d",
+ "%Y-%m-%dT%H:%M:%S",
+ ):
+ try:
+ return datetime.datetime.strptime(value, fmt)
+ except (ValueError, TypeError):
+ continue
+ try:
+ # Handles ISO with Z or +00:00
+ return datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
+ except (ValueError, TypeError):
+ pass
+ return None
+
+
+def filter_items_by_date(items, from_date, to_date, date_attribute: str):
+ """Return items whose date attribute falls within [from_date, to_date] (inclusive)."""
+ if from_date is None and to_date is None:
+ return items
+ start = datetime.datetime.min
+ end = datetime.datetime.max
+ if from_date is not None:
+ start = datetime.datetime.combine(from_date, datetime.time.min)
+ if to_date is not None:
+ end = datetime.datetime.combine(to_date, datetime.time.max)
+ filtered = []
+ for item in items:
+ raw = item.get(date_attribute)
+ dt = _parse_item_date(raw)
+ if dt is None:
+ continue
+ # Normalize to naive for comparison if needed
+ if dt.tzinfo:
+ dt = dt.replace(tzinfo=None)
+ if start <= dt <= end:
+ filtered.append(item)
+ return filtered
+
+
+# Paginated scan
+def scan_table(table):
+ items = []
+ response = table.scan()
+ items.extend(response["Items"])
+
+ while "LastEvaluatedKey" in response:
+ response = table.scan(ExclusiveStartKey=response["LastEvaluatedKey"])
+ items.extend(response["Items"])
+
+ return items
+
+
+# Export to CSV
+def export_to_csv(items, output_path, fields_to_drop: list = None):
+ if not items:
+ print("No items found.")
+ return
+
+ # Use a set for efficient lookup
+ drop_set = set(fields_to_drop or [])
+
+ # Get a comprehensive list of all possible headers from all items
+ all_keys = set()
+ for item in items:
+ all_keys.update(item.keys())
+
+ # Determine the final fieldnames by subtracting the ones to drop
+ fieldnames = sorted(list(all_keys - drop_set))
+
+ print("Final CSV columns will be:", fieldnames)
+
+ with open(output_path, "w", newline="", encoding="utf-8-sig") as csvfile:
+        # extrasaction='ignore' drops any item keys that are not in fieldnames,
+        # and restval='' fills in rows that are missing a key
+ writer = csv.DictWriter(
+ csvfile, fieldnames=fieldnames, extrasaction="ignore", restval=""
+ )
+ writer.writeheader()
+
+ for item in items:
+            # convert_types returns the full item dict; the writer ignores any extra fields
+ writer.writerow(convert_types(item))
+
+ print(f"Exported {len(items)} items to {output_path}")
+
+
+def main():
+ args = parse_args()
+ table_name = args.table
+ region = args.region
+ if args.output is not None:
+ csv_output = args.output
+ else:
+ csv_output = os.path.join(
+ args.output_folder.rstrip(r"\/"), args.output_filename
+ )
+
+ today = datetime.datetime.now().date()
+ one_year_ago = today - datetime.timedelta(days=365)
+
+ from_date = None
+ to_date = None
+ if args.from_date:
+ from_date = datetime.datetime.strptime(args.from_date, "%Y-%m-%d").date()
+ if args.to_date:
+ to_date = datetime.datetime.strptime(args.to_date, "%Y-%m-%d").date()
+ # Default date range: one year ago to today
+ if from_date is None and to_date is None:
+ from_date = one_year_ago
+ to_date = today
+ elif from_date is None:
+ from_date = one_year_ago
+ elif to_date is None:
+ to_date = today
+ if from_date > to_date:
+ raise ValueError("--from-date must be on or before --to-date")
+
+ dynamodb = boto3.resource("dynamodb", region_name=region)
+ table = dynamodb.Table(table_name)
+
+ items = scan_table(table)
+ items = filter_items_by_date(items, from_date, to_date, args.date_attribute)
+ print(f"Filtered to {len(items)} items in date range {from_date} to {to_date}.")
+ export_to_csv(items, csv_output, fields_to_drop=[])
+
+ if args.s3_output_bucket and args.s3_output_key:
+ if AWS_ACCESS_KEY and AWS_SECRET_KEY and region:
+ s3_client = boto3.client(
+ "s3",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=region,
+ )
+ else:
+ s3_client = boto3.client("s3", region_name=region if region else None)
+ try:
+ s3_client.upload_file(csv_output, args.s3_output_bucket, args.s3_output_key)
+ print(f"Uploaded to s3://{args.s3_output_bucket}/{args.s3_output_key}")
+ except Exception as e:
+ print(f"Failed to upload to S3: {e}")
+ elif args.s3_output_bucket or args.s3_output_key:
+ print(
+ "Warning: both --s3-output-bucket and --s3-output-key are required for S3 upload; skipping."
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/load_s3_logs.py b/load_s3_logs.py
new file mode 100644
index 0000000000000000000000000000000000000000..d30cee09b317cee55614806f05f5fad822556c80
--- /dev/null
+++ b/load_s3_logs.py
@@ -0,0 +1,187 @@
+import argparse
+import os
+from datetime import datetime, timedelta
+from io import StringIO
+
+import boto3
+import pandas as pd
+
+from tools.config import (
+ AWS_ACCESS_KEY,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ DOCUMENT_REDACTION_BUCKET,
+ OUTPUT_FOLDER,
+)
+
+# Combine log files together so that they can then be used for e.g. dashboarding and financial tracking.
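+# Example usage (bucket name, dates, and output path below are illustrative):
+#   python load_s3_logs.py --bucket my-redaction-bucket --prefix usage/ \
+#     --from-date 20240101 --to-date 20241231 \
+#     --output output/consolidated_s3_logs.csv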
+
+
+def parse_args():
+ """Parse command-line arguments; config values are used as defaults."""
+ today = datetime.now()
+ one_year_ago = (today - timedelta(days=365)).strftime("%Y%m%d")
+ today_str = today.strftime("%Y%m%d")
+
+ parser = argparse.ArgumentParser(
+ description="Combine S3 usage log CSVs in a date range into a single CSV."
+ )
+ parser.add_argument(
+ "--bucket",
+ default=DOCUMENT_REDACTION_BUCKET,
+ help=f"S3 bucket name (default from config: {DOCUMENT_REDACTION_BUCKET!r})",
+ )
+ parser.add_argument(
+ "--region",
+ default=AWS_REGION,
+ help=f"AWS region (default from config: {AWS_REGION!r})",
+ )
+ parser.add_argument(
+ "--prefix",
+ default="usage/",
+ help="S3 prefix / top-level folder where logs are stored (default: usage/)",
+ )
+ parser.add_argument(
+ "--from-date",
+ dest="earliest_date",
+ default=one_year_ago,
+ metavar="YYYYMMDD",
+ help=f"Earliest date of logs to include (default: one year ago, {one_year_ago})",
+ )
+ parser.add_argument(
+ "--to-date",
+ dest="latest_date",
+ default=today_str,
+ metavar="YYYYMMDD",
+ help=f"Latest date of logs to include (default: today, {today_str})",
+ )
+ parser.add_argument(
+ "--output",
+ default=None,
+ metavar="PATH",
+ help="Full output CSV path (overrides --output-folder and --output-filename if set)",
+ )
+ parser.add_argument(
+ "--output-folder",
+ default=OUTPUT_FOLDER,
+ metavar="DIR",
+ help=f"Output folder for the CSV (default from config: {OUTPUT_FOLDER!r})",
+ )
+ parser.add_argument(
+ "--output-filename",
+ default="consolidated_s3_logs.csv",
+ metavar="NAME",
+ help="Output CSV file name (default: consolidated_s3_logs.csv)",
+ )
+ parser.add_argument(
+ "--s3-output-bucket",
+ default=None,
+ metavar="BUCKET",
+ help="If set (with --s3-output-key), upload the output CSV to this S3 bucket",
+ )
+ parser.add_argument(
+ "--s3-output-key",
+ default=None,
+ metavar="KEY",
+ help="S3 object key (path) for the output CSV when using --s3-output-bucket",
+ )
+ return parser.parse_args()
+
+
+# Function to list all files under a prefix (paginated, as a single
+# list_objects_v2 call returns at most 1000 keys)
+def list_files_in_s3(s3_client, bucket, prefix):
+    keys = []
+    paginator = s3_client.get_paginator("list_objects_v2")
+    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
+        keys.extend(content["Key"] for content in page.get("Contents", []))
+    return keys
+
+
+# Function to filter date range
+def is_within_date_range(date_str, start_date, end_date):
+ date_obj = datetime.strptime(date_str, "%Y%m%d")
+ return start_date <= date_obj <= end_date
+
+
+def main():
+ args = parse_args()
+ bucket_name = args.bucket
+ region = args.region
+ prefix = args.prefix
+ earliest_date = args.earliest_date
+ latest_date = args.latest_date
+ if args.output is not None:
+ output_path = args.output
+ else:
+ output_path = os.path.join(
+ args.output_folder.rstrip(r"\/"), args.output_filename
+ )
+
+ # S3 setup. Use provided keys if in config, otherwise assume AWS SSO/default credentials
+ if AWS_ACCESS_KEY and AWS_SECRET_KEY and region:
+ s3 = boto3.client(
+ "s3",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=region,
+ )
+ else:
+ s3 = boto3.client("s3", region_name=region if region else None)
+
+ # Define the date range
+ start_date = datetime.strptime(earliest_date, "%Y%m%d")
+ end_date = datetime.strptime(latest_date, "%Y%m%d")
+
+ # List all files under prefix
+ all_files = list_files_in_s3(s3, bucket_name, prefix)
+
+ # Filter based on date range (expects structure like prefix/YYYYMMDD/.../log.csv)
+ log_files = []
+ for file in all_files:
+ parts = file.split("/")
+ if len(parts) >= 3:
+ date_str = parts[1]
+ if (
+ is_within_date_range(date_str, start_date, end_date)
+ and parts[-1] == "log.csv"
+ ):
+ log_files.append(file)
+
+ # Download, read and concatenate CSV files into a pandas DataFrame
+ df_list = []
+ for log_file in log_files:
+        obj = s3.get_object(Bucket=bucket_name, Key=log_file)
+        # Read the body once; a second .read() on the stream would return empty bytes
+        raw_bytes = obj["Body"].read()
+        try:
+            csv_content = raw_bytes.decode("utf-8")
+        except UnicodeDecodeError as e:
+            print("Could not decode log file:", log_file, "as UTF-8 due to:", e)
+            csv_content = raw_bytes.decode("latin-1")
+
+ try:
+ df = pd.read_csv(StringIO(csv_content))
+ except Exception as e:
+ print("Could not load in log file:", log_file, "due to:", e)
+ continue
+
+ df_list.append(df)
+
+ if df_list:
+ concatenated_df = pd.concat(df_list, ignore_index=True)
+ concatenated_df.to_csv(output_path, index=False)
+ print(f"Consolidated CSV saved to {output_path}")
+
+ if args.s3_output_bucket and args.s3_output_key:
+ try:
+ s3.upload_file(output_path, args.s3_output_bucket, args.s3_output_key)
+ print(f"Uploaded to s3://{args.s3_output_bucket}/{args.s3_output_key}")
+ except Exception as e:
+ print(f"Failed to upload to S3: {e}")
+ elif args.s3_output_bucket or args.s3_output_key:
+ print(
+ "Warning: both --s3-output-bucket and --s3-output-key are required for S3 upload; skipping."
+ )
+ else:
+ print("No log files found in the given date range.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/packages.txt b/packages.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8605b78ca922451ea2b5b23ea1acccbc562c2e9b
--- /dev/null
+++ b/packages.txt
@@ -0,0 +1,4 @@
+tesseract-ocr
+poppler-utils
+libgl1
+libglib2.0-0
\ No newline at end of file
diff --git a/pre-requirements.txt b/pre-requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f8d4962b2d15fbece60eaab5c698a74a5732a4f5
--- /dev/null
+++ b/pre-requirements.txt
@@ -0,0 +1,7 @@
+# --- PaddleOCR (CPU version installed here, for torch compatibility on HF spaces) ---
+# If you want to install the GPU version elsewhere (e.g. in your local environment), uncomment the two lines below to install paddlepaddle-gpu from https://www.paddlepaddle.org.cn/packages/stable/cu129/
+#--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
+#paddlepaddle-gpu>=3.0.0,<=3.2.1
+paddlepaddle>=3.0.0,<=3.2.1
+paddleocr<=3.3.0
+pycocotools==2.0.10
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..056f8f1a977ba3b5e301da4a7d700b5b08d11629
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,159 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "doc_redaction"
+version = "2.0.1"
+description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
+readme = "README.md"
+authors = [
+ { name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
+]
+maintainers = [
+ { name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
+]
+license = { text = "AGPL-3.0-only" } # This licence type required to use PyMuPDF
+keywords = [
+ "redaction",
+ "pdf",
+ "nlp",
+ "documents",
+ "document-processing",
+ "gradio",
+ "pii",
+ "pii-detection"
+]
+classifiers = [
+ "Development Status :: 5 - Production/Stable",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Legal Industry",
+ "Topic :: Text Processing :: General",
+ "Topic :: Security :: Cryptography",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13"
+]
+requires-python = ">=3.10"
+dependencies = [
+ "pdfminer.six<=20260107",
+ "pdf2image<=1.17.0",
+ "pymupdf<=1.27.1",
+ "bleach<=6.3.0",
+ "opencv-python<=4.13.0.92",
+ "presidio_analyzer<=2.2.361",
+ "presidio_anonymizer<=2.2.361",
+ "presidio-image-redactor<=0.0.57",
+ "pikepdf<=10.3.0",
+ "pandas<=2.3.3",
+ "scikit-learn<=1.8.0",
+ "spacy<=3.8.11",
+ "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
+ "gradio<=6.10.0",
+ "boto3<=1.42.61",
+ "pyarrow<=21.0.0",
+ "openpyxl<=3.1.5",
+ "Faker<=40.8.0",
+ "python-levenshtein<=0.27.3",
+ "spaczz<=0.6.1",
+ "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.5.3/gradio_image_annotation-0.5.3-py3-none-any.whl",
+ "rapidfuzz<=3.14.3",
+ "python-dotenv<=1.0.1",
+ "awslambdaric<=3.1.1",
+ "python-docx<=1.2.0",
+ "polars<=1.38.1",
+ "defusedxml<=0.7.1",
+ "numpy<=2.4.2",
+ "spaces<=0.48.1",
+ "google-genai<=1.66.0",
+ "openai<=2.25.0",
+ "markdown<=3.10.2",
+ "tabulate<=0.10.0"
+]
+
+[project.optional-dependencies]
+
+# For testing
+dev = ["pytest"]
+test = ["pytest", "pytest-cov"]
+
+# To install the app with paddle and vlm support via pip, run (from the base folder, in the correct Python environment): pip install .[paddle,vlm], or uv pip install .[paddle,vlm] if using uv. Note the comment under the vlm extra below about installing the GPU version of Torch.
+
+# Extra dependencies for PaddleOCR
+# The following installs the CPU version of PaddleOCR. If you want the GPU-accelerated version, pip install the relevant paddlepaddle-gpu==3.2.1 wheel from: https://www.paddlepaddle.org.cn/packages/stable/cu129/
+paddle = [
+ "protobuf<=7.34.0",
+ "paddlepaddle>=3.0.0,<=3.2.1",
+ "paddleocr<=3.3.0",
+ "pycocotools<=2.0.10",
+]
+
+# Extra dependencies for VLM models
+# The following installs the CPU-compatible version of PyTorch. For CUDA support, manually run pip install torch==2.9.1 torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu129 after installation
+vlm = [
+ "torch<=2.9.1",
+ "torchvision<=0.24.1",
+ "transformers<=5.3.0",
+ "accelerate<=1.13.0",
+ "bitsandbytes<=0.49.2", # Needed for on the fly quantisation in transformers
+ "sentencepiece<=0.2.1", # Needed for PaddleOCRVL
+ #"optimum<=2.1.0", # Needed for GPTQ quantised models in transformers. Commented out, as optional
+ #"GPTQModel<=5.8.0", # Needed for GPTQ quantised models in transformers. Highly advised to install from a wheel from https://github.com/ModelCloud/GPTQModel
+ #"flash_attn<=2.8.3", # Faster inference with transformers. Highly recommended to install from a wheel at https://github.com/Dao-AILab/flash-attention
+]
+
+# Run Gradio as an mcp server
+mcp = [
+ "gradio[mcp]<=6.9.0"
+]
+
+[project.urls]
+Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
+Repository = "https://github.com/seanpedrick-case/doc_redaction"
+
+[project.scripts]
+cli_redact = "cli_redact:main"
+
+# Configuration for Ruff linter:
+[tool.ruff]
+line-length = 88
+
+[tool.ruff.lint]
+select = ["E", "F", "I"]
+ignore = [
+ "E501", # line-too-long (handled with Black)
+ "E402", # module-import-not-at-top-of-file (sometimes needed for conditional imports)
+]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["F401"] # Allow unused imports in __init__.py
+
+# Configuration for the Black formatter:
+[tool.black]
+line-length = 88
+target-version = ['py310']
+
+# Configuration for pytest:
+[tool.pytest.ini_options]
+filterwarnings = [
+ "ignore::DeprecationWarning:click.parser",
+ "ignore::DeprecationWarning:weasel.util.config",
+ "ignore::DeprecationWarning:builtin type",
+ "ignore::DeprecationWarning:websockets.legacy",
+ "ignore::DeprecationWarning:websockets.server",
+ "ignore::DeprecationWarning:spacy.cli._util",
+ "ignore::DeprecationWarning:importlib._bootstrap",
+]
+testpaths = ["test"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+ "-v",
+ "--tb=short",
+ "--strict-markers",
+ "--disable-warnings",
+]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..975d18b3ac22a3b2761925dc8d34108f2aa77f5b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,64 @@
+# This requirements.txt file is intended for use with Hugging Face Spaces (Python 3.10), and is not the file to use for a local install. Please refer to the README.md file for local install instructions for the app.
+
+# --- Core and data packages ---
+numpy<=2.4.2
+pandas<=2.3.3
+bleach<=6.3.0
+polars<=1.38.1
+pyarrow<=21.0.0
+openpyxl<=3.1.5
+boto3<=1.42.61
+python-dotenv<=1.0.1
+defusedxml<=0.7.1
+Faker<=40.8.0
+python-levenshtein<=0.27.3
+rapidfuzz<=3.14.3
+markdown<=3.10.2
+tabulate<=0.10.0
+
+# --- PDF / OCR / Redaction tools ---
+pdfminer.six<=20260107
+pdf2image<=1.17.0
+pymupdf<=1.27.1
+pikepdf<=10.3.0
+opencv-python<=4.13.0.92
+presidio_analyzer<=2.2.361
+presidio_anonymizer<=2.2.361
+presidio-image-redactor<=0.0.57
+
+# --- Document generation ---
+python-docx<=1.2.0
+
+# --- Gradio and apps ---
+gradio<=6.10.0
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.5.3/gradio_image_annotation-0.5.3-py3-none-any.whl # Custom annotator version with rotation, zoom, labels, and box IDs
+spaces<=0.48.1
+
+# --- AWS Lambda runtime ---
+awslambdaric<=3.1.1
+
+# --- Machine learning / NLP ---
+scikit-learn<=1.8.0
+spacy<=3.8.11
+spaczz<=0.6.1
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+transformers<=5.3.0
+accelerate<=1.13.0
+bitsandbytes<=0.49.2
+sentencepiece<=0.2.1
+optimum<=2.1.0
+
+# --- Testing ---
+pytest<=7.0.0
+pytest-cov<=4.0.0
+
+# --- LLM libraries ---
+google-genai<=1.66.0
+openai<=2.25.0
+
+# --- PyTorch (CUDA 12.8) ---
+--extra-index-url https://download.pytorch.org/whl/cu128
+torch<=2.8.0
+torchvision<=0.23.0
+flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
+# GPTQModel @ https://github.com/ModelCloud/GPTQModel/releases/download/v5.8.0/gptqmodel-5.8.0+cu128torch2.8-cp310-cp310-linux_x86_64.whl # Need to install from wheel as requirements-based compile fails on Hugging Face spaces
\ No newline at end of file
diff --git a/requirements_lightweight.txt b/requirements_lightweight.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3c36c2da01a7d507b7011885e08c5d2c6a45466c
--- /dev/null
+++ b/requirements_lightweight.txt
@@ -0,0 +1,53 @@
+# --- Core and data packages ---
+numpy<=2.4.2
+pandas<=2.3.3
+bleach<=6.3.0
+polars<=1.38.1
+pyarrow<=21.0.0
+openpyxl<=3.1.5
+boto3<=1.42.61
+python-dotenv<=1.0.1
+defusedxml<=0.7.1
+Faker<=40.8.0
+python-levenshtein<=0.27.3
+rapidfuzz<=3.14.3
+markdown<=3.10.2
+tabulate<=0.10.0
+
+# --- Machine learning / NLP ---
+scikit-learn<=1.8.0
+spacy<=3.8.11
+spaczz<=0.6.1
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+
+# --- PDF / OCR / Redaction tools ---
+pdfminer.six<=20260107
+pdf2image<=1.17.0
+pymupdf<=1.27.1
+pikepdf<=10.3.0
+opencv-python<=4.13.0.92
+presidio_analyzer<=2.2.361
+presidio_anonymizer<=2.2.361
+presidio-image-redactor<=0.0.57
+
+# --- Gradio and apps ---
+gradio<=6.10.0
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.5.3/gradio_image_annotation-0.5.3-py3-none-any.whl # Custom annotator version with rotation, zoom, labels, and box IDs
+spaces<=0.48.1
+
+# --- AWS Lambda runtime ---
+awslambdaric<=3.1.1
+
+# --- Document generation ---
+python-docx<=1.2.0
+
+# --- Testing ---
+pytest<=7.0.0
+pytest-cov<=4.0.0
+
+# --- LLM libraries ---
+# Explicit protobuf: PaddlePaddle imports google.protobuf; layered `pip --target` installs
+# plus google-* packages can leave the google namespace incomplete without it.
+protobuf<=7.34.0
+google-genai<=1.66.0
+openai<=2.25.0
\ No newline at end of file
diff --git a/src/app_settings.qmd b/src/app_settings.qmd
new file mode 100644
index 0000000000000000000000000000000000000000..f9a1d1345318c5cbabeff32709d83449ad27391f
--- /dev/null
+++ b/src/app_settings.qmd
@@ -0,0 +1,1237 @@
+---
+title: "App settings management guide"
+format:
+ html:
+ toc: true
+ toc-depth: 3
+ toc-title: "On this page"
+---
+
+Settings for the redaction app can be set externally by changing values in the `.env` file stored in your local config folder, or in S3 if running on AWS. This guide provides an overview of how to configure the application using environment variables. The application loads configuration values using `os.environ.get()`. It first attempts to load variables from the file specified by `APP_CONFIG_PATH` (which defaults to `config/app_config.env`). If `AWS_CONFIG_PATH` is also set (e.g., to `config/aws_config.env`), variables are loaded from that file as well. Environment variables set directly in the system will always take precedence over those defined in these `.env` files.
+
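+For example, a minimal `config/app_config.env` might look like the following. The variable names are real settings documented on this page, but the values are purely illustrative:
+
+```
+# Illustrative values only
+RUN_AWS_FUNCTIONS=False
+SESSION_OUTPUT_FOLDER=True
+OUTPUT_FOLDER=TEMP
+MAX_FILE_SIZE=250mb
+DEFAULT_CONCURRENCY_LIMIT=3
+CSV_USAGE_LOG_HEADERS="['column_1', 'column_2']"
+```
+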
+### Value Format Notes
+
+* **Boolean Values:** Boolean environment variables accept multiple formats: `"True"`, `"1"`, `"true"`, or `"TRUE"` for true; `"False"`, `"0"`, `"false"`, or `"FALSE"` for false.
+
+* **List Values:** List environment variables should be provided as comma-separated strings within square brackets, e.g., `"['item1', 'item2', 'item3']"`. The application will automatically parse these into Python lists.
+
+* **Temporary Folders:** Setting `OUTPUT_FOLDER` or `INPUT_FOLDER` to `"TEMP"` will create a temporary directory that is automatically cleaned up when the application exits. This is useful for increased security in some deployment scenarios.
+
+## App Configuration File (`config.env`)
+
+This section details variables related to the main application configuration file.
+
+* **`CONFIG_FOLDER`**
+ * **Description:** The folder where configuration files are stored.
+ * **Default Value:** `config/`
+
+* **`APP_CONFIG_PATH`**
+ * **Description:** Specifies the path to the application configuration `.env` file. This file contains various settings that control the application's behavior.
+ * **Default Value:** `config/app_config.env`
+
+## AWS Options
+
+This section covers configurations related to AWS services used by the application.
+
+* **`AWS_CONFIG_PATH`**
+ * **Description:** Specifies the path to the AWS configuration `.env` file. This file is intended to store AWS credentials and specific settings.
+ * **Default Value:** `''` (empty string)
+
+* **`RUN_AWS_FUNCTIONS`**
+ * **Description:** Enables or disables AWS-specific functionalities within the application. Set to `"True"` to enable.
+ * **Default Value:** `"False"`
+
+* **`AWS_REGION`**
+ * **Description:** Defines the AWS region where services like S3, Cognito, and Textract are located.
+ * **Default Value:** `''`
+
+* **`AWS_CLIENT_ID`**
+ * **Description:** The client ID for AWS Cognito, used for user authentication.
+ * **Default Value:** `''`
+
+* **`AWS_CLIENT_SECRET`**
+ * **Description:** The client secret for AWS Cognito, used in conjunction with the client ID for authentication.
+ * **Default Value:** `''`
+
+* **`AWS_USER_POOL_ID`**
+ * **Description:** The user pool ID for AWS Cognito, identifying the user directory.
+ * **Default Value:** `''`
+
+* **`AWS_ACCESS_KEY`**
+ * **Description:** The AWS access key ID for programmatic access to AWS services.
+ * **Default Value:** `''`
+
+* **`AWS_SECRET_KEY`**
+ * **Description:** The AWS secret access key corresponding to the AWS access key ID.
+ * **Default Value:** `''`
+
+* **`DOCUMENT_REDACTION_BUCKET`**
+ * **Description:** The name of the S3 bucket used for storing documents related to the redaction process.
+ * **Default Value:** `''`
+
+* **`PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS`**
+ * **Description:** If set to `"True"`, the app will prioritize using AWS SSO credentials over access keys stored in environment variables.
+ * **Default Value:** `"True"`
+
+* **`CUSTOM_HEADER`**
+ * **Description:** Specifies a custom header name to be included in requests, often used for services like AWS CloudFront.
+ * **Default Value:** `''`
+
+* **`CUSTOM_HEADER_VALUE`**
+ * **Description:** The value for the custom header specified by `CUSTOM_HEADER`.
+ * **Default Value:** `''`
+
+## Image Options
+
+Settings related to image processing within the application.
+
+* **`IMAGES_DPI`**
+ * **Description:** Dots Per Inch (DPI) setting for image processing, affecting the resolution and quality of processed images.
+ * **Default Value:** `'300.0'`
+
+* **`LOAD_TRUNCATED_IMAGES`**
+ * **Description:** Controls whether the application attempts to load truncated images. Set to `'True'` to enable.
+ * **Default Value:** `'True'`
+
+* **`MAX_IMAGE_PIXELS`**
+ * **Description:** Sets the maximum number of pixels for an image that the application will process. Leave blank for no limit. This can help prevent issues with very large images.
+ * **Default Value:** `''`
+
+* **`MERGE_BOUNDING_BOXES`**
+ * **Description:** When enabled (`'True'`), merges nearby bounding boxes (reconstruction, grouping, and horizontal merge). Affects OCR/Textract result processing.
+ * **Default Value:** `'True'`
+
+* **`MAX_SPACES_GPU_RUN_TIME`**
+ * **Description:** Maximum number of seconds to run the GPU on Hugging Face Spaces. Used to limit long-running VLM/OCR jobs when deployed on Spaces.
+ * **Default Value:** `60`
+
+## File I/O Options
+
+Configuration for input and output file handling.
+
+* **`SESSION_OUTPUT_FOLDER`**
+ * **Description:** If set to `'True'`, the application will save output and input files into session-specific subfolders.
+ * **Default Value:** `'False'`
+
+* **`OUTPUT_FOLDER`** (environment variable: `GRADIO_OUTPUT_FOLDER`)
+ * **Description:** Specifies the default output folder for generated files. Can be set to `"TEMP"` to use a temporary directory.
+ * **Default Value:** `'output/'`
+
+* **`INPUT_FOLDER`** (environment variable: `GRADIO_INPUT_FOLDER`)
+ * **Description:** Specifies the default input folder for files. Can be set to `"TEMP"` to use a temporary directory.
+ * **Default Value:** `'input/'`
+
+* **`GRADIO_TEMP_DIR`**
+ * **Description:** Defines the path for Gradio's temporary file storage.
+ * **Default Value:** `''`
+
+* **`MPLCONFIGDIR`**
+ * **Description:** Specifies the cache directory for the Matplotlib library.
+ * **Default Value:** `''`
+
+## Logging Options
+
+Settings for configuring application logging.
+
+**Note:** By default, logs are stored in subfolders based on today's date and the hostname of the instance running the app (e.g., `logs/20240101/hostname/`). This prevents log files from one instance overwriting logs from another instance, which is especially important when running on S3 or in multi-instance deployments. If you're always running the app on a single system or just locally, you can disable this behavior by setting `USE_LOG_SUBFOLDERS` to `"False"`.
+
+* **`SAVE_LOGS_TO_CSV`**
+ * **Description:** Enables or disables saving logs to CSV files. Set to `'True'` to enable.
+ * **Default Value:** `'True'`
+
+* **`USE_LOG_SUBFOLDERS`**
+ * **Description:** If enabled (`'True'`), logs will be stored in subfolders based on date and hostname.
+ * **Default Value:** `'True'`
+
+* **`FEEDBACK_LOGS_FOLDER`**, **`ACCESS_LOGS_FOLDER`**, **`USAGE_LOGS_FOLDER`**
+ * **Description:** Base folders for feedback, access, and usage logs respectively.
+ * **Default Values:** `'feedback/'`, `'logs/'`, `'usage/'`
+
+* **`S3_FEEDBACK_LOGS_FOLDER`**, **`S3_ACCESS_LOGS_FOLDER`**, **`S3_USAGE_LOGS_FOLDER`**
+ * **Description:** S3 paths where feedback, access, and usage logs will be stored if `RUN_AWS_FUNCTIONS` is enabled.
+ * **Default Values:** Dynamically generated based on date and hostname, e.g., `'feedback/YYYYMMDD/hostname/'`.
+
+* **`LOG_FILE_NAME`**, **`USAGE_LOG_FILE_NAME`**, **`FEEDBACK_LOG_FILE_NAME`**
+ * **Description:** Specifies the name for log files. `USAGE_LOG_FILE_NAME` and `FEEDBACK_LOG_FILE_NAME` default to the value of `LOG_FILE_NAME`.
+ * **Default Value:** `'log.csv'`
+
+* **`DISPLAY_FILE_NAMES_IN_LOGS`**
+ * **Description:** If set to `'True'`, file names will be included in log entries.
+ * **Default Value:** `'False'`
+
+* **`CSV_ACCESS_LOG_HEADERS`**, **`CSV_FEEDBACK_LOG_HEADERS`**, **`CSV_USAGE_LOG_HEADERS`**
+ * **Description:** Defines custom headers for the respective CSV logs as a string representation of a list. If blank, component labels are used.
+ * **Default Value:** Varies; see script for `CSV_USAGE_LOG_HEADERS` default.
+
+* **`SAVE_LOGS_TO_DYNAMODB`**
+ * **Description:** Enables or disables saving logs to AWS DynamoDB. Set to `'True'` to enable.
+ * **Default Value:** `'False'`
+
+* **`ACCESS_LOG_DYNAMODB_TABLE_NAME`**, **`FEEDBACK_LOG_DYNAMODB_TABLE_NAME`**, **`USAGE_LOG_DYNAMODB_TABLE_NAME`**
+ * **Description:** Names of the DynamoDB tables for storing access, feedback, and usage logs.
+ * **Default Values:** `'redaction_access_log'`, `'redaction_feedback'`, `'redaction_usage'`
+
+* **`DYNAMODB_ACCESS_LOG_HEADERS`**, **`DYNAMODB_FEEDBACK_LOG_HEADERS`**, **`DYNAMODB_USAGE_LOG_HEADERS`**
+ * **Description:** Specifies the headers (attributes) for the respective DynamoDB log tables.
+ * **Default Value:** `''`
+
+* **`LOGGING`**
+ * **Description:** Enables or disables general console logging. Set to `'True'` to enable.
+ * **Default Value:** `'False'`
+
+## Gradio & General App Options
+
+Configurations for the Gradio UI, server behavior, and application limits.
+
+* **`FAVICON_PATH`**
+ * **Description:** Path to the favicon icon file for the web interface.
+ * **Default Value:** `"favicon.png"`
+
+* **`RUN_FASTAPI`**
+ * **Description:** If set to `"True"`, the application will be served via FastAPI, allowing for API endpoint integration.
+ * **Default Value:** `"False"`
+
+* **`RUN_MCP_SERVER`**
+ * **Description:** If set to `"True"`, the application will run as an MCP (Model Context Protocol) server.
+ * **Default Value:** `"False"`
+
+* **`GRADIO_SERVER_NAME`**
+ * **Description:** The IP address the Gradio server will bind to. Use `"0.0.0.0"` to allow external access.
+ * **Default Value:** `"127.0.0.1"`
+
+* **`GRADIO_SERVER_PORT`**
+ * **Description:** The network port on which the Gradio server will listen.
+ * **Default Value:** `7860`
+
+* **`ALLOWED_ORIGINS`**
+ * **Description:** A comma-separated list of allowed origins for Cross-Origin Resource Sharing (CORS).
+ * **Default Value:** `''`
+
+* **`ALLOWED_HOSTS`**
+ * **Description:** A comma-separated list of allowed hostnames.
+ * **Default Value:** `''`
+
+* **`ROOT_PATH`**
+ * **Description:** The root path for the application, useful if running behind a reverse proxy (e.g., `/app`).
+ * **Default Value:** `''`
+
+* **`FASTAPI_ROOT_PATH`**
+ * **Description:** The root path for the FastAPI application, used when `RUN_FASTAPI` is true.
+ * **Default Value:** `"/"`
+
+* **`MAX_QUEUE_SIZE`**
+ * **Description:** The maximum number of requests that can be queued in the Gradio interface.
+ * **Default Value:** `20`
+
+* **`MAX_FILE_SIZE`**
+ * **Description:** Maximum file size allowed for uploads (e.g., "250mb", "1gb").
+ * **Default Value:** `'250mb'`
+
+* **`DEFAULT_CONCURRENCY_LIMIT`**
+ * **Description:** The default concurrency limit for Gradio event handlers, controlling how many requests can be processed simultaneously.
+ * **Default Value:** `3`
+
+* **`MAX_SIMULTANEOUS_FILES`**
+ * **Description:** The maximum number of files that can be processed at once.
+ * **Default Value:** `10`
+
+* **`MAX_DOC_PAGES`**
+ * **Description:** The maximum number of pages a document can have.
+ * **Default Value:** `3000`
+
+* **`MAX_TABLE_ROWS`** / **`MAX_TABLE_COLUMNS`**
+ * **Description:** Maximum number of rows and columns for tabular data processing.
+ * **Default Values:** `250000` / `100`
+
+* **`MAX_OPEN_TEXT_CHARACTERS`**
+ * **Description:** Maximum number of characters for open text input.
+ * **Default Value:** `50000`
+
+* **`PAGE_BREAK_VALUE`**
+ * **Description:** Number of pages to process before breaking and restarting from the last finished page.
+ * **Default Value:** `99999`
+ * **Note:** This feature is not currently activated in the application.
+
+* **`MAX_TIME_VALUE`**
+ * **Description:** Maximum time value for processing operations.
+ * **Default Value:** `999999`
+
+* **`TLDEXTRACT_CACHE`**
+ * **Description:** Path to the cache directory used by the `tldextract` library.
+ * **Default Value:** `'tmp/tld/'`
+
+* **`COGNITO_AUTH`**
+ * **Description:** Enables or disables AWS Cognito authentication. Set to `'True'` to enable.
+ * **Default Value:** `'False'`
+
+* **`SHOW_FEEDBACK_BUTTONS`**
+ * **Description:** If set to `"True"`, displays feedback buttons in the Gradio interface.
+ * **Default Value:** `"False"`
+
+* **`USER_GUIDE_URL`**
+ * **Description:** A safe URL pointing to the user guide. The URL is validated against a list of allowed domains.
+ * **Default Value:** `"https://seanpedrick-case.github.io/doc_redaction"`
+
+* **`INTRO_TEXT`**
+    * **Description:** Custom introduction text for the app. Should be in Markdown format; HTML is stripped out. Can also be set to a path to a `.txt` file (e.g., `"intro.txt"`), which will be read and used as the intro text. The text is automatically sanitized to remove dangerous HTML/scripts while preserving safe Markdown syntax.
+ * **Default Value:** `"Too long to display here, see tools/config.py"`
+
+* **`SHOW_EXAMPLES`**
+ * **Description:** If set to `"True"`, displays example files in the Gradio interface.
+ * **Default Value:** `"True"`
+
+* **`SHOW_AWS_EXAMPLES`**
+ * **Description:** If set to `"True"`, includes AWS-specific examples.
+ * **Default Value:** `"False"`
+
+* **`SHOW_DIFFICULT_OCR_EXAMPLES`**
+ * **Description:** If set to `"True"`, includes examples that demonstrate difficult OCR scenarios.
+ * **Default Value:** `"False"`
+
+* **`SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER`**
+ * **Description:** If set to `"True"`, displays all output files in the output folder interface.
+ * **Default Value:** `"False"`
+
+* **`FILE_INPUT_HEIGHT`**
+ * **Description:** Sets the height (in pixels) of the file input component in the Gradio UI.
+ * **Default Value:** `200`
+
+* **`SHOW_QUICKSTART`**
+ * **Description:** If set to `"True"`, displays a quickstart/walkthrough component in the Gradio interface to guide new users.
+ * **Default Value:** `"False"`
+
+* **`SHOW_SUMMARISATION`**
+ * **Description:** If set to `"True"`, shows the summarisation tab in the Gradio interface.
+ * **Default Value:** `"False"`
+
+* **`APPLY_DUPLICATES_TO_FILE_AUTOMATICALLY`**
+ * **Description:** If set to `"True"`, duplicate detection results can be applied to the file automatically in the UI workflow.
+ * **Default Value:** `"False"`
+
+* **`RUN_ALL_EXAMPLES_THROUGH_AWS`**
+ * **Description:** If set to `"True"`, example documents will be run through AWS (e.g. Textract) when examples are used.
+ * **Default Value:** `"False"`
+
+## Redaction & PII Options
+
+Configurations related to text extraction, PII detection, and the redaction process.
+
+### UI and Model Selection
+
+* **`EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT`**
+ * **Description:** If set to `"True"`, the "Extraction and PII Options" accordion in the UI will be open by default.
+ * **Default Value:** `"True"`
+
+* **`SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS`** / **`SHOW_AWS_TEXT_EXTRACTION_OPTIONS`**
+ * **Description:** Controls whether local (Tesseract) or AWS (Textract) text extraction options are shown in the UI.
+ * **Default Value:** `"True"` for both.
+ * **Note:** If both are set to `"False"`, the application will automatically enable `SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS` to ensure at least one option is available.
+
+* **`SELECTABLE_TEXT_EXTRACT_OPTION`**, **`LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION`**, **`TEXTRACT_TEXT_EXTRACT_OPTION`**
+ * **Description:** Labels for text extraction model options displayed in the UI. Customize the display names for "Local model - selectable text", "Local OCR model - PDFs without selectable text", and "AWS Textract service - all PDF types" respectively.
+ * **Default Values:** `"Local model - selectable text"`, `"Local OCR model - PDFs without selectable text"`, `"AWS Textract service - all PDF types"`
+
+* **`BEDROCK_VLM_TEXT_EXTRACT_OPTION`**, **`GEMINI_VLM_TEXT_EXTRACT_OPTION`**, **`AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION`**
+ * **Description:** Labels for cloud VLM text extraction options (AWS Bedrock, Google Gemini, Azure/OpenAI) displayed in the UI when the corresponding show options are enabled.
+ * **Default Values:** `"AWS Bedrock VLM OCR - all PDF types"`, `"Google Gemini VLM OCR - all PDF types"`, `"Azure/OpenAI VLM OCR - all PDF types"`
+
+* **`EFFICIENT_OCR`**
+    * **Description:** If set to `"True"`, uses a two-step OCR process for PDFs: selectable text extraction is tried on each page first, and OCR (Tesseract/Textract/VLM) is only run on pages where no text could be extracted. This saves cost and time, and affects both the Gradio app and the CLI. A rough sketch of the routing rule is shown after the two threshold options below.
+ * **Default Value:** `"False"`
+
+* **`EFFICIENT_OCR_MIN_WORDS`**
+ * **Description:** Minimum number of extractable words on a page to use the text-only route when `EFFICIENT_OCR` is enabled; below this threshold the page uses OCR.
+ * **Default Value:** `20`
+
+* **`EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION`**
+ * **Description:** When `EFFICIENT_OCR` is enabled, minimum fraction of the page area (MediaBox) that a single embedded image must cover to route that page through OCR (`redact_image_pdf`) in addition to the word-count rule. This catches pages with plenty of selectable text but meaningful raster content (e.g. photos) where text may appear only inside the image. Set to `0` to disable image-based routing and rely on word count only.
+ * **Default Value:** `0.01` (1% of page area)
+
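+A rough sketch of this routing rule, assuming word counts and image coverage have already been measured per page (function and variable names here are hypothetical, not the app's internals):
+
+```python
+# Illustrative sketch of the EFFICIENT_OCR routing rule; names are hypothetical.
+def page_needs_ocr(extracted_word_count, largest_image_fraction,
+                   min_words=20, min_image_coverage=0.01):
+    """Return True if a page should be routed through OCR rather than
+    relying on its selectable text."""
+    if extracted_word_count < min_words:
+        return True  # too little selectable text on the page
+    if min_image_coverage and largest_image_fraction >= min_image_coverage:
+        return True  # a large embedded image may contain text of its own
+    return False
+```
+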
+* **`MAX_WORKERS`**
+ * **Description:** Default maximum number of worker threads for parallel processing across the app. Used as the default or cap for thread pools in review (saving redacted pages, loading page images), file conversion (PDF-to-images, input file listing, annotation processing), file redaction (text extraction, merge/summary, pikepdf annotations), custom image analyser (mapping analyzer results to bounding boxes), and data anonymisation (initial column cleaning). When not set, `OCR_FIRST_PASS_MAX_WORKERS` and `SUMMARY_PAGE_GROUP_MAX_WORKERS` default to this value; set those explicitly to override for OCR or summarisation only.
+ * **Default Value:** `8`
+
+* **`OCR_FIRST_PASS_MAX_WORKERS`**
+ * **Description:** Maximum number of threads for the OCR first pass in image-based PDF redaction (e.g. when using AWS Textract). Set to `1` for sequential processing; use a value greater than `1` to run OCR on multiple pages in parallel. Only applies when the OCR method supports parallel calls (e.g. Textract). Defaults to `MAX_WORKERS` if not set. Can be overridden via CLI `--ocr_first_pass_max_workers` or direct-mode/Lambda env.
+ * **Default Value:** Same as `MAX_WORKERS` (e.g. `8`)
+
+* **`TESSERACT_MAX_WORKERS`**
+ * **Description:** Maximum number of worker threads used to run local Tesseract OCR on multiple pages in parallel (page-level parallelism). Keep this lower than `MAX_WORKERS` to avoid saturating CPU/RAM; increase gradually if you have spare CPU and want faster OCR.
+ * **Default Value:** `4`
+
+* **`PADDLE_MAX_WORKERS`**
+ * **Description:** Maximum number of worker threads used to run local PaddleOCR on multiple pages in parallel when **Paddle** is the chosen local OCR model. Paddle is often GPU-bound and shares one model instance; keep this low (e.g. `1`–`2`) to avoid VRAM contention or errors. Set to `1` for effectively sequential Paddle OCR per document.
+ * **Default Value:** `2`
+
+* **`LINE_TO_WORD_SEGMENT_MAX_WORKERS`**
+    * **Description:** Maximum number of worker threads used to run the line-to-word segmentation process. This is a CPU/memory-intensive process and can slow down if set as high as `MAX_WORKERS` on busy documents.
+ * **Default Value:** `4`
+
+* **`SUMMARY_PAGE_GROUP_MAX_WORKERS`**
+ * **Description:** Maximum number of threads for page-group summarisation in document summarisation. Set to `1` for sequential processing (recommended for local LLM models); use a value greater than `1` to summarise multiple page groups in parallel (typically for API-based models). Defaults to `MAX_WORKERS` if not set. Can be overridden via CLI `--summary_page_group_max_workers` or direct-mode/Lambda env.
+ * **Default Value:** Same as `MAX_WORKERS` (e.g. `8`)
+
+### Hybrid Textract + Bedrock VLM
+
+When AWS Textract is selected for text extraction, these options allow lines with low confidence to be re-extracted using Bedrock VLM, with the result replacing the Textract output for those lines. A rough sketch of the selection rule is shown after the options below.
+
+* **`SHOW_HYBRID_TEXTRACT_BEDROCK_CHECKBOX`**
+ * **Description:** When set to `'True'`, the "High-quality Textract OCR (re-run low-confidence lines with Bedrock VLM)" checkbox is shown in the GUI when AWS Textract options are visible. Set to `'False'` to hide this option from the UI.
+ * **Default Value:** `'False'`
+
+* **`HYBRID_TEXTRACT_BEDROCK_VLM`**
+ * **Description:** When set to `'True'` and AWS Textract is selected, lines whose average confidence is below `HYBRID_TEXTRACT_BEDROCK_VLM_CONFIDENCE_THRESHOLD` are re-analyzed with Bedrock VLM and the extracted text replaces the Textract result for those lines.
+ * **Default Value:** `'False'`
+
+* **`HYBRID_TEXTRACT_BEDROCK_VLM_CONFIDENCE_THRESHOLD`**
+ * **Description:** Line average confidence (0–100) below this value triggers Bedrock VLM re-extraction when `HYBRID_TEXTRACT_BEDROCK_VLM` is enabled.
+ * **Default Value:** `97`
+
+* **`HYBRID_TEXTRACT_BEDROCK_VLM_PADDING`**
+ * **Description:** Padding in pixels around the line crop when calling Bedrock VLM for re-extraction.
+ * **Default Value:** `5`
+
+* **`SAVE_TEXTRACT_BEDROCK_HYBRID_EXAMPLES`**
+ * **Description:** If set to `"True"`, saves example images or artefacts from the Textract + Bedrock VLM hybrid re-extraction path for debugging (see `tools/config.py` / hybrid Textract–Bedrock implementation).
+ * **Default Value:** `"False"`
+
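+A rough sketch of the selection rule described above (hypothetical helper names, not the app's internals):
+
+```python
+# Illustrative only; hypothetical names, not the app's internals.
+def select_lines_for_bedrock_rerun(lines, confidence_threshold=97):
+    """Pick Textract lines whose average confidence (0-100) falls below the
+    configured threshold, so their crops can be re-extracted with Bedrock VLM."""
+    return [line for line in lines if line["avg_confidence"] < confidence_threshold]
+
+
+def pad_crop_box(box, padding=5):
+    """Expand a line's pixel bounding box (left, top, right, bottom) by the
+    configured padding before sending the crop to the VLM."""
+    left, top, right, bottom = box
+    return (left - padding, top - padding, right + padding, bottom + padding)
+```
+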
+### Bedrock VLM and LLM cost estimation
+
+These variables are used by the app to estimate AWS cost when "AWS Bedrock VLM OCR - all PDF types" or "LLM (AWS Bedrock)" PII detection is selected. Estimates are shown in the UI (e.g. estimated cost and time). A rough sketch of how these values combine per page is shown after the token and cost settings below.
+
+* **`BEDROCK_VLM_INPUT_COST`**
+ * **Description:** USD per million input tokens for Bedrock VLM OCR cost estimate.
+ * **Default Value:** `3.0`
+
+* **`BEDROCK_VLM_OUTPUT_COST`**
+ * **Description:** USD per million output tokens for Bedrock VLM OCR cost estimate.
+ * **Default Value:** `15.0`
+
+* **`BEDROCK_VLM_PIXELS_PER_INPUT_TOKEN`**
+ * **Description:** Pixels (width × height) per input token; used with `VLM_MAX_IMAGE_SIZE` to estimate input tokens per page for VLM cost calculation.
+ * **Default Value:** `2500`
+
+* **`BEDROCK_LLM_INPUT_COST`**
+ * **Description:** USD per million input tokens for Bedrock LLM (e.g. PII detection) cost estimate.
+ * **Default Value:** `3.0`
+
+* **`BEDROCK_LLM_OUTPUT_COST`**
+ * **Description:** USD per million output tokens for Bedrock LLM cost estimate.
+ * **Default Value:** `15.0`
+
+* **`BEDROCK_LLM_INPUT_TOKENS_PER_PAGE`**
+ * **Description:** Estimated input tokens per page used for Bedrock LLM cost calculation.
+ * **Default Value:** `2000`
+
+* **`BEDROCK_LLM_OUTPUT_TOKENS_PER_PAGE`**
+ * **Description:** Estimated output tokens per page used for Bedrock LLM cost calculation.
+ * **Default Value:** `250`
+
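+As a rough illustration only (not the app's exact formula), the per-page estimates might combine these values as follows; the page image size and the VLM output-token count are assumptions:
+
+```python
+# Illustrative cost arithmetic only; the app's own calculation may differ.
+def bedrock_llm_cost_per_page(
+    input_tokens_per_page=2000,    # BEDROCK_LLM_INPUT_TOKENS_PER_PAGE
+    output_tokens_per_page=250,    # BEDROCK_LLM_OUTPUT_TOKENS_PER_PAGE
+    input_cost_per_million=3.0,    # BEDROCK_LLM_INPUT_COST (USD)
+    output_cost_per_million=15.0,  # BEDROCK_LLM_OUTPUT_COST (USD)
+):
+    return (
+        input_tokens_per_page * input_cost_per_million
+        + output_tokens_per_page * output_cost_per_million
+    ) / 1_000_000
+
+
+def bedrock_vlm_cost_per_page(
+    page_width_px=2048,            # assumed page image size after resizing
+    page_height_px=2048,
+    pixels_per_input_token=2500,   # BEDROCK_VLM_PIXELS_PER_INPUT_TOKEN
+    output_tokens_per_page=1000,   # assumed; depends on how much text the page holds
+    input_cost_per_million=3.0,    # BEDROCK_VLM_INPUT_COST (USD)
+    output_cost_per_million=15.0,  # BEDROCK_VLM_OUTPUT_COST (USD)
+):
+    input_tokens = (page_width_px * page_height_px) / pixels_per_input_token
+    return (
+        input_tokens * input_cost_per_million
+        + output_tokens_per_page * output_cost_per_million
+    ) / 1_000_000
+
+
+print(f"{bedrock_llm_cost_per_page():.4f}")  # ~0.0098 USD per page with the defaults above
+print(f"{bedrock_vlm_cost_per_page():.4f}")  # ~0.0200 USD per page with the assumptions above
+```
+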
+* **`SHOW_BEDROCK_VLM_MODELS`**, **`SHOW_GEMINI_VLM_MODELS`**, **`SHOW_AZURE_OPENAI_VLM_MODELS`**
+ * **Description:** Controls whether AWS Bedrock, Google Gemini, or Azure/OpenAI VLM text extraction options are shown in the UI.
+ * **Default Value:** `"False"` for all.
+
+* **`NO_REDACTION_PII_OPTION`**, **`LOCAL_PII_OPTION`**, **`AWS_PII_OPTION`**
+ * **Description:** Labels for PII detection model options displayed in the UI. Customize the display names for "Only extract text (no redaction)", "Local", and "AWS Comprehend" respectively.
+ * **Default Values:** `"Only extract text (no redaction)"`, `"Local"`, `"AWS Comprehend"`
+
+* **`AWS_LLM_PII_OPTION`**, **`INFERENCE_SERVER_PII_OPTION`**, **`LOCAL_TRANSFORMERS_LLM_PII_OPTION`**
+ * **Description:** Labels for LLM-based PII detection options: AWS Bedrock, local inference server, and local transformers LLM.
+ * **Default Values:** `"LLM (AWS Bedrock)"`, `"Local inference server"`, `"Local transformers LLM"`
+
+* **`SHOW_INFERENCE_SERVER_PII_OPTIONS`**, **`SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS`**, **`SHOW_AWS_BEDROCK_LLM_MODELS`**
+ * **Description:** Controls whether inference-server, local transformers LLM, or AWS Bedrock LLM PII detection options are shown in the UI.
+    * **Default Value:** `"False"` for all three (inference-server, transformers, and AWS Bedrock LLM).
+
+* **`CHOSEN_LLM_PII_INFERENCE_METHOD`**
+ * **Description:** Default LLM inference method for PII detection. Options include `"aws-bedrock"`, `"local"`, `"inference-server"`, `"azure-openai"`, `"gemini"`. Only methods enabled via the corresponding `SHOW_*_LLM_PII_OPTIONS` appear in the UI.
+ * **Default Value:** `"aws-bedrock"`
+
+* **`SHOW_LOCAL_LLM_PII_OPTIONS`**, **`SHOW_INFERENCE_SERVER_LLM_PII_OPTIONS`**, **`SHOW_AZURE_LLM_PII_OPTIONS`**, **`SHOW_GEMINI_LLM_PII_OPTIONS`**
+ * **Description:** Controls whether local, inference-server, Azure/OpenAI, or Gemini LLM PII options are shown in the UI.
+ * **Default Value:** `"False"` for all.
+
+* **`SHOW_PII_IDENTIFICATION_OPTIONS`**
+ * **Description:** If set to `"True"`, shows the PII entity identification/selection options in the UI.
+ * **Default Value:** `"True"`
+
+* **`SHOW_GEMINI_LLM_MODELS`**, **`SHOW_AZURE_LLM_MODELS`**, **`SHOW_INFERENCE_SERVER_LLM_MODELS`**
+ * **Description:** Controls whether Google Gemini, Azure/OpenAI, or inference-server LLM options appear in the PII detection model list. When enabled, the corresponding models (e.g. gemini-2.5-flash, gpt-4o-mini) can be selected for PII detection.
+ * **Default Value:** `"False"` for all.
+
+* **`SHOW_LOCAL_PII_DETECTION_OPTIONS`** / **`SHOW_AWS_PII_DETECTION_OPTIONS`**
+ * **Description:** Controls whether local or AWS (Comprehend) PII detection options are shown in the UI.
+ * **Default Value:** `"True"` for both.
+ * **Note:** If both are set to `"False"`, the application will automatically enable `SHOW_LOCAL_PII_DETECTION_OPTIONS` to ensure at least one option is available.
+
+* **`DEFAULT_TEXT_EXTRACTION_MODEL`**
+ * **Description:** Sets the default text extraction model selected in the UI.
+ * **Default Value:** Automatically defaults to AWS Textract if `SHOW_AWS_TEXT_EXTRACTION_OPTIONS` is enabled, otherwise defaults to the local selectable text option.
+
+* **`DEFAULT_PII_DETECTION_MODEL`**
+ * **Description:** Sets the default PII detection model selected in the UI.
+ * **Default Value:** Automatically defaults to AWS Comprehend if `SHOW_AWS_PII_DETECTION_OPTIONS` is enabled, otherwise defaults to the local model.
+
+* **`LOAD_REDACTION_ANNOTATIONS_FROM_PDF`**
+ * **Description:** If set to `"True"`, the application will load existing redaction annotations from PDFs during the review step.
+ * **Default Value:** `"True"`
+
+### External Tool Paths
+
+* **`TESSERACT_FOLDER`**
+ * **Description:** Path to the local Tesseract OCR installation folder.
+ * **Default Value:** `''`
+ * **Installation Note:** For Windows, install Tesseract 5.5.0 from [UB-Mannheim/tesseract](https://github.com/UB-Mannheim/tesseract/wiki). This environment variable should point to the Tesseract folder (e.g., `tesseract/`).
+
+* **`TESSERACT_DATA_FOLDER`**
+ * **Description:** Path to the Tesseract trained data files (`tessdata`).
+ * **Default Value:** `"/usr/share/tessdata"`
+
+* **`POPPLER_FOLDER`**
+ * **Description:** Path to the local Poppler installation's `bin` folder.
+ * **Default Value:** `''`
+ * **Installation Note:** For Windows, install Poppler from [oschwartz10612/poppler-windows](https://github.com/oschwartz10612/poppler-windows). This variable needs to point to the Poppler bin folder (e.g., `poppler/poppler-24.02.0/Library/bin/`).
+
+* **`PADDLE_MODEL_PATH`** / **`SPACY_MODEL_PATH`**
+ * **Description:** Custom directory for PaddleOCR and spaCy model storage, useful for environments like AWS Lambda.
+ * **Default Value:** `''` (uses default location).
+
+* **`PADDLE_FONT_PATH`**
+ * **Description:** Custom font path for PaddleOCR. If empty, the application will attempt to use system fonts to avoid downloading `simfang.ttf` or `PingFang-SC-Regular.ttf`. Set this if you want to use a specific font file for PaddleOCR text rendering.
+ * **Default Value:** `''` (uses system fonts).
+
+* **`GEMINI_API_KEY`**
+ * **Description:** API key for Google Gemini. Required when using Gemini for PII detection or VLM OCR (`SHOW_GEMINI_LLM_MODELS` or `SHOW_GEMINI_VLM_MODELS`).
+ * **Default Value:** `''`
+
+* **`AZURE_OPENAI_API_KEY`**
+ * **Description:** API key for Azure or OpenAI. Required when using Azure/OpenAI for PII detection or VLM OCR (`SHOW_AZURE_LLM_MODELS` or `SHOW_AZURE_OPENAI_VLM_MODELS`).
+ * **Default Value:** `''`
+
+* **`AZURE_OPENAI_INFERENCE_ENDPOINT`**
+ * **Description:** Inference endpoint URL for Azure/OpenAI. Used with `AZURE_OPENAI_API_KEY` when Azure/OpenAI LLM or VLM options are enabled.
+ * **Default Value:** `''`
+
+* **`HF_TOKEN`**
+ * **Description:** Hugging Face token. May be required for downloading gated or private models (e.g. some local transformers LLM or VLM models).
+ * **Default Value:** `''`
+
+### Local OCR (Tesseract & PaddleOCR)
+
+* **`SHOW_OCR_GUI_OPTIONS`**
+ * **Description:** If set to `"True"`, OCR-related options (e.g. local OCR model, Paddle options) are shown in the UI.
+ * **Default Value:** `"True"`
+
+* **`DEFAULT_LOCAL_OCR_MODEL`**
+ * **Description:** Choose the engine for local OCR: `"tesseract"`, `"paddle"`, `"hybrid-paddle"`, `"hybrid-vlm"`, `"hybrid-paddle-vlm"`, `"hybrid-paddle-inference-server"`, `"vlm"`, or `"inference-server"`.
+ * **Default Value:** `"tesseract"`
+
+* **`SHOW_LOCAL_OCR_MODEL_OPTIONS`**
+ * **Description:** If set to `"True"`, allows the user to select the local OCR model from the UI.
+ * **Default Value:** `"False"`
+
+* **`HYBRID_OCR_CONFIDENCE_THRESHOLD`**
+ * **Description:** In "hybrid-paddle" mode, this is the Tesseract confidence score below which PaddleOCR will be used for re-extraction.
+ * **Default Value:** `95`
+
+* **`HYBRID_OCR_PADDING`**
+ * **Description:** In "hybrid-paddle" mode, padding added to the word's bounding box before re-extraction.
+ * **Default Value:** `1`
+
+* **`PADDLE_USE_TEXTLINE_ORIENTATION`**
+ * **Description:** Toggles textline orientation detection for PaddleOCR.
+ * **Default Value:** `"False"`
+
+* **`PADDLE_DET_DB_UNCLIP_RATIO`**
+ * **Description:** Controls the expansion ratio of the detected text region in PaddleOCR.
+ * **Default Value:** `1.2`
+
+* **`SAVE_EXAMPLE_HYBRID_IMAGES`**
+ * **Description:** Saves comparison images when using "hybrid-paddle" OCR mode.
+ * **Default Value:** `"False"`
+
+* **`SAVE_PAGE_OCR_VISUALISATIONS`**
+ * **Description:** If set to `"True"`, saves visualisations of Tesseract, PaddleOCR, and Textract bounding boxes overlaid on the page images.
+ * **Default Value:** `"False"`
+
+* **`INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES`**
+ * **Description:** If set to `"True"`, OCR visualisation output files (created when `SAVE_PAGE_OCR_VISUALISATIONS` is enabled) will be included in the final output file list returned by `choose_and_run_redactor`. This makes the visualisation files available in the Gradio output interface.
+ * **Default Value:** `"False"`
+
+* **`SAVE_WORD_SEGMENTER_OUTPUT_IMAGES`**
+ * **Description:** If set to `"True"`, saves output images from the word segmenter for debugging purposes.
+ * **Default Value:** `"False"`
+
+* **`PREPROCESS_LOCAL_OCR_IMAGES`**
+ * **Description:** If set to `"True"`, images will be preprocessed before local OCR. Can slow down processing.
+ * **Default Value:** `"True"`
+ * **Note:** Testing has shown that preprocessing doesn't necessarily improve OCR results and can significantly slow down extraction. Consider setting this to `"False"` if processing speed is a priority.
+
+* **`SAVE_PREPROCESS_IMAGES`**
+ * **Description:** If set to `"True"`, saves the preprocessed images for debugging purposes.
+ * **Default Value:** `"False"`
+
+* **`SHOW_PADDLE_MODEL_OPTIONS`**
+ * **Description:** If set to `"True"`, allows the user to select PaddleOCR-related options (paddle, hybrid-paddle) from the UI.
+ * **Default Value:** `"False"`
+
+* **`MODEL_CACHE_PATH`**
+ * **Description:** Path to the directory where models are cached.
+ * **Default Value:** `"./model_cache"`
+
+* **`TESSERACT_SEGMENTATION_LEVEL`**
+ * **Description:** Tesseract PSM (Page Segmentation Mode) level to use for OCR. Valid values are 0-13.
+ * **Default Value:** `11`
+
+* **`TESSERACT_WORD_LEVEL_OCR`**
+ * **Description:** If set to `"True"`, uses Tesseract word-level OCR instead of line-level.
+ * **Default Value:** `"True"`
+
+* **`CONVERT_LINE_TO_WORD_LEVEL`**
+ * **Description:** If set to `"True"`, converts PaddleOCR line-level OCR results to word-level for better precision.
+ * **Default Value:** `"False"`
+
+* **`LOAD_PADDLE_AT_STARTUP`**
+ * **Description:** If set to `"True"`, loads the PaddleOCR model at application startup.
+ * **Default Value:** `"False"`
+
+* **`USE_TRANSFORMERS_VLM_MODEL_AS_LLM`**
+ * **Description:** When set to `"True"`, use the same local transformers VLM model (e.g. Qwen3-VL-4B-Instruct) for LLM tasks (e.g. PII entity detection in `tools/llm_entity_detection.py`) as for VLM/OCR. Overrides `LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE` when local transformers LLM is selected. The VLM weights must be loaded first (e.g. **`LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True`**, or by running a VLM OCR task first); otherwise the standard PII model is used.
+ * **Default Value:** `"False"`
+
+* **`LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE`**
+ * **Description:** Identifier for the local transformers LLM used for PII detection (e.g. `"gemma-3-4b"`, `"qwen-3-4b"`). Used for display and to select the correct repo/file/folder. Supported values include gemma-3-4b, gemma-3-12b, gemma-2, qwen-3-4b, GPT-OSS 20B, granite-4-tiny, granite-4-micro. Ignored for local LLM when `USE_TRANSFORMERS_VLM_MODEL_AS_LLM` is `"True"` (the VLM model is used instead).
+ * **Default Value:** `"gemma-3-4b"`
+
+* **`LOCAL_TRANSFORMERS_LLM_PII_REPO_ID`**, **`LOCAL_TRANSFORMERS_LLM_PII_MODEL_FILE`**, **`LOCAL_TRANSFORMERS_LLM_PII_MODEL_FOLDER`**
+    * **Description:** Hugging Face repo ID, model file name, and local model folder for the local transformers PII detection model. These can be overridden automatically based on `LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE`.
+ * **Default Values:** e.g. `"unsloth/gemma-3-4b-it-bnb-4bit"`.
+
+* **`OVERRIDE_LLM_TRANSFORMERS_REPO_ID`**
+ * **Description:** If non-empty, overrides **`LOCAL_TRANSFORMERS_LLM_PII_REPO_ID`** (the Hugging Face repo ID or local path used for the local transformers LLM PII model). Use to point at a different checkpoint or a local folder without changing `LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE`. Leave empty for no override.
+ * **Default Value:** `''` (empty)
+
+* **`LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START`**
+ * **Description:** If set to `"True"`, loads the local transformers LLM PII model at application startup (reduces latency on first use).
+ * **Default Value:** `"False"`
+
+* **`MULTIMODAL_PROMPT_FORMAT`**
+ * **Description:** If set to `"True"`, uses multimodal prompt format for the local LLM. Set automatically for some model choices (e.g. Gemma 3 4B/12B).
+ * **Default Value:** `"False"`
+
+* **`REASONING_SUFFIX`**
+ * **Description:** Suffix added to prompts for reasoning-capable models (e.g. `"Reasoning: low"` for GPT-OSS 20B, `"/nothink"` for Qwen 3 4B to disable chain-of-thought). Model-specific; see config for defaults.
+ * **Default Value:** Model-dependent (e.g. `"Reasoning: low"` or `"/nothink"` or `''`).
+
+* **`PRINT_TRANSFORMERS_USER_PROMPT`**
+ * **Description:** If set to `"True"`, prints the system and user prompts to the console when calling the local transformers LLM (`tools/llm_funcs.py`). Useful for debugging; avoid in production if logs are sensitive.
+ * **Default Value:** `"False"`
+
+* **LLM generation parameters:** **`LLM_TEMPERATURE`** (default `0.1`), **`LLM_TOP_K`** (`64`), **`LLM_MIN_P`** (`0`), **`LLM_TOP_P`** (`0.95`), **`LLM_REPETITION_PENALTY`** (`1.0`), **`LLM_MAX_NEW_TOKENS`** (`4096`), **`LLM_CONTEXT_LENGTH`** (`32768`), **`LLM_SEED`** (`42`), **`LLM_RESET`** (`False`), **`LLM_STREAM`** (`True`), **`LLM_STOP_STRINGS`**, **`LLM_THREADS`** (`-1`), **`SPECULATIVE_DECODING`** (`False`), **`ASSISTANT_MODEL`**, **`LLM_MODEL_DTYPE`** (`bfloat16`), and other **`LLM_*`** / related variables in `tools/config.py` control local/transformers LLM generation in `tools/llm_funcs.py`. **`LLM_TEMPERATURE`** and **`LLM_MAX_NEW_TOKENS`** also control PII detection LLM calls and retries.
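+
+  As an illustration only (the helper and dictionary below are hypothetical, not taken from `tools/config.py`), these variables could be collected into a generation-parameter dictionary like so, using the defaults listed above:
+
+```python
+# Illustrative sketch only: collect LLM_* environment variables (defaults as
+# documented above) into a generation-parameter dictionary. The helper name
+# and dictionary layout are hypothetical, not the app's actual code.
+import os
+
+def _env(name, default, cast):
+    value = os.environ.get(name, "")
+    return cast(value) if value != "" else default
+
+generation_config = {
+    "temperature": _env("LLM_TEMPERATURE", 0.1, float),
+    "top_k": _env("LLM_TOP_K", 64, int),
+    "top_p": _env("LLM_TOP_P", 0.95, float),
+    "min_p": _env("LLM_MIN_P", 0.0, float),
+    "repetition_penalty": _env("LLM_REPETITION_PENALTY", 1.0, float),
+    "max_new_tokens": _env("LLM_MAX_NEW_TOKENS", 4096, int),
+    "seed": _env("LLM_SEED", 42, int),
+}
+```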
+
+* **`SHOW_INFERENCE_SERVER_VLM_OPTIONS`**
+ * **Description:** If set to `"True"`, allows the user to select inference-server-related options from the UI.
+ * **Default Value:** `"False"`
+
+* **`SHOW_HYBRID_MODELS`**
+ * **Description:** If set to `"True"`, enables hybrid model options (e.g., hybrid-paddle-vlm, hybrid-paddle-inference-server) in the UI.
+ * **Default Value:** `"False"`
+
+* **`INFERENCE_SERVER_API_URL`**
+ * **Description:** Base URL of the inference-server API for remote OCR processing and PII detection.
+ * **Default Value:** `"http://localhost:8080"`
+
+* **`INFERENCE_SERVER_MODEL_NAME`**
+ * **Description:** Optional model name to use for inference-server API. If empty, uses the default model on the server.
+ * **Default Value:** `''`
+
+* **`INFERENCE_SERVER_TIMEOUT`**
+ * **Description:** Timeout in seconds for inference-server API requests.
+ * **Default Value:** `300`
+
+* **`DEFAULT_INFERENCE_SERVER_VLM_MODEL`**
+ * **Description:** Default model name for inference-server VLM API calls. If empty, uses `INFERENCE_SERVER_MODEL_NAME` or the server default.
+ * **Default Value:** `"qwen_3_vl_30b_a3b_it"`
+
+* **`DEFAULT_INFERENCE_SERVER_PII_MODEL`**
+ * **Description:** Default model name for inference-server PII detection API calls. If empty, uses `INFERENCE_SERVER_MODEL_NAME`, `CHOSEN_INFERENCE_SERVER_PII_MODEL`, or the server default.
+ * **Default Value:** `"gemma_3_12b"`
+
+* **`CHOSEN_INFERENCE_SERVER_PII_MODEL`**
+ * **Description:** Default inference-server PII model selected when `SHOW_INFERENCE_SERVER_LLM_MODELS` is enabled. Must be one of the registered inference-server models.
+ * **Default Value:** First model in the inference-server models list (e.g. `"unnamed-inference-server-model"`).
+
+* **`INFERENCE_SERVER_LLM_PII_MODEL_CHOICE`**
+ * **Description:** Primary config variable for the inference-server PII model used at runtime. If not set, it falls back to `DEFAULT_INFERENCE_SERVER_PII_MODEL`, then to `CHOSEN_INFERENCE_SERVER_PII_MODEL`.
+ * **Default Value:** Inherits from `DEFAULT_INFERENCE_SERVER_PII_MODEL` or `CHOSEN_INFERENCE_SERVER_PII_MODEL`.
+
+* **`SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS`**
+ * **Description:** If set to `"True"`, allows the user to select from inference-server VLM model options in the UI.
+ * **Default Value:** `"False"`
+
+### Vision Language Model (VLM) Options
+
+* **`SHOW_VLM_MODEL_OPTIONS`**
+ * **Description:** If set to `"True"`, VLM (Vision Language Model) options will be shown in the UI.
+ * **Default Value:** `"False"`
+
+* **`LOAD_TRANSFORMERS_VLM_MODEL_AT_START`**
+ * **Description:** When `SHOW_VLM_MODEL_OPTIONS` is `"True"`, if set to `"True"`, loads local transformers VLM weights at application startup (same as historical behavior). If `"False"`, weights load on the first VLM OCR call (`tools/run_vlm.py`), reducing startup memory at the cost of a slower first run. Compare with **`LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START`** for the PII LLM (which defaults to `"False"`).
+ * **Default Value:** `"True"`
+
+* **`SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL`**
+ * **Description:** Selected vision model for OCR. Choose from: `"Nanonets-OCR2-3B"`, `"Dots.OCR"`, `"Qwen3-VL-2B-Instruct"`, `"Qwen3-VL-4B-Instruct"`, `"Qwen3-VL-8B-Instruct"`, `"Qwen3-VL-30B-A3B-Instruct"`, `"Qwen3-VL-235B-A22B-Instruct"`, `"PaddleOCR-VL"`.
+ * **Default Value:** `"Qwen3-VL-8B-Instruct"`
+
+* **`QUANTISE_VLM_MODELS`**
+ * **Description:** If set to `"True"`, the VLM models will be quantized using 4-bit quantisation (bitsandbytes).
+ * **Default Value:** `"False"`
+
+* **`OVERRIDE_VLM_REPO_ID`**
+ * **Description:** If non-empty, overrides the Hugging Face repo ID or local path used for the selected local transformers VLM (`SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL`). Use to point at a different checkpoint or a local folder. Leave empty for no override.
+ * **Default Value:** `''` (empty)
+
+* **`USE_FLASH_ATTENTION`**
+ * **Description:** If set to `"True"`, uses flash attention for the VLM, which can improve performance (not possible on Windows).
+ * **Default Value:** `"False"`
+
+* **`VLM_MAX_IMAGE_SIZE`**
+ * **Description:** Upper bound on total pixels (width × height) for images sent to the VLM after preparation (`_prepare_image_for_vlm`). Larger images are downscaled (aspect ratio preserved). For AWS Bedrock VLM paths, the effective cap is raised automatically (much larger budget) so this variable mainly limits local/inference-server/Gemini/Azure full-page and hybrid crops unless Bedrock is used. Multiples of 32×32 align with Qwen3-VL-style tiling.
+ * **Default Value:** `819200`
+
+* **`VLM_MIN_IMAGE_SIZE`**
+ * **Description:** Minimum total pixels for **full-page** VLM OCR (local transformers, inference-server, Gemini, Azure/OpenAI, Bedrock page). Smaller pages are upscaled (aspect preserved). Does **not** apply to hybrid line/crop VLM passes—those use `VLM_HYBRID_MIN_IMAGE_SIZE`. Preparation combines this with `VLM_MIN_DPI` / `VLM_MAX_DPI`; if constraints conflict, max pixel/DPI caps win (see console warning).
+ * **Default Value:** `614400`
+
+* **`VLM_HYBRID_MIN_IMAGE_SIZE`**
+ * **Description:** Minimum total pixels for **hybrid** VLM crops (second pass on line regions: local VLM, inference-server, Bedrock, Gemini, Azure in hybrid modes). Smaller crops are upscaled. Full-page VLM uses `VLM_MIN_IMAGE_SIZE` instead. Subject to the same DPI and max-pixel logic as other VLM images.
+ * **Default Value:** `153600`
+
+* **`VLM_MIN_DPI`**
+ * **Description:** Minimum effective DPI after preparation. Uses the image’s reported DPI from metadata (`image.info["dpi"]`), defaulting to **72** if absent. If reported DPI is below this value, the image is upscaled so that **reported_dpi × scale** falls within `[VLM_MIN_DPI, VLM_MAX_DPI]` (together with min/max pixel limits).
+ * **Default Value:** `200.0`
+
+* **`VLM_MAX_DPI`**
+ * **Description:** Maximum effective DPI after preparation. High-DPI scans are downscaled so effective DPI (see `VLM_MIN_DPI`) does not exceed this value, subject to `VLM_MAX_IMAGE_SIZE` (and hybrid/full-page min pixel settings).
+ * **Default Value:** `300.0`
+
+* **`VLM_MAX_ASPECT_RATIO`**
+ * **Description:** Upper bound on image aspect ratio **after** white-padding for every VLM path in `custom_image_analyser_engine`: local transformers (`extract_text_from_image_vlm`), inference-server (`_call_inference_server_vlm_api`), AWS Bedrock (`_call_bedrock_vlm_api`), Gemini, and Azure/OpenAI. The code pads very long/thin crops (typical hybrid line regions) so `max(width/height, height/width)` does not exceed this value. Lower values are stricter (more padding, smaller embedded text relative to canvas); higher values send nearer-native crops but may hit provider limits. Values below `1.0` are clamped to `1.0`. For Bedrock, if the image is still above the AWS Converse vision limit (20:1) after this step, a second stricter pad is applied automatically.
+ * **Default Value:** `10.0`
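+
+  The minimal Pillow sketch below (not the app's `_prepare_image_for_vlm` or `custom_image_analyser_engine` code) illustrates the padding idea: grow the short side with white until the aspect ratio falls within the limit. The real preparation stage separately enforces the pixel and DPI bounds described above.
+
+```python
+# Minimal Pillow sketch (not the app's implementation) of the aspect-ratio
+# padding described above: grow the short side with white so that
+# max(width/height, height/width) <= VLM_MAX_ASPECT_RATIO.
+from PIL import Image
+
+def pad_to_max_aspect_ratio(image: Image.Image, max_ratio: float = 10.0) -> Image.Image:
+    max_ratio = max(max_ratio, 1.0)  # values below 1.0 are clamped to 1.0
+    width, height = image.size
+    if max(width / height, height / width) <= max_ratio:
+        return image
+    if width >= height:
+        new_size = (width, int(round(width / max_ratio)))
+    else:
+        new_size = (int(round(height / max_ratio)), height)
+    canvas = Image.new("RGB", new_size, "white")
+    canvas.paste(image, ((new_size[0] - width) // 2, (new_size[1] - height) // 2))
+    return canvas
+```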
+
+* **`VLM_DISABLE_QWEN3_5_THINKING`**
+ * **Description:** If set to `"True"`, disables Qwen3.5 "thinking" mode for **local transformers** VLM calls. Implemented by appending `` to the generation prompt after `apply_chat_template`, so the model skips its reasoning chain and outputs the answer directly.
+ * **Default Value:** `"False"`
+
+* **`INFERENCE_SERVER_DISABLE_THINKING`**
+ * **Description:** If set to `"True"`, disables thinking for **inference-server** VLM calls (the server-side equivalent of `VLM_DISABLE_QWEN3_5_THINKING`). Adds `chat_template_kwargs: {"enable_thinking": false}` to every request payload; vLLM applies the Qwen3/Qwen3.5 chat template server-side and honours this flag to suppress `...` generation entirely. Useful when the inference server is running a Qwen3.5 reasoning model and thinking tokens are unnecessary (e.g. for simple line-crop OCR in `hybrid-paddle-inference-server` mode). Eliminates thinking overhead and removes the need to raise `HYBRID_OCR_MAX_NEW_TOKENS` to accommodate the reasoning budget.
+ * **Default Value:** `"False"`
+
+* **`MAX_NEW_TOKENS`**
+ * **Description:** Maximum number of tokens to generate for VLM responses.
+ * **Default Value:** `4096`
+
+* **`DEFAULT_MAX_NEW_TOKENS`**
+ * **Description:** Default maximum number of tokens to generate for VLM responses.
+ * **Default Value:** `4096`
+
+* **`MAX_INPUT_TOKEN_LENGTH`**
+ * **Description:** Maximum number of tokens that can be input to the VLM.
+ * **Default Value:** `8192`
+
+* **`OVERWRITE_EXISTING_OCR_RESULTS`**
+ * **Description:** If set to `"True"`, always creates new OCR results instead of loading from existing JSON files.
+ * **Default Value:** `"False"`
+
+* **`SAVE_VLM_INPUT_IMAGES`**
+ * **Description:** If set to `"True"`, saves input images sent to VLM OCR for debugging purposes.
+ * **Default Value:** `"False"`
+
+* **`ADD_VLM_BOUNDING_BOX_RULES`**
+ * **Description:** If set to `"True"`, adds bounding box rules to the VLM prompt (e.g. coordinate format or output constraints for OCR).
+ * **Default Value:** `"False"`
+
+* **`HYBRID_OCR_MAX_NEW_TOKENS`**
+ * **Description:** Maximum number of tokens the inference server (or local VLM) may generate per hybrid line-crop OCR call. This is the generation budget passed directly to the model via `max_tokens`. For reasoning models such as Qwen3.5, thinking tokens count against this budget before any answer is produced, so the value must be large enough to cover both the reasoning chain and the short JSON answer. Setting this too low (e.g. `50`) causes the model to exhaust the budget during thinking and return empty results for every line. Increase to `2048` or higher if you observe "Inference server returned no results" log lines when using `hybrid-paddle-inference-server` with a reasoning model.
+ * **Default Value:** `1024`
+
+* **`HYBRID_OCR_MAX_WORDS`**
+ * **Description:** Maximum number of words allowed in a hybrid OCR result for a single text line. Results whose word count exceeds this threshold are discarded as likely hallucinations or runaway model output, and the original OCR result (e.g. from Paddle) is kept instead. This is a separate sanity check from `HYBRID_OCR_MAX_NEW_TOKENS` — it operates on the parsed response text, not on the token budget. A single line of text rarely contains more than ~50 words, so the default is conservative.
+ * **Default Value:** `50`
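+
+  A hypothetical sketch (not the app's code) of how these two safeguards relate: the token budget is sent to the model as `max_tokens`, while an over-long parsed response is discarded in favour of the first-pass OCR text.
+
+```python
+# Hypothetical sketch of the two safeguards above (not the app's code).
+HYBRID_OCR_MAX_NEW_TOKENS = 1024  # generation budget passed as max_tokens
+HYBRID_OCR_MAX_WORDS = 50         # sanity check on the parsed response text
+
+def accept_hybrid_result(vlm_text: str, original_text: str) -> str:
+    # Over-long responses are treated as hallucinations or runaway output and
+    # the first-pass OCR text (e.g. from Paddle) is kept instead.
+    if not vlm_text or len(vlm_text.split()) > HYBRID_OCR_MAX_WORDS:
+        return original_text
+    return vlm_text
+```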
+
+* **`VLM_OCR_INTRO_TEXT`**
+ * **Description:** UI intro text when the local OCR option **vlm** (full-page vision-language OCR) is shown. Customise for your deployment.
+ * **Default Value:** `"vlm" will call the chosen vision model (VLM) to return a structured json output that is then parsed into word-level bounding boxes.`
+
+* **`VLM_OCR_HYBRID_INTRO_TEXT`**
+ * **Description:** UI intro text for **hybrid-vlm** (Tesseract first pass + VLM on selected lines/regions).
+ * **Default Value:** `"hybrid-vlm" is a combination of Tesseract for OCR, and a second pass with the chosen vision model (VLM).`
+
+* **`REPORT_VLM_OUTPUTS_TO_GUI`**
+ * **Description:** If set to `"True"`, reports VLM outputs to the GUI with info boxes as they are processed.
+ * **Default Value:** `"False"`
+
+* **`REPORT_LLM_OUTPUTS_TO_GUI`**
+ * **Description:** If set to `"True"`, reports streamed LLM outputs (local transformers or local inference-server PII detection) to the GUI with `gr.Info` as they are processed.
+ * **Default Value:** `"False"`
+
+* **`CLOUD_VLM_MODEL_CHOICE`**
+ * **Description:** Default model choice for cloud VLM OCR (AWS Bedrock, Gemini, or Azure/OpenAI). Used when cloud VLM options are enabled.
+ * **Default Value:** `"qwen.qwen3-vl-235b-a22b"` (or first available cloud model if empty)
+
+* **`SHOW_CUSTOM_VLM_ENTITIES`**
+ * **Description:** If set to `"True"`, appends **`CUSTOM_VLM_FACES`** and **`CUSTOM_VLM_SIGNATURE`** to the redaction entity lists when at least one of `SHOW_VLM_MODEL_OPTIONS`, `SHOW_INFERENCE_SERVER_VLM_OPTIONS`, or `SHOW_BEDROCK_VLM_MODELS` is enabled—so users can redact detected faces and signatures from VLM-capable pipelines.
+ * **Default Value:** `"False"`
+
+* **`CUSTOM_VLM_BACKEND`**
+ * **Description:** Backend used for CUSTOM_VLM_FACES and CUSTOM_VLM_SIGNATURE (face and signature detection) when those entities are selected for redaction. One of: `"transformers_vlm"` (local transformers VLM), `"inference_vlm"` (inference-server VLM), or `"bedrock_vlm"` (AWS Bedrock VLM). Applies regardless of the PII identification method (Local, AWS Comprehend, or LLM). Value is case-insensitive; invalid values fall back to `"bedrock_vlm"`.
+ * **Default Value:** `"bedrock_vlm"`
+
+* **`CUSTOM_VLM_MIN_CONFIDENCE`**
+ * **Description:** Minimum confidence required for VLM face/signature detections (**`CUSTOM_VLM_FACES`** / **`CUSTOM_VLM_SIGNATURE`**) to be kept for downstream outputs (annotations, decision tables, and PDF redaction). Values can be provided as `0..1` (e.g. `0.5`) or `0..100` (e.g. `50`).
+ * **Default Value:** `0.5`
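+
+  A hedged sketch of the scale handling described above (the helper name is hypothetical): values above 1 are treated as percentages.
+
+```python
+# Hedged sketch (hypothetical helper): normalise CUSTOM_VLM_MIN_CONFIDENCE so
+# that 0.5 and 50 both mean a 50% confidence threshold.
+def normalise_min_confidence(value: float) -> float:
+    return value / 100.0 if value > 1.0 else value
+```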
+
+* **`CLOUD_LLM_PII_MODEL_CHOICE`**
+ * **Description:** Default AWS Bedrock (or cloud) model for LLM-based PII detection when `SHOW_AWS_BEDROCK_LLM_MODELS` (or other cloud LLM options) is enabled. Used by `tools/llm_entity_detection.py`.
+ * **Default Value:** `"amazon.nova-pro-v1:0"`
+
+* **`CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE`**
+ * **Description:** Optional “upgraded” cloud model used for LLM-based PII detection when custom instructions are provided (e.g. allow-list or user-specific rules). If set and non-empty, it overrides `CLOUD_LLM_PII_MODEL_CHOICE` whenever `custom_instructions` are passed to the entity-detection LLM. Leave empty to always use `CLOUD_LLM_PII_MODEL_CHOICE`.
+ * **Default Value:** `''` (empty; no override)
+
+* **`CLOUD_SUMMARISATION_MODEL_CHOICE`**
+ * **Description:** Default AWS Bedrock (or cloud) model for LLM-based summarisation when using cloud inference (e.g. AWS Bedrock). Used by `tools/summaries.py`. Separate from `CLOUD_LLM_PII_MODEL_CHOICE` so you can assign a different model for summarisation than for entity detection.
+ * **Default Value:** `"amazon.nova-pro-v1:0"`
+
+* **`VLM_SEED`**
+ * **Description:** Random seed for VLM generation. If empty, no seed is set (non-deterministic). If set to an integer, generation will be deterministic.
+ * **Default Value:** `''` (empty; no seed set)
+
+* **`VLM_DEFAULT_TEMPERATURE`**
+ * **Description:** Default temperature for VLM generation. If empty, model-specific defaults are used.
+ * **Default Value:** `''`
+
+* **`VLM_DEFAULT_TOP_P`**
+ * **Description:** Default top_p (nucleus sampling) for VLM generation. If empty, model-specific defaults are used.
+ * **Default Value:** `''`
+
+* **`VLM_DEFAULT_MIN_P`**
+ * **Description:** Default min_p (minimum probability threshold) for VLM generation. If empty, model-specific defaults are used.
+ * **Default Value:** `''`
+
+* **`VLM_DEFAULT_TOP_K`**
+ * **Description:** Default top_k for VLM generation. If empty, model-specific defaults are used.
+ * **Default Value:** `''`
+
+* **`VLM_DEFAULT_REPETITION_PENALTY`**
+ * **Description:** Default repetition penalty for VLM generation. If empty, model-specific defaults are used.
+ * **Default Value:** `''`
+
+* **`VLM_DEFAULT_DO_SAMPLE`**
+ * **Description:** Default do_sample setting for VLM generation. `"True"` means use sampling (do_sample=True), `"False"` means use greedy decoding (do_sample=False). If empty, model-specific defaults are used.
+ * **Default Value:** `''`
+
+* **`VLM_DEFAULT_PRESENCE_PENALTY`**
+ * **Description:** Default presence penalty for VLM generation. If empty, model-specific defaults are used.
+ * **Default Value:** `''`
+
+### Entity and Search Options
+
+* **`CHOSEN_COMPREHEND_ENTITIES`** / **`FULL_COMPREHEND_ENTITY_LIST`**
+ * **Description:** The selected and available PII entity types for AWS Comprehend.
+ * **Default Value:** Predefined lists of entities (see script).
+
+* **`CHOSEN_LLM_ENTITIES`** / **`FULL_LLM_ENTITY_LIST`**
+ * **Description:** The selected and available PII entity types for LLM-based PII detection (e.g. AWS Bedrock, local transformers, inference server).
+ * **Default Value:** Predefined lists (e.g. `['EMAIL_ADDRESS','ADDRESS','NAME','PHONE_NUMBER', 'CUSTOM']` for chosen; see script for full list).
+
+* **`CHOSEN_REDACT_ENTITIES`** / **`FULL_ENTITY_LIST`**
+ * **Description:** The selected and available PII entity types for the local model.
+ * **Default Value:** Predefined lists of entities (see script).
+
+* **`CUSTOM_ENTITIES`**
+ * **Description:** A list of entities that are considered "custom" and may have special handling.
+ * **Default Value:** `['TITLES', 'UKPOSTCODE', 'STREETNAME', 'CUSTOM']`
+
+* **`DEFAULT_SEARCH_QUERY`**
+ * **Description:** The default text for the custom search/redact input box.
+ * **Default Value:** `''`
+
+* **`DEFAULT_FUZZY_SPELLING_MISTAKES_NUM`**
+ * **Description:** Default number of allowed spelling mistakes for fuzzy searches.
+ * **Default Value:** `0`
+
+* **`DEFAULT_PAGE_MIN`** / **`DEFAULT_PAGE_MAX`**
+ * **Description:** Default start and end pages for processing. `0` for max means process all pages.
+ * **Default Value:** `0` for both.
+
+### Textract Feature Selection
+
+* **`DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX`**
+ * **Description:** The default options selected for Textract's handwriting and signature detection. Provide as a list string, e.g. `"['Extract handwriting', 'Extract signatures']"`.
+ * **Default Value:** `[]`
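+
+  List-style values such as this one are written as Python-style list strings. One safe way to parse them (a hedged sketch, not necessarily how `tools/config.py` does it) is with `ast.literal_eval`:
+
+```python
+# Hedged sketch: parse a list-style environment variable such as
+# "['Extract handwriting', 'Extract signatures']" with ast.literal_eval.
+import ast
+import os
+
+raw = os.environ.get("DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX", "[]")
+selected_options = ast.literal_eval(raw) if raw else []
+print(selected_options)  # e.g. ['Extract handwriting', 'Extract signatures']
+```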
+
+* **`HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS`**
+ * **Description:** Full list of available options for Textract's handwriting and signature detection. Can include `'Extract handwriting'`, `'Extract signatures'`, and optionally `'Extract forms'`, `'Extract layout'`, `'Extract tables'`, and `'Face detection'` if the corresponding include options are enabled.
+ * **Default Value:** `['Extract handwriting', 'Extract signatures']`
+
+* **`INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION`**
+* **`INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION`**
+* **`INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION`**
+ * **Description:** Booleans (`"True"`/`"False"`) to include Forms, Layout, and Tables as selectable options for Textract analysis.
+ * **Default Value:** `"False"` for all.
+
+* **`INCLUDE_FACE_IDENTIFICATION_TEXTRACT_OPTION`**
+ * **Description:** When set to `"True"`, adds "Face detection" as a selectable option in the handwriting/signature checkbox group. When enabled with AWS Bedrock VLM OCR, a second VLM pass per page is run to identify faces (cost and time estimates account for this).
+ * **Default Value:** `"False"`
+
+### Tabular Data Options
+
+* **`DO_INITIAL_TABULAR_DATA_CLEAN`**
+ * **Description:** If `"True"`, performs an initial cleaning step on tabular data.
+ * **Default Value:** `"True"`
+
+* **`DEFAULT_TEXT_COLUMNS`** / **`DEFAULT_EXCEL_SHEETS`**
+ * **Description:** Default values for specifying which columns or sheets to process in tabular files.
+ * **Default Value:** `[]` (empty list)
+
+* **`DEFAULT_TABULAR_ANONYMISATION_STRATEGY`**
+ * **Description:** The default method for anonymising tabular data (e.g., "redact completely").
+ * **Default Value:** `"redact completely"`
+
+## Language Options
+
+Settings for multi-language support.
+
+* **`SHOW_LANGUAGE_SELECTION`**
+ * **Description:** If set to `"True"`, a language selection dropdown will be visible in the UI.
+ * **Default Value:** `"False"`
+
+* **`DEFAULT_LANGUAGE_FULL_NAME`** / **`DEFAULT_LANGUAGE`**
+ * **Description:** The default language's full name (e.g., "english") and its short code (e.g., "en").
+ * **Default Values:** `"english"`, `"en"`
+ * **Language Support Notes:**
+ * **Tesseract:** Ensure the Tesseract language data (e.g., `fra.traineddata`) is installed. Find language packs at [tesseract-ocr/tessdata](https://github.com/tesseract-ocr/tessdata).
+ * **PaddleOCR:** Ensure the PaddleOCR language data is installed. See supported languages at [PaddleOCR documentation](https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html).
+ * **AWS Comprehend:** Only English (`en`) and Spanish (`es`) are supported. See [AWS Comprehend PII documentation](https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html).
+ * **AWS Textract:** Automatically detects language and supports English, Spanish, Italian, Portuguese, French, and German. Note that handwriting, invoices, receipts, identity documents, and queries processing are in English only. See [AWS Textract FAQs](https://aws.amazon.com/textract/faqs/#topic-0).
+
+* **`textract_language_choices`** / **`aws_comprehend_language_choices`**
+ * **Description:** Lists of supported language codes for Textract and Comprehend.
+ * **Default Value:** `['en', 'es', 'fr', 'de', 'it', 'pt']` and `['en', 'es']`
+
+* **`SPLIT_PUNCTUATION_FROM_WORDS`**
+ * **Description:** If set to `"True"`, splits punctuation from words in Textract output.
+ * **Default Value:** `"False"`
+
+* **`MAPPED_LANGUAGE_CHOICES`** / **`LANGUAGE_CHOICES`**
+ * **Description:** Paired lists of full language names and their corresponding short codes for the UI dropdown.
+ * **Default Value:** Predefined lists (see script).
+
+## Duplicate Detection Settings
+
+* **`DEFAULT_DUPLICATE_DETECTION_THRESHOLD`**
+ * **Description:** The similarity score (0.0 to 1.0) above which documents/pages are considered duplicates.
+ * **Default Value:** `0.95`
+
+* **`DEFAULT_MIN_CONSECUTIVE_PAGES`**
+ * **Description:** Minimum number of consecutive pages that must be duplicates to be flagged.
+ * **Default Value:** `1`
+
+* **`USE_GREEDY_DUPLICATE_DETECTION`**
+ * **Description:** If `"True"`, uses a greedy algorithm that may find more duplicates but can be less precise.
+ * **Default Value:** `"True"`
+
+* **`DEFAULT_COMBINE_PAGES`**
+ * **Description:** If `"True"`, text from the same page number across different files is combined before checking for duplicates. If `"False"`, line-level duplicate detection will be enabled instead.
+ * **Default Value:** `"True"`
+
+* **`DEFAULT_MIN_WORD_COUNT`**
+ * **Description:** Pages with fewer words than this value will be ignored by the duplicate detector.
+ * **Default Value:** `10`
+
+* **`REMOVE_DUPLICATE_ROWS`**
+ * **Description:** If `"True"`, enables duplicate row detection in tabular data.
+ * **Default Value:** `"False"`
+
+## File Output Options
+
+* **`USE_GUI_BOX_COLOURS_FOR_OUTPUTS`**
+ * **Description:** If `"True"`, the final redacted PDF will use the same redaction box colors as shown in the review UI.
+ * **Default Value:** `"False"`
+
+* **`CUSTOM_BOX_COLOUR`**
+ * **Description:** Specifies the color for redaction boxes as an RGB tuple string, e.g., `"(0, 0, 0)"` for black. Alternatively, you can use the named color `"grey"` (which maps to RGB `(128, 128, 128)`).
+ * **Default Value:** `"(0, 0, 0)"`
+
+* **`APPLY_REDACTIONS_IMAGES`**, **`APPLY_REDACTIONS_GRAPHICS`**, **`APPLY_REDACTIONS_TEXT`**
+ * **Description:** Advanced control over how redactions are applied to underlying images, vector graphics, and text in the PDF, based on PyMuPDF options. `0` is the default for a standard redaction workflow.
+ * **Default Value:** `0` for all.
+ * **Detailed Options:**
+ * **`APPLY_REDACTIONS_IMAGES`:** `0` = ignore (default), `1` = completely remove images overlapping redaction annotations, `2` = blank out overlapping pixels, `3` = only remove images that are actually visible. Note: Text in images is effectively removed by the overlapping rectangle shape.
+ * **`APPLY_REDACTIONS_GRAPHICS`:** `0` = ignore (default), `1` = remove graphics fully contained in redaction annotation, `2` = remove any overlapping vector graphics.
+ * **`APPLY_REDACTIONS_TEXT`:** `0` = remove all characters whose boundary box overlaps any redaction rectangle (default, complies with legal/data protection intentions), `1` = keep text while redacting graphics/images (does NOT comply with data protection intentions - use at your own risk).
+
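+For orientation, here is a minimal PyMuPDF sketch (not the app's code; it assumes a recent PyMuPDF release in which `Page.apply_redactions` accepts the `images`, `graphics`, and `text` keyword arguments) showing where these three values are used:
+
+```python
+# Minimal PyMuPDF sketch (assumes a recent PyMuPDF where apply_redactions
+# accepts images/graphics/text keyword arguments).
+import fitz  # PyMuPDF
+
+doc = fitz.open("input.pdf")
+page = doc[0]
+page.add_redact_annot(fitz.Rect(72, 72, 300, 100), fill=(0, 0, 0))
+page.apply_redactions(images=0, graphics=0, text=0)  # the defaults described above
+doc.save("input_redacted.pdf")
+```
+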
+* **`RETURN_PDF_FOR_REVIEW`**
+ * **Description:** If set to `"True"`, a PDF with redaction boxes drawn on it (but text not removed) is generated for the "Review" tab.
+ * **Default Value:** `"True"`
+
+* **`RETURN_REDACTED_PDF`**
+ * **Description:** If set to `'True'`, the application will return a fully redacted PDF at the end of the main task.
+ * **Default Value:** `"True"`
+
+* **`COMPRESS_REDACTED_PDF`**
+ * **Description:** If set to `'True'`, the redacted PDF output will be compressed.
+ * **Default Value:** `"False"`
+ * **Warning:** On low memory systems, the compression options in PyMuPDF can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to `"False"` will save the PDF only with a basic cleaning option enabled, which is more memory-efficient.
+
+* **`SAVE_OUTPUTS_TO_S3`**
+ * **Description:** If set to `'True'`, the application will automatically upload redaction outputs (PDFs, text/tabular outputs, duplicate-analysis files, and Adobe XFDF review files) to Amazon S3 when `RUN_AWS_FUNCTIONS` is also enabled. Uploads use the `S3_OUTPUTS_FOLDER` prefix within the `DOCUMENT_REDACTION_BUCKET`, optionally with a per-session subfolder when `SESSION_OUTPUT_FOLDER` is enabled.
+ * **Default Value:** `'False'`
+
+* **`S3_OUTPUTS_FOLDER`**
+ * **Description:** Base S3 key prefix (folder path) within `DOCUMENT_REDACTION_BUCKET` where redaction outputs are stored, for example `'outputs/'` or `'redaction/outputs/'`. When `SESSION_OUTPUT_FOLDER` is `'True'`, a session-specific subfolder (based on `session_hash`) is appended to this path so each user/session writes to its own S3 subdirectory. If left blank, outputs will not be uploaded to S3 even if `SAVE_OUTPUTS_TO_S3` is `'True'`.
+ * **Default Value:** `''`
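+
+  A hedged boto3 sketch of how an output key might be assembled from these settings (the session hash and file names below are made up; the real upload logic lives in the app):
+
+```python
+# Hedged boto3 sketch: build an S3 key from S3_OUTPUTS_FOLDER plus an optional
+# per-session subfolder, then upload into DOCUMENT_REDACTION_BUCKET.
+# The session hash and file names below are made up for illustration.
+import os
+import boto3
+
+bucket = os.environ["DOCUMENT_REDACTION_BUCKET"]
+prefix = os.environ.get("S3_OUTPUTS_FOLDER", "")  # e.g. "outputs/"
+session_subfolder = "1a2b3c/"  # appended when SESSION_OUTPUT_FOLDER is enabled
+
+key = f"{prefix}{session_subfolder}example_redacted.pdf"
+boto3.client("s3").upload_file("output/example_redacted.pdf", bucket, key)
+```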
+
+* **`S3_OUTPUTS_BUCKET`**
+ * **Description:** Name of the S3 bucket where redaction outputs are stored.
+ * **Default Value:** Inherits from `DOCUMENT_REDACTION_BUCKET`.
+
+## Direct Mode & Lambda Configuration
+
+Settings for running the application from the command line (Direct Mode) or as an AWS Lambda function.
+
+**Note:** Many `DIRECT_MODE_*` variables inherit their default values from their corresponding non-direct-mode variables if not explicitly set. For example, `DIRECT_MODE_LANGUAGE` defaults to `DEFAULT_LANGUAGE`, `DIRECT_MODE_IMAGES_DPI` defaults to `IMAGES_DPI`, etc.
+
+### Direct Mode
+
+* **`RUN_DIRECT_MODE`**
+ * **Description:** Set to `'True'` to enable direct command-line mode.
+ * **Default Value:** `'False'`
+
+* **`DIRECT_MODE_DEFAULT_USER`**
+ * **Description:** Default username for CLI requests.
+ * **Default Value:** `''`
+
+* **`DIRECT_MODE_TASK`**
+ * **Description:** The task to perform: `'redact'` or `'deduplicate'`.
+ * **Default Value:** `'redact'`
+
+* **`DIRECT_MODE_INPUT_FILE`** / **`DIRECT_MODE_OUTPUT_DIR`**
+ * **Description:** Path to the input file and output directory for the task.
+ * **Default Values:** `''`, `output/`
+
+* **`DIRECT_MODE_DUPLICATE_TYPE`**
+ * **Description:** Type of duplicate detection for direct mode: `'pages'` or `'tabular'`.
+ * **Default Value:** `'pages'`
+
+* **`DIRECT_MODE_LANGUAGE`**
+ * **Description:** Language for document processing in direct mode.
+ * **Default Value:** Inherits from `DEFAULT_LANGUAGE`
+
+* **`DIRECT_MODE_PII_DETECTOR`**
+ * **Description:** PII detection method for direct mode.
+ * **Default Value:** Inherits from `LOCAL_PII_OPTION`
+
+* **`DIRECT_MODE_OCR_METHOD`**
+ * **Description:** OCR method for PDF/image processing in direct mode.
+ * **Default Value:** `"Local OCR"`
+
+* **`DIRECT_MODE_OCR_FIRST_PASS_MAX_WORKERS`**
+ * **Description:** Maximum number of threads for the OCR first pass in direct mode. Inherits from `OCR_FIRST_PASS_MAX_WORKERS` if not set.
+ * **Default Value:** Same as `OCR_FIRST_PASS_MAX_WORKERS` (e.g. `3`)
+
+* **`DIRECT_MODE_SUMMARY_PAGE_GROUP_MAX_WORKERS`**
+ * **Description:** Maximum number of threads for page-group summarisation in direct mode (when task is summarise). Inherits from `SUMMARY_PAGE_GROUP_MAX_WORKERS` if not set.
+ * **Default Value:** Same as `SUMMARY_PAGE_GROUP_MAX_WORKERS` (e.g. `1`)
+
+* **`DIRECT_MODE_PAGE_MIN`** / **`DIRECT_MODE_PAGE_MAX`**
+ * **Description:** First and last page to process in direct mode. `0` for max means process all pages.
+ * **Default Values:** Inherit from `DEFAULT_PAGE_MIN` / `DEFAULT_PAGE_MAX`
+
+* **`DIRECT_MODE_IMAGES_DPI`**
+ * **Description:** DPI for image processing in direct mode.
+ * **Default Value:** Inherits from `IMAGES_DPI`
+
+* **`DIRECT_MODE_CHOSEN_LOCAL_OCR_MODEL`**
+ * **Description:** Local OCR model choice for direct mode.
+ * **Default Value:** Inherits from `DEFAULT_LOCAL_OCR_MODEL`
+
+* **`DIRECT_MODE_PREPROCESS_LOCAL_OCR_IMAGES`**
+ * **Description:** If set to `"True"`, preprocesses images before OCR in direct mode.
+ * **Default Value:** Inherits from `PREPROCESS_LOCAL_OCR_IMAGES`
+
+* **`DIRECT_MODE_COMPRESS_REDACTED_PDF`**
+ * **Description:** If set to `"True"`, compresses the redacted PDF output in direct mode.
+ * **Default Value:** Inherits from `COMPRESS_REDACTED_PDF`
+
+* **`DIRECT_MODE_RETURN_PDF_END_OF_REDACTION`**
+ * **Description:** If set to `"True"`, returns a PDF at the end of redaction in direct mode.
+ * **Default Value:** Inherits from `RETURN_REDACTED_PDF`
+
+* **`DIRECT_MODE_EXTRACT_FORMS`**
+ * **Description:** If set to `"True"`, extracts forms during Textract analysis in direct mode.
+ * **Default Value:** `"False"`
+
+* **`DIRECT_MODE_EXTRACT_TABLES`**
+ * **Description:** If set to `"True"`, extracts tables during Textract analysis in direct mode.
+ * **Default Value:** `"False"`
+
+* **`DIRECT_MODE_EXTRACT_LAYOUT`**
+ * **Description:** If set to `"True"`, extracts layout during Textract analysis in direct mode.
+ * **Default Value:** `"False"`
+
+* **`DIRECT_MODE_EXTRACT_SIGNATURES`**
+ * **Description:** If set to `"True"`, extracts signatures during Textract analysis in direct mode.
+ * **Default Value:** `"False"`
+
+* **`DIRECT_MODE_MATCH_FUZZY_WHOLE_PHRASE_BOOL`**
+ * **Description:** If set to `"True"`, matches fuzzy whole phrases in direct mode.
+ * **Default Value:** `"True"`
+
+* **`DIRECT_MODE_ANON_STRATEGY`**
+ * **Description:** Anonymisation strategy for tabular data in direct mode.
+ * **Default Value:** Inherits from `DEFAULT_TABULAR_ANONYMISATION_STRATEGY`
+
+* **`DIRECT_MODE_FUZZY_MISTAKES`**
+ * **Description:** Number of fuzzy spelling mistakes allowed in direct mode.
+ * **Default Value:** Inherits from `DEFAULT_FUZZY_SPELLING_MISTAKES_NUM`
+
+* **`DIRECT_MODE_SIMILARITY_THRESHOLD`**
+ * **Description:** Similarity threshold for duplicate detection in direct mode.
+ * **Default Value:** Inherits from `DEFAULT_DUPLICATE_DETECTION_THRESHOLD`
+
+* **`DIRECT_MODE_MIN_WORD_COUNT`**
+ * **Description:** Minimum word count for duplicate detection in direct mode.
+ * **Default Value:** Inherits from `DEFAULT_MIN_WORD_COUNT`
+
+* **`DIRECT_MODE_MIN_CONSECUTIVE_PAGES`**
+ * **Description:** Minimum consecutive pages for duplicate detection in direct mode.
+ * **Default Value:** Inherits from `DEFAULT_MIN_CONSECUTIVE_PAGES`
+
+* **`DIRECT_MODE_GREEDY_MATCH`**
+ * **Description:** If set to `"True"`, uses greedy matching for duplicate detection in direct mode.
+ * **Default Value:** Inherits from `USE_GREEDY_DUPLICATE_DETECTION`
+
+* **`DIRECT_MODE_COMBINE_PAGES`**
+ * **Description:** If set to `"True"`, combines pages for duplicate detection in direct mode.
+ * **Default Value:** Inherits from `DEFAULT_COMBINE_PAGES`
+
+* **`DIRECT_MODE_REMOVE_DUPLICATE_ROWS`**
+ * **Description:** If set to `"True"`, removes duplicate rows in tabular data in direct mode.
+ * **Default Value:** Inherits from `REMOVE_DUPLICATE_ROWS`
+
+* **`DIRECT_MODE_TEXTRACT_ACTION`**
+ * **Description:** Textract action for batch operations in direct mode.
+ * **Default Value:** `''`
+
+* **`DIRECT_MODE_JOB_ID`**
+ * **Description:** Job ID for Textract operations in direct mode.
+ * **Default Value:** `''`
+
+* **`DIRECT_MODE_INFERENCE_SERVER_MODEL`**
+ * **Description:** Inference server model to use for PII detection in direct mode. If empty, uses `CHOSEN_INFERENCE_SERVER_PII_MODEL`.
+ * **Default Value:** `''` (inherits from `CHOSEN_INFERENCE_SERVER_PII_MODEL` when set)
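+
+  A hedged example of configuring a basic direct-mode redaction run purely through the environment variables documented above (the input file name is arbitrary, and the entrypoint you launch afterwards depends on your deployment):
+
+```python
+# Hedged example: set the documented DIRECT_MODE_* variables before launching
+# the application. The input file name below is arbitrary.
+import os
+
+os.environ.update(
+    {
+        "RUN_DIRECT_MODE": "True",
+        "DIRECT_MODE_TASK": "redact",
+        "DIRECT_MODE_INPUT_FILE": "example.pdf",
+        "DIRECT_MODE_OUTPUT_DIR": "output/",
+        "DIRECT_MODE_OCR_METHOD": "Local OCR",
+        "DIRECT_MODE_PAGE_MIN": "0",
+        "DIRECT_MODE_PAGE_MAX": "0",  # 0 means process all pages
+    }
+)
+```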
+
+### Lambda Configuration
+
+* **`LAMBDA_POLL_INTERVAL`**
+ * **Description:** Polling interval in seconds for checking Textract job status.
+ * **Default Value:** `30`
+
+* **`LAMBDA_MAX_POLL_ATTEMPTS`**
+ * **Description:** Maximum number of polling attempts before timeout.
+ * **Default Value:** `120`
+
+* **`LAMBDA_PREPARE_IMAGES`**
+ * **Description:** If `"True"`, prepares images for OCR processing within the Lambda environment.
+ * **Default Value:** `"True"`
+
+* **`LAMBDA_EXTRACT_SIGNATURES`**
+ * **Description:** Enables signature extraction during Textract analysis in Lambda.
+ * **Default Value:** `"False"`
+
+* **`LAMBDA_DEFAULT_USERNAME`**
+ * **Description:** Default username for operations initiated by Lambda.
+ * **Default Value:** `"lambda_user"`
+
+## Allow, Deny, & Whole Page Redaction Lists
+
+* **`GET_DEFAULT_ALLOW_LIST`**, **`GET_DEFAULT_DENY_LIST`**, **`GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST`**
+ * **Description:** Booleans (`"True"`/`"False"`) to enable the use of allow, deny, or whole-page redaction lists.
+ * **Default Value:** `"False"`
+ * **Note:** `GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST` is stored as a string value, not converted to boolean (unlike the other two variables).
+
+* **`ALLOW_LIST_PATH`**, **`DENY_LIST_PATH`**, **`WHOLE_PAGE_REDACTION_LIST_PATH`**
+ * **Description:** Local paths to the respective CSV list files.
+ * **Default Value:** `''`
+
+* **`S3_ALLOW_LIST_PATH`**, **`S3_DENY_LIST_PATH`**, **`S3_WHOLE_PAGE_REDACTION_LIST_PATH`**
+ * **Description:** Paths to the respective list files within the `DOCUMENT_REDACTION_BUCKET`.
+ * **Default Value:** `''`
+
+## Cost Code Options
+
+* **`SHOW_COSTS`**
+ * **Description:** If set to `'True'`, cost-related information will be displayed in the UI.
+ * **Default Value:** `'False'`
+
+* **`GET_COST_CODES`**
+ * **Description:** Enables fetching and using cost codes. Set to `'True'` to enable.
+ * **Default Value:** `'False'`
+
+* **`DEFAULT_COST_CODE`**
+ * **Description:** Specifies a default cost code.
+ * **Default Value:** `''`
+
+* **`COST_CODES_PATH`** / **`S3_COST_CODES_PATH`**
+ * **Description:** Local or S3 path to a CSV file containing available cost codes.
+ * **Default Value:** `''`
+ * **File Format:** The CSV file should contain a single table with two columns and a header row. The first column should contain cost codes, and the second column should contain a name or description for each cost code.
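+
+  An illustrative way to create a file in this format with pandas (the file name, codes, and descriptions below are made up):
+
+```python
+# Illustrative only: a cost codes CSV with a header row, codes in the first
+# column and descriptions in the second. All values here are made up.
+import pandas as pd
+
+pd.DataFrame(
+    {"cost_code": ["CC-1001", "CC-1002"],
+     "description": ["Casework team", "Policy team"]}
+).to_csv("cost_codes.csv", index=False)
+```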
+
+* **`ENFORCE_COST_CODES`**
+ * **Description:** If set to `'True'`, makes the selection of a cost code mandatory.
+ * **Default Value:** `'False'`
+
+### Session default cost code (no extra environment variables)
+
+When cost codes are enabled (`GET_COST_CODES` or `ENFORCE_COST_CODES`), users can save a **default cost code** for their session via the "Set default cost code" button. This uses the **existing** cost code and AWS settings; no additional environment variables are required.
+
+* **Behaviour:**
+ * The chosen cost code is stored in a CSV file (`session_default_cost_codes.csv`) with columns `session_hash`, `default_cost_code`, and `saved_at` (ISO timestamp). Duplicate `session_hash` rows are deduplicated on save, keeping only the latest.
+ * **Local save location:** The CSV is written under the app’s **input folder** (the value of the input folder at save time, e.g. from `INPUT_FOLDER` or the session’s `input_folder_textbox`).
+ * **S3 (optional):** If `S3_COST_CODES_PATH` is set and `RUN_AWS_FUNCTIONS` and `DOCUMENT_REDACTION_BUCKET` are set, the same file is also uploaded to `DOCUMENT_REDACTION_BUCKET` in the same key “folder” as the cost codes file (e.g. `config/session_default_cost_codes.csv`).
+ * **Email-only:** Saves and S3 upload only run when the session identifier is a valid **email address** (e.g. when users are signed in via Cognito). If the session identifier is not an email (e.g. when running locally with a Gradio session hash), the default is not written to the CSV and not uploaded to S3. On load, the app does not download the session default cost codes file from S3 when the session identifier is not an email.
+
+* **Relevant existing variables:** `GET_COST_CODES`, `ENFORCE_COST_CODES`, `COST_CODES_PATH`, `S3_COST_CODES_PATH`, `DOCUMENT_REDACTION_BUCKET`, `RUN_AWS_FUNCTIONS`, and the app input folder (e.g. `INPUT_FOLDER` / `GRADIO_INPUT_FOLDER`).
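+
+A hedged pandas sketch of the deduplication-on-save behaviour described above (keep only the latest row per `session_hash`, based on `saved_at`):
+
+```python
+# Hedged sketch of the dedup-on-save behaviour: keep only the most recent row
+# per session_hash, based on the saved_at ISO timestamp.
+import pandas as pd
+
+df = pd.read_csv("session_default_cost_codes.csv")
+df["saved_at"] = pd.to_datetime(df["saved_at"])
+latest = (
+    df.sort_values("saved_at")
+    .drop_duplicates(subset="session_hash", keep="last")
+    .reset_index(drop=True)
+)
+latest.to_csv("session_default_cost_codes.csv", index=False)
+```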
+
+## Whole Document API Options (Textract Async)
+
+* **`SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS`**
+ * **Description:** Controls whether UI options for asynchronous whole document Textract calls are displayed.
+ * **Default Value:** `'False'`
+ * **Note:** This feature is not currently fully implemented in the application.
+
+* **`TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET`**
+ * **Description:** The S3 bucket used for asynchronous Textract analysis.
+ * **Default Value:** `''`
+
+* **`TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER`** / **`..._OUTPUT_SUBFOLDER`**
+ * **Description:** Input and output subfolders within the analysis bucket.
+ * **Default Values:** `'input'`, `'output'`
+
+* **`LOAD_PREVIOUS_TEXTRACT_JOBS_S3`**
+ * **Description:** If set to `'True'`, the application will load data from previous Textract jobs stored in S3.
+ * **Default Value:** `'False'`
+
+* **`TEXTRACT_JOBS_S3_LOC`** / **`TEXTRACT_JOBS_S3_INPUT_LOC`**
+ * **Description:** S3 subfolders where Textract job output and input are stored.
+ * **Default Value:** `'output'`, `'input'`
+
+* **`TEXTRACT_JOBS_LOCAL_LOC`**
+ * **Description:** The local subfolder for storing Textract job data.
+ * **Default Value:** `'output'`
+
+* **`DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS`**
+ * **Description:** Specifies the number of past days for which to display whole document Textract jobs.
+ * **Default Value:** `7`
\ No newline at end of file
diff --git a/src/faq.qmd b/src/faq.qmd
new file mode 100644
index 0000000000000000000000000000000000000000..e25522cce183d3a163ebec6be222c393fc1da684
--- /dev/null
+++ b/src/faq.qmd
@@ -0,0 +1,311 @@
+---
+title: "User FAQ"
+format:
+ html:
+ toc: true # Enable the table of contents
+ toc-depth: 3 # Include headings up to level 2 (##)
+ toc-title: "On this page" # Optional: Title for your TOC
+---
+
+## General advice
+* **Read the User Guide**: Many common questions are addressed in the detailed User Guide sections.
+* **Start Simple**: If you're new, try redacting with default options first before customising extensively.
+* **Human Review is Key**: Always manually review the `...redacted.pdf` or use the '**Review redactions**' tab. No automated system is perfect.
+* **Save Incrementally**: When working on the '**Review redactions**' tab, use the '**Save changes on current page to file**' button periodically, especially for large documents.
+
+## General questions
+
+#### What is document redaction and what does this app do?
+Document redaction is the process of removing sensitive or personally identifiable information (PII) from documents. This application is a tool that automates this process for various document types, including PDFs, images, open text, and tabular data (`XLSX`/`CSV`/`Parquet`). It identifies potential PII using different methods and allows users to review, modify, and export the suggested redactions.
+
+#### What types of documents and data can be redacted?
+The app can handle a variety of formats. For documents, it supports `PDF`s and images (`JPG`, `PNG`). For tabular data, it works with `XLSX`, `CSV`, and `Parquet` files. Additionally, it can redact open text that is copied and pasted directly into the application interface.
+
+#### How does the app identify text and PII for redaction?
+The app employs several methods for text extraction and PII identification. Text can be extracted directly from selectable `PDF` text, using a local Optical Character Recognition (OCR) model for image-based content, or through the **AWS Textract service** for more complex documents, handwriting, and signatures (if available). For PII identification, it can use a local model based on the `spacy` package or the **AWS Comprehend service** for more accurate results (if available).
+
+#### Can I customise what information is redacted?
+Yes, the app offers extensive customisation options. You can define terms that should never be redacted (an '**allow list**'), terms that should always be redacted (a '**deny list**'), and specify entire pages to be fully redacted using `CSV` files. You can also select specific types of entities to redact, such as dates, or remove default entity types that are not relevant to your needs.
+
+#### How can I review and modify the suggested redactions?
+The app provides a dedicated '**Review redactions**' tab with a visual interface. You can upload the original document and the generated review file (`CSV`) to see the suggested redactions overlaid on the document. Here, you can move, resize, delete, and add new redaction boxes. You can also filter suggested redactions based on criteria and exclude them individually or in groups.
+
+#### Can I work with tabular data or copy and pasted text?
+Yes, the app has a dedicated tab for redacting tabular data files (`XLSX`/`CSV`) and open text. For tabular data, you can upload your file and select which columns to redact. For open text, you can simply paste the text into a box. You can then choose the redaction method and the desired output format for the anonymised data.
+
+#### What are the options for the anonymisation format of redacted text?
+When redacting tabular data or open text, you have several options for how the redacted information is replaced. The default is to replace the text with '**REDACTED**'. Other options include replacing it with the entity type (e.g., 'PERSON'), redacting completely (removing the text), replacing it with a consistent hash value, or masking it with stars ('*').
+
+#### Can I export or import redactions to/from other software like Adobe Acrobat?
+Yes, the app supports exporting and importing redaction data using the **Adobe Acrobat** comment file format (`.xfdf`). You can export suggested redactions from the app to an `.xfdf` file that can be opened in **Adobe**. Conversely, you can import an `.xfdf` file created in **Adobe** into the app to generate a review file (`CSV`) for further work within the application.
+
+#### Is there a way to try the app without uploading my own documents first?
+Yes. The app includes built-in examples on several tabs so you can see how it works before using your own files.
+* On the **'Redact PDFs/images'** tab, look for the **"Try an example"** section. Click any example to load it with pre-configured settings, then click **'Extract text and redact document'** to run it. Examples include selectable-text PDFs, image OCR, custom entity selection, and deny list / whole-page redaction scenarios.
+* On the **'Word or Excel/CSV files'** tab, you will find examples for `CSV` redaction, Word document redaction, and Excel duplicate detection. Click an example, then click **'Redact text/data files'** to process it.
+
+#### What is the `...redactions_for_review.pdf` file, and how is it different from `...redacted.pdf`?
+The app produces two different `PDF` outputs after redaction:
+* **`...redacted.pdf`** — the final output with redacted text permanently removed and replaced by black boxes. This is the document you would share externally.
+* **`...redactions_for_review.pdf`** — the original document with redaction boxes overlaid but the underlying text still visible. This is a working file intended for review. It can be opened in **Adobe Acrobat** to inspect suggested redactions, and it can be re-uploaded to the app's **'Review redactions'** tab to continue working on redactions at a later session.
+
+#### Does the app support Word (.docx) documents?
+Yes. In addition to `PDF`s, images, `CSV`, and `XLSX` files, the app can also redact **Word** (`.docx`) documents. Go to the **'Word or Excel/CSV files'** tab and upload your `.docx` file. The redaction method and anonymisation output format options available for tabular data apply equally to Word documents.
+
+#### What do the 'Extract text only' and 'Redact selected terms' options do?
+Under **'Redaction settings'** on the **'Redact PDFs/images'** tab, the **'Choose redaction method'** radio button has three options:
+* **'Extract text only'** — runs text extraction (OCR) without applying any redactions. Useful when you only need the `ocr_output.csv` text output or want to inspect what was extracted before deciding on redactions.
+* **'Redact all PII'** (the default) — uses the chosen PII detection method to find and redact personal information across all selected entity types.
+* **'Redact selected terms'** — focuses redaction only on the specific terms in your custom deny list. No automatic PII detection is run; only the terms you have listed will be redacted.
+
+## Troubleshooting
+
+#### Q1: The app missed some personal information or redacted things it shouldn't have. Is it broken?
+A: Not necessarily. The app is not 100% accurate and is designed as an aid. The `README` explicitly states: "**NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed by a human before using the final outputs.**"
+* **Solution**: Always use the '**Review redactions**' tab to manually inspect, add, remove, or modify redactions.
+
+#### Q2: I uploaded a `PDF`, but no text was found, or redactions are very poor using the '**Local model - selectable text**' option.
+A: This option only works if your `PDF` has actual selectable text. If your `PDF` is an image scan (even if it looks like text), this method won't work well.
+* **Solution**:
+ * Try the '**Local OCR model - PDFs without selectable text**' option. This uses Tesseract OCR to "read" the text from images.
+ * For best results, especially with complex documents, handwriting, or signatures, use the '**AWS Textract service - all PDF types**' if available.
+
+#### Q3: Handwriting or signatures are not being redacted properly.
+A: The '**Local**' text/OCR methods (selectable text or Tesseract) struggle with handwriting and signatures.
+* **Solution**:
+ * Use the '**AWS Textract service**' for text extraction.
+ * Ensure that on the main '**Redact PDFs/images**' tab, under "**Optional - select signature extraction**" (when **AWS Textract** is chosen), you have enabled handwriting and/or signature detection. Note that signature detection has higher cost implications.
+
+#### Q4: The options for '**AWS Textract service**' or '**AWS Comprehend**' are missing or greyed out.
+A: These services are typically only available when the app is running in an **AWS** environment or has been specifically configured by your system admin to access these services (e.g., via `API` keys).
+* **Solution**:
+ * Check if your instance of the app is supposed to have **AWS** services enabled.
+ * If running outside **AWS**, see the "**Using AWS Textract and Comprehend when not running in an AWS environment**" section in the advanced guide. This involves configuring **AWS** access keys, which should be done with IT and data security approval.
+
+#### Q5: I re-processed the same document, and it seems to be taking a long time and potentially costing more with **AWS** services. Can I avoid this?
+A: Yes. If you have previously processed a document with **AWS Textract** or the **Local OCR** model, the app generates a `.json` output file (`..._textract.json` or `..._ocr_results_with_words.json`).
+* **Solution**: When re-uploading your original document for redaction, also upload the corresponding `.json` file. The app should detect this (the "**Existing Textract output file found**" box may be checked), skipping the expensive text extraction step.
+
+#### Q6: My app crashed, or I reloaded the page. Are my output files lost?
+A: If you are logged in via **AWS Cognito** and the server hasn't been shut down, you might be able to recover them.
+* **Solution**: Go to the '**Settings**' tab and open '**View and download all output files from this session**'. Click **'Refresh files in output folder'** to load the list, then tick the box next to a file to display and download it.
+
+#### Q7: My custom allow list (terms to never redact) or deny list (terms to always redact) isn't working.
+A: There are a few common reasons:
+* **File Format**: Ensure your list is a `.csv` file with terms in the first column only, with no column header.
+* **Case Sensitivity**: Terms in the allow/deny list are case sensitive.
+* **Deny List & 'CUSTOM' Entity**: For a deny list to work, you must select the '**CUSTOM**' entity type in '**Redaction settings**' under '**Entities to redact**'.
+* **Manual Additions**: If you manually added terms in the app interface (under '**Manually modify custom allow...**'), ensure you pressed `Enter` after typing each term in its cell.
+* **Fuzzy Search for Deny List**: If you intend to use fuzzy matching for your deny list, ensure '**CUSTOM_FUZZY**' is selected as an entity type, and you've configured the "**maximum number of spelling mistakes allowed.**"
+
+#### Q8: I'm trying to review redactions, but the `PDF` in the viewer looks like it's already redacted with black boxes.
+A: You likely uploaded the `...redacted.pdf` file (the final output with text permanently removed) instead of the correct file.
+* **Solution**: On the '**Review redactions**' tab, the **first upload box (1.)** accepts either:
+ * The **original, unredacted `PDF`** — if you are starting a fresh review and want to see all suggested redactions overlaid, or
+ * The **`...redactions_for_review.pdf`** — if you want to reload a previous set of redactions (this file shows the original text with redaction boxes overlaid but the underlying text still visible).
+ The **second upload box (2.)** is for an `...ocr_results_with_words.csv` or `...ocr_output.csv` file, which enables the **'Search text and redact'** and **'View text'** features. Do **not** upload the `...redacted.pdf` (the version with black boxes) to either box.
+
+#### Q9: I can't move or pan the document in the '**Review redactions**' viewer when zoomed in.
+A: You are likely in "**add redaction boxes**" mode.
+* **Solution**: Scroll to the bottom of the document viewer pane and click the hand icon. This switches to "**modify mode**," allowing you to pan the document by clicking and dragging, and also to move/resize existing redaction boxes.
+
+#### Q10: I accidentally clicked "**Exclude all items in table from redactions**" on the '**Review redactions**' tab without filtering, and now all my redactions are gone!
+A: This can happen if you don't apply a filter first.
+* **Solution**: Click the '**Undo last element removal**' button immediately. This should restore the redactions. Always ensure you have clicked the blue tick icon next to the search box to apply your filter before using "**Exclude all items...**".
+
+#### Q11: Redaction of my `CSV` or `XLSX` file isn't working correctly.
+A: The app expects a specific format for tabular data.
+* **Solution**: Ensure your data file has a simple table format, with the table starting in the first cell (`A1`). There should be no other information or multiple tables within the sheet you intend to redact. For `XLSX` files, each sheet to be redacted must follow this format.
+
+#### Q12: The "**Identify duplicate pages**" feature isn't finding duplicates I expect, or it's flagging too many pages.
+A: This feature uses text similarity based on the `ocr_output.csv` files. The default similarity threshold is 0.95 (95%), which may be too strict or too lenient for your documents.
+* **Solution**:
+ * Ensure you've uploaded the correct `ocr_output.csv` files for all documents you're comparing (these are generated every time you run a redaction task).
+ * On the **'Identify duplicate pages'** tab, open the **'Duplicate matching parameters'** accordion to adjust:
+ * **Similarity threshold** (0–1): Lower this to catch more (potentially looser) matches; raise it to require more exact matches.
+ * **Minimum word count**: Pages with fewer words than this are ignored — useful for skipping near-blank pages.
+ * **Duplicate matching mode**: Choose between **'Find duplicates by page'** (compares full-page text) and **'Find duplicates by text line'** (compares individual lines).
+ * Review the `page_similarity_results.csv` output to inspect the similarity scores and verify matched text side-by-side in the interactive preview.
+
+#### Q13: I exported a review file to Adobe (`.xfdf`), but when I open it in Adobe Acrobat, it can't find the `PDF` or shows no redactions.
+A: When **Adobe Acrobat** prompts you, it needs to be pointed to the exact original `PDF`.
+* **Solution**: Ensure you select the original, unredacted `PDF` file that was used to generate the `..._review_file.csv` (and subsequently the `.xfdf` file) when **Adobe Acrobat** asks for the associated document.
+
+#### Q14: My **AWS Textract API** job (submitted via "**Submit whole document to AWS Textract API...**") is taking a long time, or I don't know if it's finished.
+A: Large documents can take time. As a rough guide, allow about five seconds of processing per page.
+* **Solution**:
+ * After submitting, a **Job ID** will appear.
+ * Periodically click the '**Check status of Textract job and download**' button. Processing continues in the background.
+ * Once ready, the `_textract.json` output will appear in the output area.
+
+#### Q15: I'm trying to redact specific terms from my deny list, but they are not being picked up, even though the '**CUSTOM**' entity is selected.
+A: The deny list matches whole words with exact spelling by default.
+* **Solution**:
+ * Double-check the spelling and case in your deny list.
+ * If you expect misspellings to be caught, you need to use the '**CUSTOM_FUZZY**' entity type and configure the "**maximum number of spelling mistakes allowed**" under '**Redaction settings**'. Then, upload your deny list.
+
+#### Q16: I set the "**Lowest page to redact**" and "**Highest page to redact**" in '**Redaction settings**', but the app still seems to process or show redactions outside this range.
+A: The page range setting primarily controls which pages have redactions applied in the final `...redacted.pdf`. The underlying text extraction (especially with OCR/Textract) might still process the whole document to generate the `...ocr_results.csv` or `..._textract.json`. When reviewing, the `review_file.csv` might initially contain all potential redactions found across the document.
+* **Solution**:
+ * Ensure the `...redacted.pdf` correctly reflects the page range.
+ * When reviewing, use the page navigation and filters on the '**Review redactions**' tab to focus on your desired page range. The final application of redactions from the review tab should also respect the range if it's still set, but primarily it works off the `review_file.csv`.
+
+#### Q17: My "**Full page redaction list**" isn't working. I uploaded a `CSV` with page numbers, but those pages aren't blacked out.
+A: Common issues include:
+* **File Format**: Ensure your list is a `.csv` file with page numbers in the first column only, with no column header. Each page number should be on a new row.
+* **Redaction Task**: Simply uploading the list doesn't automatically redact. You need to:
+ 1. Upload the `PDF` you want to redact.
+ 2. Upload the full page redaction `CSV` in '**Redaction settings**'.
+ 3. It's often best to deselect all other entity types in '**Redaction settings**' if you only want to redact these full pages.
+ 4. Run the '**Redact document**' process. The output `...redacted.pdf` should show the full pages redacted, and the `...review_file.csv` will list these pages.
+
+#### Q18: I merged multiple `...review_file.csv` files, but the output seems to have duplicate redaction boxes or some are missing.
+A: The merge feature simply combines all rows from the input review files.
+* **Solution**:
+ * **Duplicates**: If the same redaction (same location, text, label) was present in multiple input files, it will appear multiple times in the merged file. You'll need to manually remove these duplicates on the '**Review redactions**' tab or by editing the merged `...review_file.csv` in a spreadsheet editor before review.
+ * **Missing**: Double-check that all intended `...review_file.csv` files were correctly uploaded for the merge. Ensure the files themselves contained the expected redactions.
+
+#### Q19: I imported an `.xfdf` Adobe comment file, but the `review_file.csv` generated doesn't accurately reflect the highlights or comments I made in Adobe Acrobat.
+A: The app converts Adobe's comment/highlight information into its review_file format. Discrepancies can occur if:
+* **Comment Types**: The app primarily looks for highlight-style annotations that it can interpret as redaction areas. Other Adobe comment types (e.g., sticky notes without highlights, text strike-throughs not intended as redactions) might not translate.
+* **Complexity**: Very complex or unusually shaped Adobe annotations might not convert perfectly.
+* **PDF Version**: Ensure the `PDF` uploaded alongside the `.xfdf` is the exact same original, unredacted `PDF` that the comments were made on in Adobe.
+* **Solution**: After import, always open the generated `review_file.csv` (with the original `PDF`) on the '**Review redactions**' tab to verify and adjust as needed.
+
+#### Q20: The **Textract API** job status table (under "**Submit whole document to AWS Textract API...**") only shows recent jobs, or I can't find an older **Job ID** I submitted.
+A: The table showing **Textract** job statuses might have a limit or only show jobs from the current session or within a certain timeframe (e.g., "up to seven days old" is mentioned).
+* **Solution**:
+ * It's good practice to note down the **Job ID** immediately after submission if you plan to check it much later.
+ * If the `_textract.json` file was successfully created from a previous job, you can re-upload that `.json` file with your original `PDF` to bypass the `API` call and proceed directly to redaction or OCR conversion.
+
+#### Q21: I edited a `...review_file.csv` in Excel (e.g., changed coordinates, labels, colors), but when I upload it to the '**Review redactions**' tab, the boxes are misplaced, the wrong color, or it causes errors.
+A: The `review_file.csv` has specific columns and data formats (e.g., coordinates, `RGB` color tuples like `(0,0,255)`). An illustrative snippet is shown after the points below.
+* **Solution**:
+ * **Coordinates (xmin, ymin, xmax, ymax)**: Ensure these are numeric and make sense for `PDF` coordinates. Drastic incorrect changes can misplace boxes.
+ * **Colors**: Ensure the color column uses the `(R,G,B)` format, e.g., `(0,0,255)` for blue, not hex codes or color names, unless the app specifically handles that (the guide mentions `RGB`).
+ * **CSV Integrity**: Ensure you save the file strictly as a `CSV`. Excel sometimes adds extra formatting or changes delimiters if not saved carefully.
+ * **Column Order**: Do not change the order of columns in the `review_file.csv`.
+ * **Test Small Changes**: Modify one or two rows/values first to see the effect before making bulk changes.
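+
+As a purely illustrative sketch (the column names and ordering below are based only on the fields mentioned in this answer, not on the app's exact export; always keep the exact header row and column order from your own exported file), a well-formed row might look like this:
+
+```csv
+page,label,color,xmin,ymin,xmax,ymax,text
+3,ADDRESS,"(0,0,255)",85.2,412.7,310.5,428.1,10 Example Street
+```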
+
+#### Q22: The cost and time estimation feature isn't showing up, or it's giving unexpected results.
+A: This feature depends on admin configuration and certain conditions.
+* **Solution**:
+ * **Admin Enabled**: Confirm with your system admin that the cost/time estimation feature is enabled in the app's configuration.
+ * **AWS Services**: Estimation is typically most relevant when using **AWS Textract** or **Comprehend**. If you're only using '**Local**' models, the estimation might be simpler or not show **AWS**-related costs.
+ * **Existing Output**: If "**Existing Textract output file found**" is checked (because you uploaded a pre-existing `_textract.json`), the estimated cost and time should be significantly lower for the **Textract** part of the process.
+
+#### Q23: I'm prompted for a "**cost code**," but I don't know what to enter, or my search isn't finding it.
+A: Cost code selection is an optional feature enabled by system admins for tracking **AWS** usage.
+* **Solution**:
+ * **Contact Admin/Team**: If you're unsure which cost code to use, consult your team lead or the system administrator who manages the redaction app. They should provide the correct code or guidance.
+ * **Search Tips**: Try searching by project name, department, or any known identifiers for your cost center. The search might be case-sensitive or require exact phrasing.
+
+#### Q24: I selected "**hash**" as the anonymisation output format for my tabular data, but the output still shows "**REDACTED**" or something else.
+A: Ensure the selection was correctly registered before redacting.
+* **Solution**:
+ * Double-check on the '**Open text or Excel/csv files**' tab, under '**Anonymisation output format**,' that "**hash**" (or your desired format) is indeed selected.
+ * Try re-selecting it and then click '**Redact text/data files**' again.
+ * If the issue persists, it might be a bug or a specific interaction with your data type that prevents hashing. Report this to your app administrator. "**Hash**" should replace PII with a consistent unique `ID` for each unique piece of PII.
+
+#### Q25: I'm using '**CUSTOM_FUZZY**' for my deny list. I have "**Should fuzzy search match on entire phrases in deny list**" checked, but it's still matching individual words within my phrases or matching things I don't expect.
+A: Fuzzy matching on entire phrases can be complex. The "**maximum number of spelling mistakes allowed**" applies to the entire phrase.
+* **Solution**:
+ * **Mistake Count**: If your phrase is long and the allowed mistakes are few, it might not find matches if the errors are distributed. Conversely, too many allowed mistakes on a short phrase can lead to over-matching. Experiment with the mistake count.
+ * **Specificity**: If "**match on entire phrases**" is unchecked, it will fuzzy match each individual word (excluding stop words) in your deny list phrases. This can be very broad. Ensure this option is set according to your needs.
+ * **Test with Simple Phrases**: Try a very simple phrase with a known, small number of errors to see if the core fuzzy logic is working as you expect, then build up complexity.
+
+#### Q26: I "**locked in**" a new redaction box format on the '**Review redactions**' tab (label, colour), but now I want to change it or go back to the pop-up for each new box.
+A: When a format is locked, a new icon (described as looking like a "**gift tag**") appears at the bottom of the document viewer.
+* **Solution**:
+ * Click the "**gift tag**" icon at the bottom of the document viewer pane.
+ * This will allow you to change the default locked format.
+ * To go back to the pop-up appearing for each new box, click the lock icon within that "**gift tag**" menu again to "**unlock**" it (it should turn from blue to its original state).
+
+#### Q27: I clicked "**Redact document**," processing seemed to complete (e.g., progress bar finished, "complete" message shown), but no output files (`...redacted.pdf`, `...review_file.csv`) appeared in the output area.
+A: This could be due to various reasons:
+* **No PII Found**: If absolutely no PII was detected according to your settings (entities, allow/deny lists), the app might not generate a `...redacted.pdf` if there's nothing to redact, though a `review_file.csv` (potentially empty) and `ocr_results.csv` should still ideally appear.
+* **Error During File Generation**: An unhandled error might have occurred silently during the final file creation step.
+* **Browser/UI Issue**: The `UI` might not have refreshed to show the files.
+* **Permissions**: In rare cases, if running locally, there might be file system permission issues preventing the app from writing outputs.
+* **Solution**:
+ * Try refreshing the browser page (if feasible without losing input data, or after re-uploading).
+ * Check the '**Settings**' tab for '**View and download all output files from this session**' (if logged in via Cognito) – they might be listed there.
+ * Try a very simple document with obvious PII and default settings to see if any output is generated.
+ * Check browser developer console (`F12`) for any error messages.
+
+#### Q28: When reviewing, I click on a row in the '**Search suggested redactions**' table. The page changes, but the specific redaction box isn't highlighted, or the view doesn't scroll to it.
+A: The highlighting feature ("should change the colour of redaction box to blue") is an aid.
+* **Solution**:
+ * Ensure you are on the correct page. The table click should take you there.
+ * The highlighting might be subtle or conflict with other `UI` elements. Manually scan the page for the text/label mentioned in the table row.
+ * Scrolling to the exact box isn't explicitly guaranteed, especially on very dense pages. The main function is page navigation.
+
+#### Q29: I rotated a page in the '**Review redactions**' document viewer, and now all subsequent pages are also rotated, or if I navigate away and back, the rotation is lost.
+A: The `README` states: "**When you switch page, the viewer will stay in your selected orientation, so if it looks strange, just rotate the page again and hopefully it will look correct!**"
+* **Solution**:
+ * The rotation is a viewing aid for the current page session in the viewer. It does not permanently alter the original `PDF`.
+ * If subsequent pages appear incorrectly rotated, use the rotation buttons again for that new page.
+ * The rotation state might reset if you reload files or perform certain actions. Simply re-apply rotation as needed for viewing.
+
+## Reviewing redactions
+
+#### How do I manually search for and redact text that the automatic detection missed?
+Use the **'Search text and redact'** tab within the **'Review redactions'** tab (found to the right of the document viewer, next to the 'Apply redactions to PDF' and 'Save changes on current page' buttons). This tab shows all the word-level text extracted from your document and allows you to:
+1. Type a word or phrase in the **'Multi-word text search'** box. Tick **'Enable regex pattern matching'** if you want to use regular expressions. Click **'Search'** (or press Enter).
+2. The table updates to show only matching rows. Click any row to jump to that page in the document viewer.
+3. Choose how to redact:
+ * **'Redact specific text row'** — redacts only the exact instance on the row you clicked.
+ * **'Redact all words with same text as selected row'** — redacts every occurrence of that word/phrase throughout the document.
+ * **'Redact all text in table'** — redacts everything currently shown in the filtered table in one go.
+4. If you make a mistake, click **'Undo latest redaction'** to reverse the last redaction action (one level of undo only).
+5. Before redacting, you can customise the label and colour for new boxes under the **'Search options'** accordion.
+
+#### What is the 'View text' tab on the 'Review redactions' page?
+The **'View text'** tab (below the 'Search text and redact' tab on the **'Review redactions'** page) displays the line-by-line text extracted from the document. This lets you verify the accuracy of the OCR output. You can search the table using the search bar above it, or filter individual columns by clicking the three dots next to a column header. Clicking a row navigates the document viewer to that page. Click **'Reset OCR output table filter'** to clear any active filters. This table is populated automatically after a redaction run, or when you upload an `...ocr_output.csv` file to the second upload box on the **'Review redactions'** tab.
+
+#### After reviewing and modifying redactions, how do I produce the final redacted PDF?
+Once you are happy with all the redactions in the document viewer, click the **'Apply revised redactions to PDF'** button (found above the 'Save changes on current page to file' button on the **'Review redactions'** tab). This generates a new `...redacted.pdf` (with text permanently removed) and an updated `...redactions_for_review.pdf` (with redaction boxes overlaid on the original text). Both files will appear in the output area.
+
+#### Can I remove individual redactions or all instances of a specific text without using the filter?
+Yes. On the **'Review redactions'** tab, to the right of the document viewer, under the **'Modify redactions'** heading:
+* **'Exclude specific redaction row'** — removes only the single redaction from the last row you clicked in the table. The currently selected row is shown below the button.
+* **'Exclude all redactions with the same text as selected row'** — removes every redaction in the document that has exactly the same underlying text as your selected row.
+* **'Exclude all redactions in table'** — removes all redactions currently visible in the table. Always apply a filter first (using the dropdowns or the filter box, then clicking the blue tick icon) before using this option, otherwise all redactions in the document will be removed.
+* After any of these actions, click **'Reset filters'** to return the table to showing all remaining redactions.
+* If you remove redactions by mistake, click **'Undo last element removal'** immediately to restore them (one level of undo only).
+
+## Working with previous redaction results
+
+#### How do I return to a document I previously redacted to add or change redactions?
+You do not need to re-run the full redaction process. Instead:
+1. On the **'Review redactions'** tab, upload the **`...redactions_for_review.pdf`** file (produced during the original redaction run) into the **first upload box (1.)**. This file contains all the previous redaction boxes embedded in it.
+2. Upload the **`...ocr_results_with_words.csv`** file into the **second upload box (2.)** if you want to use the **'Search text and redact'** feature to find additional terms to redact.
+3. The document viewer and redaction table will be populated with the previous redactions, and you can modify them as usual before applying the final redactions to `PDF`.
+
+#### Can I combine redactions from multiple separate redaction runs of the same document?
+Yes. If you have run several redaction tasks on the same document (for example, using different settings each time) and want to merge all the suggested redaction boxes together:
+1. Go to the **'Settings'** tab and find the **'Combine multiple review PDFs or CSV files'** section.
+2. Upload all the `...redactions_for_review.pdf` files you want to merge.
+3. Click **'Combine multiple review PDFs into one'**. A combined file will be produced containing all the redaction boxes from the uploaded files.
+4. Upload this combined file into the **'Review redactions'** tab to inspect, modify, and finalise the merged redactions.
+
+#### How do I skip re-running OCR or AWS Textract when I redact the same document a second time?
+Every time you redact a document, a **`.json`** output file is produced (either `..._textract.json` for AWS Textract, or `...ocr_outputs_with_words.json` for the local OCR model). To skip the text extraction step on future runs:
+1. When uploading your document on the **'Redact PDFs/images'** tab, select **both** the original `PDF` and the `.json` file at the same time from the upload area.
+2. The app will detect the `.json` file and automatically tick the **'Existing Textract output file found'** or **'Existing local OCR output file found'** checkbox, indicating the text extraction step will be skipped.
+This saves time and, where **AWS Textract** is used, avoids incurring the extraction cost again.
+
+## Additional features
+
+#### Can the app summarise documents?
+Yes, if document summarisation is enabled in your deployment, a **'Document summarisation'** tab will be visible. To summarise a document:
+1. Upload one or more `PDF` files, **or** one or more `...ocr_output.csv` files (from a previous redaction run) using the upload boxes on the tab.
+2. Open the **'Summarisation settings'** accordion to choose:
+ * **LLM inference method** — the language model to use.
+ * **Max pages per page-group summary** — how many pages are summarised together at a time.
+ * **Summary format** — **Concise** (key themes) or **Detailed**.
+ * **Additional summary instructions** (optional) — e.g. "Focus on key obligations."
+3. Click **'Generate summary'**. When finished, the summary appears below and summary files are available for download.
+
+#### What does the 'Redact duplicate pages' checkbox do on the 'Redact PDFs/images' tab?
+When this checkbox is ticked (found alongside the PII identification options under **'Redaction settings'**), the app will automatically detect pages with near-identical text within the document and apply whole-page redaction to any duplicates found, as part of the same redaction run. This is a quick way to handle documents that contain repeated pages. For more control over duplicate detection — such as adjusting the similarity threshold, comparing across multiple documents, or finding duplicate lines of text — use the dedicated **'Identify duplicate pages'** tab as described in the advanced user guide.
\ No newline at end of file
diff --git a/src/installation_guide.qmd b/src/installation_guide.qmd
new file mode 100644
index 0000000000000000000000000000000000000000..02b99835b94181b5b9fcbd94a64b8224921eef3c
--- /dev/null
+++ b/src/installation_guide.qmd
@@ -0,0 +1,253 @@
+---
+title: "App installation guide (with CDK or locally on Windows)"
+format:
+ html:
+ toc: true # Enable the table of contents
+ toc-depth: 3 # Include headings up to level 2 (##)
+ toc-title: "On this page" # Optional: Title for your TOC
+---
+
+# Installation with CDK
+
+This guide gives an overview of how to install the app in an AWS environment using the code in the 'cdk/' folder of this GitHub repo. The most important prerequisites are some familiarity with AWS (via the console or the command line) and administrator access to at least one region. Then follow the steps below.
+
+## Prerequisites
+
+* Ensure you have an AWS Administrator account in your desired region to be able to deploy all the resources mentioned in cdk_stack.py.
+* Install git on your computer from: [https://git-scm.com](https://git-scm.com)
+* Install nodejs and npm: [https://docs.npmjs.com/downloading-and-installing-node-js-and-npm](https://docs.npmjs.com/downloading-and-installing-node-js-and-npm). If using Windows, it may be easiest to install from the .msi installer at the bottom of the page [here](https://nodejs.org/en/download/).
+* Install AWS CDK v2: [https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
+* Bootstrap the environment with CDK in both your primary region, and `us-east-1` if installing CloudFront and associated WAF.
+  ```bash
+  # Format: cdk bootstrap aws://<account-id>/<region> (replace <account-id> with your AWS account number)
+  # Bootstrap your primary region
+  cdk bootstrap aws://<account-id>/eu-west-1
+
+  # Bootstrap the us-east-1 region
+  cdk bootstrap aws://<account-id>/us-east-1
+  ```
+* In command line, write:
+ ```bash
+ git clone https://github.com/seanpedrick-case/doc_redaction.git
+ ```
+
+## Note on ACM Certificates
+
+To get full HTTPS data transfer through the app, you will need an SSL certificate registered with AWS Certificate Manager.
+
+You can either use the SSL certificate from a domain, or import an existing certificate into Certificate Manager. If you're not sure, ask your IT admin if you need help with this. If getting an SSL certificate for an existing domain, make sure to point the certificate to the wildcard subdomain of your domain (i.e. `*.<your-domain>`).
+
+Update your DNS records to include the CNAME record given by AWS. After your stack has been created, you will also need to create a CNAME DNS record for your domain pointing to your load balancer DNS with a subdomain, e.g., `redaction.<your-domain>`.
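+
+As an illustration only (all hostnames and values below are hypothetical; use the exact values shown in Certificate Manager and on your load balancer), the two CNAME records might look like this:
+
+```
+# ACM certificate validation record (value provided by AWS Certificate Manager)
+_abc123.example.com.      CNAME   _xyz789.acm-validations.aws.
+
+# App subdomain pointing at the application load balancer DNS name
+redaction.example.com.    CNAME   my-alb-123456789.eu-west-1.elb.amazonaws.com.
+```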
+
+## Steps to install the app using CDK
+
+### 1. Create a Python environment and install the packages from `requirements.txt`.
+
+You need a `cdk.json` in the `cdk` folder. It should contain the following:
+
+```json
+{
+ "app": "/python.exe app.py",
+ "context": {
+ "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true,
+ "@aws-cdk/core:stackRelativeExports": true,
+ "@aws-cdk/aws-rds:lowercaseDbIdentifier": true,
+ "@aws-cdk/aws-lambda:recognizeVersionProps": true,
+ "@aws-cdk/aws-lambda:recognizeLayerVersion": true,
+ "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true,
+ "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true,
+ "@aws-cdk/core:newStyleStackSynthesis": true,
+ "aws-cdk:enableDiffNoFail": true,
+ "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true,
+ "@aws-cdk/aws-apigateway:disableCloudWatchRole": true,
+ "@aws-cdk/core:target-partitions": [
+ "aws",
+ "aws-cn"
+ ]
+ }
+ }
+```
+
+### 2. Create a `cdk_config.env` file in the `config` subfolder.
+
+Depending on which environment variables you put in this file, you can choose whether to install the app in a completely new VPC, or in an existing VPC. The following shows you example config files that you could use.
+
+#### Deploying the app in a brand new VPC
+
+As a minimum, it is useful to put the following details in the `cdk_config.env` file (the values below are all examples; other possible variables can be found in `cdk_config.py` in the `cdk` folder).
+
+```ini
+CDK_PREFIX=example-prefix # This prefix will be added to the name of most of the created elements in your stack
+NEW_VPC_CIDR=10.0.0.0/24 # The CIDR range for your newly created VPC
+AWS_REGION= # Region where elements will be created
+AWS_ACCOUNT_ID=1234567890 # AWS account ID that has administrator access that you will use for deploying the stack
+CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
+CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
+COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
+COGNITO_AUTH=0 # Do you want to do in-app authentication (username and password only, not necessary if you are using an SSL certificate as recommended below)
+USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
+RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
+CLOUDFRONT_DOMAIN=.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
+# If you are using an SSL certificate with your ALB (highly recommended):
+ACM_SSL_CERTIFICATE_ARN= # This is the ARN of the SSL certificate that you have installed in AWS Certificate Manager
+SSL_CERTIFICATE_DOMAIN=redaction.example.com # This is the domain of the SSL certificate that you have installed in AWS Certificate Manager
+```
+
+**Note: If you are using an SSL certificate with Cognito login on the application load balancer (strongly advised), you can set COGNITO_AUTH to 0 above, as you don't need the second login step to get to the app**
+
+#### In an existing VPC
+
+From the above example, remove the variable 'NEW_VPC_CIDR' and replace with the below:
+
+```ini
+VPC_NAME=example-vpc-name # Name of the VPC within which all the other elements will be created
+EXISTING_IGW_ID=igw-1234567890 # (optional) The ID for an existing internet gateway that you want to use instead of creating a new one
+SINGLE_NAT_GATEWAY_ID=nat-123456789 # (optional) The ID for an existing NAT gateway that you want to use instead of creating a new one
+```
+##### Subnets
+
+If you are using an existing VPC then you may want to deploy the app within existing subnets rather than creating new ones:
+
+* If you define no subnets in environment variables, the app will try to use existing private and public subnets. Bear in mind the app may overlap with IP addresses assigned to existing AWS resources. It is advised to at least specify existing subnets that you know are available, or create your own using one of the below methods.
+
+* If you want to use existing subnets, you can list them in the following environment variables:
+```ini
+PUBLIC_SUBNETS_TO_USE=["PublicSubnet1", "PublicSubnet2", "PublicSubnet3"]
+PRIVATE_SUBNETS_TO_USE=["PrivateSubnet1", "PrivateSubnet2", "PrivateSubnet3"]
+```
+
+* If you want to create new subnets, you need to also specify CIDR blocks and availability zones for the new subnets. The app will check with you upon deployment whether these CIDR blocks are available before trying to create.
+
+```ini
+PUBLIC_SUBNET_CIDR_BLOCKS=['10.222.33.0/28', '10.222.33.16/28', '10.222.33.32/28']
+PUBLIC_SUBNET_AVAILABILITY_ZONES=['eu-west-1a', 'eu-west-1b', 'eu-west-1c']
+PRIVATE_SUBNET_CIDR_BLOCKS=['10.222.33.48/28', '10.222.33.64/28', '10.222.33.80/28']
+PRIVATE_SUBNET_AVAILABILITY_ZONES=['eu-west-1a', 'eu-west-1b', 'eu-west-1c']
+```
+
+If you try to create subnets in invalid CIDR blocks / availability zones, the console output will tell you and it will show you the currently occupied CIDR blocks to help find a space for new subnets you want to create.
+
+### 3. Deploy your AWS stack using cdk deploy --all
+
+In a command-line console, go to the `cdk` folder in the redaction app folder and run `cdk deploy --all`. This should deploy the stacks defined in the `app.py` file, starting with the first.
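+
+For example, assuming you cloned the repository into your current directory as in the earlier `git clone` step:
+
+```bash
+cd doc_redaction/cdk
+cdk deploy --all
+```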
+
+Hopefully everything will deploy successfully and you will be able to see your new stack in CloudFormation in the AWS console.
+
+### 4. Tasks for after CDK deployment
+
+The CDK deployment will create all the AWS resources needed to run the redaction app. However, there are some tasks in AWS that still need to be carried out manually after deployment, as described below.
+
+#### Run `post_cdk_build_quickstart.py`
+
+The following tasks are done by the `post_cdk_build_quickstart.py` file that you can find in the `cdk` folder. You will need to run this when logged in with AWS SSO through command line. I will describe how to do this in AWS console just in case the `.py` file doesn't work for you.
+
+##### Codebuild
+
+You need to run the CodeBuild project after your CDK stack has finished deploying, as there will be no container in ECR yet.
+
+If you don't want to run the `post_cdk_build_quickstart.py` file, go to CodeBuild in the console -> your project -> click Start build. Check the logs; the build should complete in about 6-7 minutes.
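+
+If you prefer the command line, the same build can be started with the AWS CLI (the project name below is a placeholder; use the name created by your stack):
+
+```bash
+# Start a build of the container image
+aws codebuild start-build --project-name <your-codebuild-project-name>
+
+# Check the status of the build using the build ID returned above
+aws codebuild batch-get-builds --ids <build-id>
+```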
+
+##### Create a `config.env` file and upload to S3
+
+The 'post_cdk_build_quickstart' file will upload a config file to S3, as the Fargate task definition references a `config.env` file.
+
+If you want to do this manually:
+
+Create a `config.env` file to upload to the S3 bucket that has at least the following variables:
+
+```ini
+COGNITO_AUTH=0 # If you are using an SSL certificate with your application load balancer, you will be logging in there. Set this to 0 to turn off the default login screen.
+RUN_AWS_FUNCTIONS=1 # This will enable the app to communicate with AWS services.
+SESSION_OUTPUT_FOLDER=True # This will put outputs for each user in separate output folders.
+```
+
+* Then, go to S3 and choose the new `...-logs` bucket that you created. Upload the `config.env` file into this bucket.
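+
+Alternatively, the upload can be done with the AWS CLI (the bucket name below is a placeholder; use the `...-logs` bucket created by your stack):
+
+```bash
+aws s3 cp config.env s3://<your-prefix>-logs/config.env
+```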
+
+##### Update Elastic Container Service
+
+Now that the app container is in Elastic Container Registry, you can proceed to run the app on a Fargate server.
+The 'post_cdk_build_quickstart.py' file will do this for you, but you can also try this in Console. In ECS, go to your new cluster, your new service, and select 'Update service'.
+
+Select 'Force new deployment', and then set 'Desired number of tasks' to 1.
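+
+As a command-line alternative to the console steps above (cluster and service names below are placeholders; use the ones created by your stack):
+
+```bash
+# Force a new deployment using the freshly built image and start one task
+aws ecs update-service \
+  --cluster <your-cluster-name> \
+  --service <your-service-name> \
+  --desired-count 1 \
+  --force-new-deployment
+```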
+
+## Additional Manual Tasks
+
+### Update DNS records for your domain (If using a domain for the SSL certificate)
+
+If the SSL certificate you are using is associated with a domain, you will need to update the DNS records for the domain registered with the AWS SSL certificate. To do this, create a CNAME DNS record for a subdomain of your main domain registration pointing to your load balancer DNS, e.g., `redaction.<your-domain>`.
+
+### Create a user in Cognito
+
+You will next need to create a user in Cognito to be able to log into the app.
+
+* Go to Cognito and create a user with your own email address. Generate a password.
+* Go to Cognito -> App clients -> Login pages -> View login page.
+* Enter the email and temporary password details that come in the email (don't include the last full stop!).
+* Change your password on the screen that pops up. You should now be able to login to the app.
+
+### Set Multi-Factor Authentication for Cognito logins (optional but recommended)
+On the Cognito user pool page you can also enable MFA, if you are using an SSL certificate with Cognito login on the Application Load Balancer. Go to Cognito -> your user pool -> Sign in -> Multi-factor authentication.
+
+### Create CloudFront distribution
+**Note: this is only relevant if you set `RUN_USEAST_STACK` to 'False' during CDK deployment**
+
+If you were not able to create a CloudFront distribution via CDK, you should be able to do it through the console. I would advise using CloudFront as the front end to the app.
+
+Create a new CloudFront distribution.
+
+* **If you have used an SSL certificate in your CDK code:**
+ * **For Origin:**
+ * Choose the domain name associated with the certificate as the origin.
+ * Choose HTTPS only as the protocol.
+ * Keep everything else default.
+ * **For Behavior (modify default behavior):**
+ * Under Viewer protocol policy choose 'Redirect HTTP to HTTPS'.
+
+* **If you have not used an SSL certificate in your CDK code:**
+ * **For Origin:**
+ * Choose your elastic load balancer as the origin. This will fill in the elastic load balancer DNS.
+ * Choose HTTP only as the protocol.
+ * Keep everything else default.
+ * **For Behavior (modify default behavior):**
+ * Under Viewer protocol policy choose 'HTTP and HTTPS'.
+
+#### Security features
+
+You can add security features to your CloudFront distribution (recommended). If you use WAF, you will also need to change the default settings to allow for file upload to the app.
+
+* In your CloudFront distribution, under 'Security' -> Edit -> Enable security protections.
+* Choose rate limiting (default is fine). Then click Create.
+* In CloudFront geographic restrictions -> Countries -> choose an Allow list of countries.
+* Click again on Edit.
+* Under 'AWS WAF protection enabled' you should see a link titled 'View details of your configuration'.
+* Go to Rules -> `AWS-AWSManagedRulesCommonRuleSet`, click Edit.
+* Under `SizeRestrictions_BODY` choose rule action override 'Override to Allow'. This is needed to allow for file upload to the app.
+
+### Change Cognito redirection URL to your CloudFront distribution
+
+Go to Cognito -> your user pool -> App Clients -> Login pages -> Managed login configuration.
+
+Ensure that the callback URL is:
+* If not using an SSL certificate and Cognito login - `https://<your CloudFront distribution domain>`
+* If using an SSL certificate, you should have three:
+ * `https://<your app domain>`
+ * `https://<your app domain>/oauth2/idpresponse`
+ * `https://<your app domain>/oauth/idpresponse`
+
+### Force traffic to come from specific CloudFront distribution (optional)
+
+Note that this only potentially helps with security if you are not using an SSL certificate with Cognito login on your application load balancer.
+
+Go to EC2 - Load Balancers -> Your load balancer -> Listeners -> Your listener -> Add rule.
+
+* Add Condition -> Host header.
+* Change Host header value to your CloudFront distribution without the `https://` or `http://` at the front.
+* Forward to redaction target group.
+* Turn on group stickiness for 12 hours.
+* Next.
+* Choose priority 1.
+
+Then, change the default listener rule.
+
+* Under Routing action change to 'Return fixed response'.
+
+You should now have successfully installed the document redaction app in an AWS environment using CDK.
diff --git a/src/management_guide.qmd b/src/management_guide.qmd
new file mode 100644
index 0000000000000000000000000000000000000000..3936145792185ac10ac96e46b6bbb511441fdbda
--- /dev/null
+++ b/src/management_guide.qmd
@@ -0,0 +1,226 @@
+---
+title: "User and AWS instance management guide"
+format:
+ html:
+ toc: true # Enable the table of contents
+ toc-depth: 3 # Include headings up to level 2 (##)
+ toc-title: "On this page" # Optional: Title for your TOC
+---
+
+This guide gives an overview of how to manage users of the redaction app, and how to start, stop, and manage instances of the app running on AWS Cloud.
+
+# User management guide
+
+This guide provides an overview for administrators to manage users within an AWS Cognito User Pool, specifically for an application utilising phone-app-based Two-Factor Authentication (2FA).
+
+## Managing Users in AWS Cognito User Pools
+
+AWS Cognito User Pools provide a secure and scalable user directory for your applications. This guide focuses on common administrative tasks within the AWS Management Console.
+
+### Accessing Your User Pool
+
+1. Log in to the AWS Management Console.
+2. Navigate to **Cognito** (you can use the search bar).
+3. In the left navigation pane, select **User Pools**.
+4. Click on the name of the user pool associated with your redaction app.
+
+### Creating Users
+
+Creating a new user in Cognito involves setting their initial credentials and attributes. (A command-line alternative is sketched after the steps below.)
+
+1. From your User Pool's dashboard, click on the **Users** tab.
+2. Click the **Create user** button.
+3. **Username:** Enter a unique username for the user. This is what they will use to log in.
+4. **Temporary password:**
+ * Select **Generate a password** to have Cognito create a strong, temporary password.
+ * Alternatively, you can choose **Set a password** and enter one manually. If you do this, ensure it meets the password policy configured for your user pool.
+ * **Important:** Cognito will typically require users to change this temporary password upon their first login.
+5. **Email:** Enter the user's email address. This is crucial for communication and potentially for password recovery if configured.
+6. **Phone number (optional):** The phone number is not needed for login or user management in this app, so you can leave this blank.
+7. **Mark email as verified/Mark phone number as verified:** For new users, you can choose to automatically verify their email and/or phone number. If unchecked, the user might need to verify these themselves during the signup process (depending on your User Pool's verification settings).
+8. **Groups (optional):** If you have defined groups in your user pool, you can add the user to relevant groups here. Groups are useful for managing permissions and access control within your application.
+9. Click **Create user**.
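+
+If you prefer the command line, a broadly equivalent user can be created with the AWS CLI (the pool ID, username, and email below are placeholders):
+
+```bash
+# Create a user; Cognito generates a temporary password and emails it to the user
+aws cognito-idp admin-create-user \
+  --user-pool-id <your-user-pool-id> \
+  --username jane.doe \
+  --user-attributes Name=email,Value=jane.doe@example.com Name=email_verified,Value=true \
+  --desired-delivery-mediums EMAIL
+```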
+
+### Information to Give to Users to Sign Up
+
+Once a user is created, they'll need specific information to access the application.
+
+* **Application URL:** The web address of your redaction app's login page.
+* **Username:** The username you created for them in Cognito.
+* **Temporary Password:** The temporary password you generated or set.
+* **Instructions for First Login:**
+ * "Upon your first login, you will be prompted to change your temporary password to a new, secure password."
+ * "You will also need to set up Two-Factor Authentication using a phone authenticator app (e.g., Google Authenticator, Authy)."
+
+### Resetting User Access (Password Reset)
+
+If a user forgets their password or needs their access reset, you can do this in the console (a command-line alternative is sketched after the steps below).
+
+1. From your User Pool's dashboard, click on the **Users** tab.
+2. Locate the user you wish to reset. You can use the search bar.
+3. Click on the user's username.
+4. On the user details page, click the **Reset password** button.
+5. Cognito will generate a new temporary password and mark the user to change it on next login.
+6. **Important:** You will need to communicate this new temporary password to the user securely.
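+
+One command-line route to a similar result (placeholders as above) is to set a new temporary password directly; the user will be required to change it at their next login:
+
+```bash
+# Set a temporary password for the user (must meet the pool's password policy)
+aws cognito-idp admin-set-user-password \
+  --user-pool-id <your-user-pool-id> \
+  --username jane.doe \
+  --password '<new-temporary-password>' \
+  --no-permanent
+```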
+
+### Two-Factor Authentication (2FA) with Apps Only
+
+Your application uses phone app-based 2FA. This section covers what administrators need to know.
+
+#### How it Works for the User
+
+When a user logs in for the first time or when 2FA is enabled for their account, they will be prompted to set up 2FA. This typically involves:
+
+1. **Scanning a QR Code:** The application will display a QR code.
+2. **Using an Authenticator App:** The user opens their authenticator app (e.g., Google Authenticator, Authy, Microsoft Authenticator) and scans the QR code.
+3. **Entering a Code:** The authenticator app will generate a time-based one-time password (TOTP). The user enters this code into the application to verify the setup.
+
+#### Administrator's Role in 2FA
+
+As an administrator, you generally don't directly "set up" the user's 2FA device in the console. The user performs this self-enrollment process within the application. However, you can manage the 2FA status of a user:
+
+1. **Enabling/Disabling 2FA for a User:**
+ * From your User Pool's dashboard, click on the **Users** tab.
+ * Click on the user's username.
+ * Under the "Multi-factor authentication (MFA)" section, you'll see the current MFA status.
+ * If 2FA is not enabled, you might have the option to "Enable MFA" for the user. If your user pool requires 2FA, it might be automatically enabled upon signup.
+ * You can also **Disable MFA** for a user if necessary. This will remove their registered 2FA device and they will no longer be prompted for a 2FA code during login until they re-enroll.
+2. **Removing a User's 2FA Device:** If a user loses their phone or needs to re-configure 2FA, you can remove their existing MFA device.
+ * On the user's details page, under the "Multi-factor authentication (MFA)" section, you will see a list of registered MFA devices (if any).
+ * Select the device and click **Remove**.
+ * The next time the user logs in, they will be prompted to set up 2FA again.
+
+### Other Useful Information for Administrators
+
+* **User Status:** In the "Users" tab, you'll see the status of each user (e.g., `CONFIRMED`, `UNCONFIRMED`, `FORCE_CHANGE_PASSWORD`, `ARCHIVED`, `COMPROMISED`).
+ * `CONFIRMED`: User has confirmed their account and set their password.
+ * `UNCONFIRMED`: User has been created but hasn't confirmed their account (e.g., through email verification) or changed their temporary password.
+ * `FORCE_CHANGE_PASSWORD`: User must change their password on next login.
+* **Searching and Filtering Users:** The "Users" tab provides search and filtering options to quickly find specific users or groups of users.
+* **User Attributes:** You can view and sometimes edit user attributes (like email, phone number, custom attributes) on the user's detail page.
+* **Groups:**
+ * You can create and manage groups under the **Groups** tab of your User Pool.
+ * Groups are useful for organising users and applying different permissions or configurations through AWS Identity and Access Management (IAM) roles.
+* **User Pool Settings:**
+ * Explore the various settings under the **User Pool Properties** tab (e.g., Policies, MFA and verifications, Message customisations).
+ * **Policies:** Define password complexity requirements.
+ * **MFA and verifications:** Configure whether MFA is optional, required, or disabled, and the types of MFA allowed (SMS, TOTP). Ensure "Authenticator apps" is enabled for your setup.
+ * **Message customisations:** Customise the email and SMS messages sent by Cognito (e.g., for verification codes, password resets).
+* **Monitoring and Logging:**
+ * Integrate your Cognito User Pool with AWS CloudWatch to monitor user activities and potential issues.
+ * Enable CloudTrail logging for Cognito to track API calls and administrative actions.
+* **Security Best Practices:**
+ * Always use strong, unique passwords for your AWS console login.
+ * Enable MFA for your AWS console login.
+ * Regularly review user access and permissions.
+ * Educate users on strong password practices and the importance of 2FA.
+
+By understanding these features and following best practices, administrators can effectively manage users within their AWS Cognito User Pool, ensuring secure and smooth operation of their redaction application.
+
+# Guide to running app instances on AWS
+
+This guide provides basic instructions for administrators to manage service tasks within AWS Elastic Container Service (ECS) using the AWS Management Console, focusing on scaling services on and off and forcing redeployments.
+
+## Basic Service Task Management in AWS ECS Console
+
+AWS Elastic Container Service (ECS) allows you to run, stop, and manage Docker containers on a cluster. This guide focuses on managing your ECS *services*, which maintain a desired number of tasks (container instances).
+
+### Accessing Your ECS Cluster and Services
+
+1. Log in to the AWS Management Console.
+2. Navigate to **ECS (Elastic Container Service)** (you can use the search bar).
+3. In the left navigation pane, select **Clusters**.
+4. Click on the name of the ECS cluster where your redaction app's service is running.
+
+### Understanding Services and Tasks
+
+Before we dive into management, let's clarify key concepts:
+
+* **Task Definition:** A blueprint for your application. It specifies the Docker image, CPU, memory, environment variables, port mappings, and other configurations for your containers.
+* **Task:** An actual running instance of a task definition. It's an individual container or a set of tightly coupled containers running together.
+* **Service:** A mechanism that allows you to run and maintain a specified number of identical tasks simultaneously in an ECS cluster. The service ensures that if a task fails or stops, it's replaced. It also handles load balancing and scaling.
+
+### Setting the Number of Running Tasks to 0 (Turning Everything Off)
+
+Setting the desired number of tasks to 0 for a service effectively "turns off" your application by stopping all its running containers. (A command-line equivalent is sketched at the end of this subsection.)
+
+1. From your Cluster's dashboard, click on the **Services** tab.
+2. Locate the service associated with your redaction app (e.g., `redaction-app-service`).
+3. Select the service by checking the box next to its name.
+4. Click the **Update** button.
+5. On the "Configure service" page, find the **Number of tasks** field.
+6. Change the value in this field to `0`.
+7. Scroll to the bottom and click **Update service**.
+
+**What happens next:**
+
+* ECS will begin terminating all running tasks associated with that service.
+* The "Running tasks" count for your service will gradually decrease to 0.
+* Your application will become inaccessible as its containers are stopped.
+
+**Important Considerations:**
+
+* **Cost Savings:** Setting tasks to 0 can save costs by stopping the consumption of compute resources (CPU, memory) for your containers.
+* **Associated Resources:** This action *only* stops the ECS tasks. It does not stop underlying EC2 instances (if using EC2 launch type), associated databases, load balancers, or other AWS resources. You'll need to manage those separately if you want to completely shut down your environment.
+* **Container Images:** Your Docker images will still reside in Amazon ECR (or wherever you store them).
+* **Downtime:** This action will cause immediate downtime for your application.
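+
+If you prefer the command line, the same change can be made with the AWS CLI (cluster and service names below are placeholders):
+
+```bash
+# Scale the service down to zero running tasks
+aws ecs update-service \
+  --cluster <your-cluster-name> \
+  --service <your-service-name> \
+  --desired-count 0
+
+# To bring the app back online later, run the same command with --desired-count 1 (or more)
+```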
+
+### Turning the Desired Number of Tasks On
+
+To bring your application back online, you'll set the desired number of tasks to your operational value (usually 1 or more).
+
+1. From your Cluster's dashboard, click on the **Services** tab.
+2. Locate the service associated with your redaction app.
+3. Select the service by checking the box next to its name.
+4. Click the **Update** button.
+5. On the "Configure service" page, find the **Number of tasks** field.
+6. Change the value in this field to your desired number of running tasks (e.g., `1`, `2`, etc.).
+7. Scroll to the bottom and click **Update service**.
+
+**What happens next:**
+
+* ECS will begin launching new tasks based on your service's configuration and task definition.
+* The "Running tasks" count will increase until it reaches your desired number.
+* Once tasks are running and healthy (according to your health checks), your application should become accessible again.
+
+**Important Considerations:**
+
+* **Startup Time:** Allow some time for tasks to pull images, start containers, and pass health checks before your application is fully available.
+* **Resource Availability:** Ensure your ECS cluster has sufficient available resources (EC2 instances or Fargate capacity) to launch the desired number of tasks.
+
+### Forcing Redeployment
+
+Forcing a redeployment is useful when you've updated your task definition (e.g., pushed a new Docker image, changed environment variables) but the service hasn't automatically picked up the new version. It's also useful for "restarting" a service.
+
+1. From your Cluster's dashboard, click on the **Services** tab.
+2. Locate the service you want to redeploy.
+3. Select the service by checking the box next to its name.
+4. Click the **Update** button.
+5. On the "Configure service" page, scroll down to the **Deployment options** section.
+6. Check the box next to **Force new deployment**.
+7. Scroll to the bottom and click **Update service**.
+
+**What happens next:**
+
+* ECS will initiate a new deployment for your service.
+* It will launch new tasks using the *latest active task definition revision* associated with your service.
+* Existing tasks will be drained and terminated according to your service's deployment configuration (e.g., `minimum healthy percent`, `maximum percent`).
+* This process effectively replaces all running tasks with fresh instances.
+
+**Important Considerations:**
+
+* **Latest Task Definition:** Ensure you have activated the correct and latest task definition revision before forcing a new deployment if your intention is to deploy new code. You can update the task definition used by a service via the "Update" service flow.
+* **Downtime (minimal if configured correctly):** If your service has a properly configured load balancer and healthy deployment settings (e.g., blue/green or rolling updates), forced redeployments should result in minimal to no downtime. ECS will bring up new tasks before shutting down old ones.
+* **Troubleshooting:** If a deployment gets stuck or tasks fail to start, check the "Events" tab of your service for error messages. Also, check the CloudWatch logs for your tasks.
+
+### Other Useful Information for Administrators
+
+* **Service Events:** On your service's detail page, click the **Events** tab. This provides a chronological log of actions taken by the ECS service, such as task launches, stops, and scaling events. This is invaluable for troubleshooting.
+* **Tasks Tab:** On your service's detail page, click the **Tasks** tab to see a list of all individual tasks running (or recently stopped) for that service. You can click on individual tasks to view their details, including logs, network configuration, and CPU/memory utilisation.
+* **Logs:** For each task, you can often find a link to its CloudWatch Logs under the "Logs" section of the task details. This is critical for debugging application errors.
+* **Metrics:** The **Metrics** tab on your service provides graphs for CPU utilisation, memory utilisation, and the number of running tasks, helping you monitor your service's performance.
+* **Deployment Configuration:** When updating a service, review the **Deployment options** section. This allows you to control how new deployments are rolled out (e.g., minimum healthy percent, maximum percent). Proper configuration here ensures minimal impact during updates.
+* **Auto Scaling (beyond basic management):** For dynamic scaling based on demand, explore **Service Auto Scaling**. This allows ECS to automatically adjust the desired number of tasks up or down based on metrics like CPU utilisation or request count.
+* **Task Definitions:** Before updating a service, you might need to create a new revision of your task definition if you're deploying new code or configuration changes to your containers. You can find Task Definitions in the left navigation pane under ECS.
+
+By mastering these basic service management operations in the AWS Console, administrators can effectively control the lifecycle of their ECS-based applications.
\ No newline at end of file
diff --git a/src/ocr_and_redaction_with_qwen35.qmd b/src/ocr_and_redaction_with_qwen35.qmd
new file mode 100644
index 0000000000000000000000000000000000000000..2cb78c9832b192620bd0372cc4d9b2af2556f4c8
--- /dev/null
+++ b/src/ocr_and_redaction_with_qwen35.qmd
@@ -0,0 +1,251 @@
+---
+title: "OCR and redaction with Qwen 3.5"
+format:
+ html:
+ toc: true # Enable the table of contents
+ toc-depth: 3 # Include headings up to level 3 (##)
+ toc-title: "On this page" # Optional: Title for your TOC
+---
+
+# Overview
+
+The 'doc_redaction' project is an open source GUI and CLI application for OCR and redaction tasks for PDF documents, images, and tabular data. The code for the application can be found [here](https://github.com/seanpedrick-case/doc_redaction){target="_blank"}. Initially the app was based on 'traditional' OCR such as Tesseract and PaddleOCR, with spaCy for entity recognition, alongside API calls to AWS services such as Textract and Comprehend.
+
+Recently the app has incorporated local vision language models (VLMs) for OCR and redaction tasks with on-device deployment. The particular difficulty of the document redaction task compared to most OCR uses is that the specific bounding box location of words is essential for successful redaction, meaning that until now, VLMs have struggled to be useful in this context. Improvements in the past year in both closed and open source VLM models have changed the situation.
+
+In February 2026, I wrote an article looking into the use of VLMs for OCR and redaction tasks in documents. The original article, using Qwen 3 VL 8B Instruct, can be found [here](https://seanpedrick-case.github.io/doc_redaction/src/redaction_with_vlm_and_llms.html){target="_blank"}. In late February 2026, Qwen 3.5 was released and I wanted to see how it compared to Qwen 3 for OCR/redaction tasks. Here I will test Qwen 3.5 for OCR/redaction on three 'difficult' tasks with an updated version of the app (version 2.0.1, which you can test out for yourself [here](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}).
+
+We will be trying the models with the following tasks:
+
+- OCR of a difficult handwritten note
+- Face detection (with bounding box locations) on a document page
+- Custom entity identification on open text (using LLMs)
+
+My conclusion from the previous article was that PaddleOCR for initial OCR, paired with Qwen 3 VL 8B Instruct for low confidence phrases was the best solution for OCR of 'difficult' pages in documents (e.g. with difficult handwriting). In this post, I will use the following models (along with instructions on how you can try them yourself):
+
+- Qwen 3.5 9B, 4-bit quantised, deployed via vLLM.
+- Qwen 3.5 35B A3B, 4-bit quantised, deployed via llama.cpp.
+- Qwen 3.5 27B, 4-bit quantised, hosted on the Document Redaction VLM space on Hugging Face [here](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}. You can try this model directly with the examples below at this URL without deploying your own server.
+
+
+### Deploying Qwen 3.5 35B A3B 4 bit quantised (llama.cpp)
+* For llama.cpp, use the Docker Compose file in the main doc_redaction repository [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml){target="_blank"}. Use the command `docker compose -f docker-compose_llama.yml --profile 27b up -d`. Recommended VRAM is 24 GB, but this can be adjusted downwards with the `--n-gpu-layers` or `--n-cpu-moe` parameters (as relevant) in the Docker Compose file. See the [Unsloth guide to Qwen 3.5 deployment](https://unsloth.ai/docs/models/qwen3.5){target="_blank"} for more details.
+
+### Deploying Qwen 3.5 9B 4 bit quantised (vLLM)
+Deploy with the Docker Compose file [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml){target="_blank"}. Use the command `docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d`, which needs around 16-17 GB of VRAM. Further instructions for using vLLM can be found [here](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html){target="_blank"}. For lower VRAM usage, you can also deploy the 9B model using an Unsloth GGUF file, similar to the approach for the 35B model above.
+
+## Example 1: Difficult handwritten note
+
+__To try the Qwen 3.5 27B model with this example, at the [Hugging Face space](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}, click the third example name, 'Unclear text on handwritten note', change the local OCR model option below to 'vlm'. Then click on 'Redact document'.__
+
+The first example is a note with handwriting that is hard to decipher even for a person.
+
+[![Difficult handwritten note](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/Difficult%20handwritten%20note.jpg)](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/Difficult%20handwritten%20note.jpg){target="_blank"}
+
+### Qwen 3 VL 8B Instruct
+
+#### Qwen 3 VL 8B Instruct alone
+As a reminder, Qwen 3 VL 8B Instruct alone found this:
+
+[![Qwen 3 VL 8B Instruct OCR visualisation of the handwritten note](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/3.c%20unclear_text_handwritten_note_vlm/Difficult%20handwritten%20note_page_1_vlm_visualisations.jpg)](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/3.c%20unclear_text_handwritten_note_vlm/Difficult%20handwritten%20note_page_1_vlm_visualisations.jpg){target="_blank"}
+
+**Rating: 7/10** - the text identification is pretty good, but the model ignored the two text lines on the left (7/10). Bounding boxes are generally accurate, but some do not cover the entire line (7/10).
+
+#### Hybrid PaddleOCR + Qwen 3 VL
+
+A hybrid approach of PaddleOCR + Qwen 3 VL 8B Instruct found this:
+
+[![Hybrid PaddleOCR + Qwen 3 VL 8B Instruct visualisation](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/3.b%20unclear_text_handwritten_note_paddle_hyb/Difficult%20handwritten%20note_page_1_hybrid_paddle_vlm_visualisations.jpg)](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/3.b%20unclear_text_handwritten_note_paddle_hyb/Difficult%20handwritten%20note_page_1_hybrid_paddle_vlm_visualisations.jpg){target="_blank"}
+
+**Rating: 7.75/10** - the text identification is a bit worse than with the VLM model alone (7.5/10). This is offset by the fact that, thanks to the hybrid approach, all the bounding boxes have been identified and the text is generally placed correctly (8/10).
+
+My conclusion from the last article was that PaddleOCR for initial OCR, paired with Qwen 3 VL 8B Instruct for low confidence phrases was the best solution for 'difficult' pages in documents. This was mainly due to the 'laziness' of the VLM model to identify text in the document - note that Qwen 3 VL 8B Instruct ignored the two text lines on the left. On pages with lots of text, I find that this pattern is repeated - the VLM will tend to miss some text.
+
+### Qwen 3.5 9B (vLLM)
+
+#### VLM alone
+
+[![Qwen 3.5 9B (vLLM) visualisation](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Handwritten%20note/Qwen%203.5%209B%20inf/direct/Difficult%20handwritten%20note_page_1_inference_server_visualisations.jpg)](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Handwritten%20note/Qwen%203.5%209B%20inf/direct/Difficult%20handwritten%20note_page_1_inference_server_visualisations.jpg){target="_blank"}
+
+**Rating: 8/10** - the text identification is generally good, and it has included the two text lines on the left that were missed by Qwen 3 VL 8B Instruct, although the text is not fully correct (8/10). For bounding boxes, it did miss one text box in the middle of the page (8/10).
+
+#### Hybrid PaddleOCR + Qwen 3.5 9B
+
+[![Hybrid PaddleOCR + Qwen 3.5 9B visualisation](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Handwritten%20note/Qwen%203.5%209B%20inf/hybrid/Difficult%20handwritten%20note_page_1_hybrid_paddle_inference_server_visualisations.jpg)](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Handwritten%20note/Qwen%203.5%209B%20inf/hybrid/Difficult%20handwritten%20note_page_1_hybrid_paddle_inference_server_visualisations.jpg){target="_blank"}
+
+**Rating: 7.75/10** - The bounding boxes are all located correctly (8/10), but the text identification is not that great - particularly noting the Cyrillic characters identified near the bottom of the page (7.5/10).
+
+### Qwen 3.5 35B A3B (llama.cpp)
+
+#### VLM alone
+
+[![Qwen 3.5 35B A3B visualisation](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Handwritten%20note/Qwen%203.5%2035B%20inf/direct/Difficult%20handwritten%20note_page_1_inference_server_visualisations.jpg)](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Handwritten%20note/Qwen%203.5%2035B%20inf/direct/Difficult%20handwritten%20note_page_1_inference_server_visualisations.jpg){target="_blank"}
+
+**Rating: 8/10** - the text identification is generally good, but I would say it is slightly worse than the 9B model (8/10). No text boxes have been missed, however some boxes seem too large and overlap their neighbours (8/10).
+
+#### Hybrid PaddleOCR + Qwen 3.5 35B A3B
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Handwritten%20note/Qwen%203.5%2035B%20inf/hybrid/Difficult%20handwritten%20note_page_1_hybrid_paddle_inference_server_visualisations.jpg){target="_blank"}
+
+**Rating: 8.25/10** - The text identification is slightly better than for the 9B model in the hybrid approach, but still not perfect (particularly near the bottom) (8/10). The improvement in text identification also results in a better word match to bounding boxes (8.5/10).
+
+### Qwen 3.5 27B (HF space)
+
+__Qwen 3.5 is served on the Hugging Face space [here](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}. Click the third example name, 'Unclear text on handwritten note', change the local OCR model option below to 'vlm'. Then click on 'Redact document'.__
+
+#### VLM alone
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Handwritten%20note/Qwen%203.5%2027B%20bnb/direct/Difficult%20handwritten%20note_page_1_vlm_visualisations.jpg){target="_blank"}
+
+**Rating: 8.5/10** - The output is generally very good. This time, the VLM model has identified all the text in the document (8.5/10). The positions of the bounding boxes are also generally correct, with less overlap between lines than seen with the 35B A3B model (8.5/10). I have not given a higher score because, with other quants (namely the llama.cpp deployment of the model that you can run [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml){target="_blank"}), the model does seem to miss a line or two of text. The unreliability/laziness of the model is still an issue.
+
+#### Hybrid PaddleOCR + Qwen 3.5 27B
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Handwritten%20note/Qwen%203.5%2027B%20bnb/hybrid/Difficult%20handwritten%20note_page_1_hybrid_paddle_vlm_visualisations.jpg){target="_blank"}
+
+**Rating: 7.75/10** - The text identification is noticeably worse than using the VLM alone (Cyrillic characters identified near the bottom) (7.5/10). The bounding boxes are also generally worse, with more overlap between lines than seen with the VLM alone, and some very small word boxes are present (8/10).
+
+### Conclusion
+
+| Model | Text Identification | Bounding Boxes | Overall rating |
+|-------|---------------------|----------------|----------------|
+| Qwen 3 VL 8B Instruct | 7/10 | 7/10 | 7/10 |
+| Hybrid PaddleOCR + Qwen 3 VL | 7.5/10 | 8/10 | 7.75/10 |
+| Qwen 3.5 9B | 8/10 | 8/10 | 8/10 |
+| Hybrid PaddleOCR + Qwen 3.5 9B | 7.5/10 | 8/10 | 7.75/10 |
+| Qwen 3.5 35B A3B | 8/10 | 8/10 | 8/10 |
+| Hybrid PaddleOCR + Qwen 3.5 35B A3B | 8/10 | 8.5/10 | 8.25/10 |
+| **Qwen 3.5 27B** | **8.5/10** | **8.5/10** | **8.5/10** |
+| Hybrid PaddleOCR + Qwen 3.5 27B | 7.5/10 | 8/10 | 7.75/10 |
+
+Overall, the Qwen 3.5 27B model alone (i.e. not using the hybrid approach with PaddleOCR) performs best on this task for identifying difficult handwriting. However, the issue with model 'laziness' in terms of missing lines of text in its response still persists, preventing me from giving it a near-perfect score.
+
+## Example 2: Face identification
+
+The next task is to accurately identify the location of people's faces on a document. The document can be found [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/vlm_blog/qwen3.5/Lambeth%20foreword%20faces/Lambeth%202030%20FINAL%20ACC_Ver_Dec.pdf_foreword.pdf){target="_blank"}.
+
+Since the pages contain a lot of typed text, pure VLM analysis is not necessary, and would likely be worse due to the general VLM 'laziness' in terms of missing lines of text in their responses. So page OCR is conducted with a hybrid PaddleOCR + VLM model approach (for any low confidence lines). Afterwards, a second VLM pass looks specifically for photos of people's faces present on the page, and creates a bounding box for each face.
+
+The example page contains two photos of faces, one on the left side of the page and one on the right, as well as two cartoon drawings of people. So this example also tests the VLM's ability to follow instructions to distinguish between photos of faces and cartoon drawings, as well as to locate them.
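+
+To make the second-pass idea concrete, below is a minimal sketch of how a face-detection pass over a page image could be written against a local OpenAI-compatible inference server (such as the vLLM or llama.cpp deployments tested in this post). The endpoint URL, prompt wording, and JSON schema are illustrative assumptions, not the app's exact implementation.
+
+```python
+import base64
+import json
+
+from openai import OpenAI
+
+# Assumption: a local OpenAI-compatible server (e.g. vLLM or llama.cpp's
+# llama-server) is running on port 8000 and serving a Qwen VL model.
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
+
+FACE_PROMPT = (
+    "Find every photograph of a person's face on this page. "
+    "Ignore cartoons, drawings and logos. "
+    'Reply with JSON only: [{"label": "FACE", "bbox_2d": [x1, y1, x2, y2]}, ...]. '
+    "Reply with [] if there are no photographed faces."
+)
+
+
+def detect_faces(image_path: str, model: str = "qwen-vl") -> list:
+    """Ask the VLM for face bounding boxes on a single page image."""
+    with open(image_path, "rb") as f:
+        image_b64 = base64.b64encode(f.read()).decode()
+
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
+                    {"type": "text", "text": FACE_PROMPT},
+                ],
+            }
+        ],
+        temperature=0.0,
+    )
+    # The model is instructed to reply with a JSON list of pixel-coordinate boxes.
+    return json.loads(response.choices[0].message.content)
+```
+
+In practice the returned coordinates may need rescaling to the original page size, and, as the results below show, the boxes still need manual review.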
+
+### Qwen 3 VL 8B Instruct
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Lambeth%20foreword%20faces/Qwen%203%20VL%208B%20bnb/qwen_3_vl_8b_face_identification.PNG){target="_blank"}
+
+**Rating: 7/10** - Qwen 3 VL 8B Instruct identified the faces and covered them, but the bounding boxes are not perfect - they are extended quite a bit upwards from the face location.
+
+### Qwen 3.5 9B (vLLM)
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Lambeth%20foreword%20faces/Qwen%203.5%209B%20inf/qwen_3_5_9b_lambeth_foreword_face_detection.png){target="_blank"}
+
+**Rating: 5/10** - Qwen 3.5 9B identified that there were two photos of faces on the page, but missed both faces completely with the bounding boxes. Disappointingly, this is much worse than the earlier Qwen 3 VL 8B Instruct model.
+
+### Qwen 3.5 35B A3B (llama.cpp)
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Lambeth%20foreword%20faces/Qwen%203.5%2035B%20inf/qwen_35_35B_foreword_faces.png){target="_blank"}
+
+**Rating: 7/10** - Qwen 3.5 35B A3B identified the faces and located them roughly in the correct position. However, on the face on the left, the eyes are still visible, so it cannot count as a full redaction.
+
+### Qwen 3.5 27B (Hugging Face space)
+
+__Qwen 3.5 is served on the Hugging Face space [here](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}. Upload the 'Lambeth 2030 FINAL ACC_Ver_Dec.pdf_foreword.pdf' file linked above, change the local OCR model option below to 'hybrid-paddle-vlm', and in the entities list below, ensure that 'CUSTOM_VLM_FACES' is part of the list. Then click on 'Redact document'.__
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Lambeth%20foreword%20faces/Qwen%203.5%2027B%20bnb/lambeth_foreword_faces_qwen_27b_bnb.jpg){target="_blank"}
+
+**Rating: 7/10** - Like the 35B A3B model, Qwen 3.5 27B identified the faces and located them roughly in the correct position. Like that model, the faces are only partially obscured - not good enough for redaction.
+
+Just to note, I tried the llama.cpp quantised version of the Qwen 3.5 27B model and got slightly better results. This highlights that the quantisation method can have an impact on the performance of a model, and that it is worth trying out different versions to see which works best.
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Lambeth%20foreword%20faces/Qwen%203.5%2027B%20inf/qwen_3.5_27b_lambeth_foreword_faces.png){target="_blank"}
+
+### Conclusion
+
+| Model | Face detection |
+|-------|---------------------|
+| Qwen 3 VL 8B Instruct | 7/10 |
+| Qwen 3.5 9B | 5/10 |
+| Qwen 3.5 35B A3B | 7/10 |
+| **Qwen 3.5 27B** | **7/10** |
+
+All three Qwen 3.5 models identified that there were two photos of faces on the page and ignored the cartoon drawings, so 10/10 on that part of the task. However, their success in locating the bounding boxes varied greatly. The 9B model missed completely, while the 35B A3B and 27B models located the faces roughly in the correct position, although in both cases the redaction boxes did not fully cover the faces. None of these models could be relied upon to redact faces based on this example.
+
+### Additional test - Can a paid Bedrock VLM model (Amazon Nova Pro) do better?
+
+I have found in my testing that Amazon Nova Pro served on AWS is one of the best VLM models available on the platform in terms of locating bounding boxes of text/images on pages (better than even the Claude range of models for this). Based on the above finding, that no Qwen model performs that well on face detection/location on a document page, I wondered if a paid Bedrock VLM model could do better.
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Lambeth%20foreword%20faces/Bedrock%20Nova%20Pro/amazon_nova_pro_foreword_face_detection.jpg){target="_blank"}
+
+**Rating: 8/10** - Amazon Nova Pro identified the faces and located them in the correct position. The bounding boxes are not perfect - they cover the face on the right just above the eyes, but don't fully cover the forehead.
+
+So, even paid options cannot complete this task reliably. From this perspective, the larger Qwen 3.5 models are performing pretty well. I wonder whether the inability to cover the faces completely is a prompting issue (perhaps adding instructions to cover the space around any found face would help), or whether VLMs are simply not quite there yet in general for this particular task.
+
+## Example 3: Custom entity redaction with LLMs
+
+The third task is to classify text according to specific instructions passed to an LLM model. As noted in the last post, models smaller than 27B were not good at this task, so I tried it with Gemma 3 27B, which worked well, identifying all the custom entities in the correct locations.
+
+Let's see how Qwen 3.5 does. In this case, due to the limited VRAM available on the Hugging Face space, I used the 9B model in the demonstration app. The results for the other two models (35B A3B and 27B) come from the llama.cpp deployment of the model that you can run yourself [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml){target="_blank"} (24 GB VRAM is ideal - 16 GB VRAM could work with a lower n-gpu-layers setting, or the --n-cpu-moe setting for the 35B A3B model).
+
+The custom instructions used for this task were:
+
+__"Redact Lauren's name (always cover the full name if available), email addresses, and phone numbers with the label LAUREN. Redact university names with the label UNIVERSITY. Always include the full university name if available."__
+
+The full prompt and response text log files for this task can be found in the text files in the subfolders [here](https://github.com/seanpedrick-case/document_redaction_examples/tree/main/vlm_blog/qwen3.5/Applying%20professor%20email%20LLMs/){target="_blank"}.
+
+### Qwen 3 VL 8B Instruct
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Applying%20professor%20email%20LLMs/Qwen%203%20VL%208B/qwen_3_vl_8b_entity_detection.png){target="_blank"}
+
+**Rating: 6.5/10** - The Qwen 3 VL 8B Instruct model missed the first two instances of the university name, but got the name and the last two university names correct. Not good enough.
+
+### Qwen 3.5 9B (Hugging Face space)
+
+__To try the Qwen 3.5 9B model with this example, at the [Hugging Face space](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}, click the fourth example name, 'Example email LLM PII detection', then click 'Redact document' below.__
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Applying%20professor%20email%20LLMs/Qwen%203.5%209B%20bnb/qwen_35_9b_llm_entity_extraction.png){target="_blank"}
+
+**Rating: 7/10** - The Qwen 3.5 9B model identified the entities mostly correctly, and located them roughly in the correct position. However, it missed the university label towards the bottom of the page in the second LLM call for the document. This model is probably too inconsistent to use reliably for this task.
+
+### Qwen 3.5 35B A3B (llama.cpp)
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Applying%20professor%20email%20LLMs/Qwen%203.5%2035B%20inf/qwen_35_35b_llm%20entity%20detection.png){target="_blank"}
+
+**Rating: 8/10** - The Qwen 3.5 35B A3B model identified the entities mostly correctly, and located them roughly in the correct position. However, its labelling is inconsistent. The first labels in the document cover just the university name, while the second pair of university names are redacted completely, including the 'University of...' text, as the instructions asked. Better, but still not quite there.
+
+### Qwen 3.5 27B (llama.cpp)
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/qwen3.5/Applying%20professor%20email%20LLMs/Qwen%203.5%2027B%20inf/qwen_35_27b_llm_entity_detection.png){target="_blank"}
+
+**Rating: 10/10** - The Qwen 3.5 27B model identified the entities correctly, and located them in the correct position.
+
+### Conclusion
+
+| Model | Custom entity identification |
+|-------|---------------------|
+| Qwen 3 VL 8B Instruct | 6.5/10 |
+| Qwen 3.5 9B | 7/10 |
+| Qwen 3.5 35B A3B | 7.5/10 |
+| **Qwen 3.5 27B** | **10/10** |
+
+The Qwen 3.5 27B model aced this one. The advantage over using another model such as Gemma 3 27B is that this model could be used to do both VLM and LLM tasks, and so would be much more VRAM efficient.
+
+# Overall conclusion
+
+| Model | Text identification | Face detection | Custom entity identification | Overall rating |
+|-------|---------------------|----------------|----------------|----------------|
+| Qwen 3 VL 8B Instruct | 7/10 | 7/10 | 6.5/10 | 6.8/10 |
+| Qwen 3.5 9B | 8/10 | 5/10 | 7/10 | 6.7/10 |
+| Qwen 3.5 35B A3B | 8/10 | 7/10 | 7.5/10 | 7.5/10 |
+| **Qwen 3.5 27B** | **8.5/10** | **7/10**| **10/10** | **8.5/10** |
+
+Overall, the Qwen 3.5 27B model is the winner across all three tasks. It can read difficult handwritten text, it can identify photos of faces on a page and locate them with some accuracy, and it can follow relatively complex custom instructions to identify custom entities in open text. It can also compete in performance with paid Bedrock VLM models for tasks such as face detection. This leaves open the possibility that local redaction processes with open-source VLMs/LLMs can almost equal paid options, with only a consumer-level GPU needed.
+
+The Qwen 3 VL 8B Instruct model performed surprisingly well compared to Qwen 3.5 9B across all three tasks. Qwen 3.5 9B was also let down by a terrible face detection performance. It seems that neither model is large enough to be reliable for 'difficult' redaction tasks, as demonstrated in this post.
+
+# Recommendations for local VLM use in redaction workflows
+
+Based on the above findings, this is what I would recommend for use with different tasks:
+
+- **For general OCR/redaction tasks:** use (in order) simple text extraction with a package like pymupdf for pages with selectable text, and a hybrid PaddleOCR + Qwen 3.5 27B VLM approach for pages with images. PaddleOCR will deal with all the 'easy' typewritten text, and the Qwen 3.5 27B VLM will deal with the more difficult handwriting.
+- **For documents with very difficult handwriting:** use Qwen 3.5 27B VLM, with manual checking and perhaps a second run through the model to pick up any text missed the first time (due to the model's inherent 'laziness' in not identifying all text).
+- **Face or signature detection:** use Qwen 3.5 27B VLM, with manual checking to adjust the bounding boxes so that they fully cover the face or signature where necessary. Perhaps also adjust the instructions to ask the model to cover the space around the face or signature.
+- **Custom entity identification:** use Qwen 3.5 27B LLM for any custom entity identification tasks.
\ No newline at end of file
diff --git a/src/redaction_with_vlm_and_llms.qmd b/src/redaction_with_vlm_and_llms.qmd
new file mode 100644
index 0000000000000000000000000000000000000000..2a982c57fb59581940e722d5399651b7a66f746c
--- /dev/null
+++ b/src/redaction_with_vlm_and_llms.qmd
@@ -0,0 +1,224 @@
+---
+title: "Redaction with local VLM and LLMs"
+format:
+ html:
+ toc: true # Enable the table of contents
+ toc-depth: 3 # Include headings up to level 3 (##)
+ toc-title: "On this page" # Optional: Title for your TOC
+---
+
+# Overview
+
+**NOTE:** As of March 2026, this article has been superseded with a new article based on Qwen 3.5 and an updated version of the app. See [here](https://seanpedrick-case.github.io/doc_redaction/src/redaction_with_qwen35.html){target="_blank"} for the new article.
+
+Redaction workflows require both text and bounding box identification from documents. In this post, I will test local vision language models (VLMs) from the Qwen 3 VL family against existing OCR tools to see how well they can perform in identifying text and bounding boxes from 'difficult' documents. All the examples can be recreated on the [Document Redaction app VLM space on Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}. The code underlying this app can be found [here](https://github.com/seanpedrick-case/doc_redaction){target="_blank"}.
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/3.b%20unclear_text_handwritten_note_paddle_hyb/Difficult%20handwritten%20note_page_1_hybrid_paddle_vlm_visualisations.jpg){target="_blank"}
+
+# OCR word-level text extraction vs 'standard' OCR
+
+Many OCR tools exist to extract text from PDFs into formats such as markdown, and recent VLMs are extremely good at text extraction. However, for redacting personal information from documents, it is essential to have the exact page coordinates of every relevant word, as redaction boxes need to be placed on top of the words in their exact positions on the original document. Unlike more 'traditional' OCR solutions, most VLMs were not trained to return line or word-level bounding boxes.
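+
+As a small illustration of why those coordinates matter, once the rectangle of a word is known, applying a true redaction is straightforward with a library such as PyMuPDF. This is just a sketch of the general idea (the file name and coordinates are made up), not the app's own redaction code.
+
+```python
+import pymupdf  # PyMuPDF
+
+# Hypothetical output from an OCR/PII step: the page index and rectangle of one
+# word to remove, in PDF points (x0, y0, x1, y1).
+word_to_redact = {"page": 0, "rect": (72.0, 140.5, 168.3, 156.0)}
+
+doc = pymupdf.open("example.pdf")
+page = doc[word_to_redact["page"]]
+
+# Add a redaction annotation over the word, then apply it so the underlying
+# text is actually removed rather than just covered up.
+page.add_redact_annot(pymupdf.Rect(*word_to_redact["rect"]), fill=(0, 0, 0))
+page.apply_redactions()
+
+doc.save("example_redacted.pdf")
+```
+
+Without accurate per-word rectangles, a box like the one above would either leave part of a word visible or cover neighbouring text, which is why this post cares about bounding box quality as much as text accuracy.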
+
+## Existing OCR tools for word-level text extraction
+
+For some time, local OCR models such as [tesseract](https://github.com/tesseract-ocr/tesseract){target="_blank"} have existed for the purpose of identifying word text and position on the page for clean documents. Additionally, paid options have existed for more complex PDFs, such as [AWS Textract](https://aws.amazon.com/textract/). Both of these options can be tested in the [live Document Redaction app space on Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction){target="_blank"}.
+
+These tools have limitations - the free tesseract model struggles to accurately identify text as soon as documents get noisy or complex. Other free OCR options such as PaddleOCR can identify line-level text positions, but do not natively support word-level coordinate extraction, and struggle with handwritten text. Paid options such as AWS Textract perform much better on handwriting, but the price can quickly rack up with thousands of pages to process. And even Textract struggles with scrawled handwriting and signatures.
+
+## New options - the Qwen 3 VL family
+
+Despite recent advances in local VLM models, improvements in the redaction workflow space specifically have been lacking. Until recently, there has not been a low-cost way to extract text from complex documents, while identifying word-level coordinates. The Qwen VL family is one of the first local VLM models that has been trained to identify the position of text as well as its content (see [here for examples](https://github.com/QwenLM/Qwen3-VL/blob/main/cookbooks/ocr.ipynb){target="_blank"}).
+
+With the advent of performant local VLMs (such as the Qwen 3 VL family), I thought it was a good time to try them out for the text extraction-redaction workflow. I imagine that a typical organisation would have many documents to process, and so would prefer smaller models, which reduce cost and increase throughput. Local models also give the possibility of running sensitive documents on-system in secure environments. Finally, using small local models would give organisations the possibility of fine-tuning for specific document types to improve performance (this blog post uses the models 'out of the box', quantised to 4-bit).
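+
+For reference, loading a VLM in 4-bit with transformers and bitsandbytes looks roughly like the sketch below. The checkpoint name and model class are assumptions that may differ depending on the exact Qwen release you use; the app's own model-loading code is in the repository linked above.
+
+```python
+import torch
+from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig
+
+# NF4 4-bit quantisation is what keeps an ~8B VLM within a 16 GB consumer GPU.
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+)
+
+model_id = "Qwen/Qwen3-VL-8B-Instruct"  # assumed checkpoint name
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_id,
+    quantization_config=bnb_config,
+    device_map="auto",
+)
+```
+
+Generation then follows the usual processor/generate pattern; the important point for this post is simply that quantisation brings these models within reach of a single consumer GPU.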
+
+**NOTE:** At the time of writing, the Qwen 3.5 VL family of models seemed to be imminent - I will re-run all the tests in this post with the expected Qwen 3.5 VL 9B, and perhaps the 35B MoE model, when they are released.
+
+### Why Qwen 3 VL 8B Instruct?
+
+Qwen 3 VL 8B Instruct was chosen for this project as it is a small, fast model that is able to perform well on a range of tasks, including text extraction and redaction. It is also a good balance of performance and cost, as it is able to fit inside a reasonably-sized consumer GPU (16 GB VRAM) when quantised to 4-bit, which at the time of writing could be obtained for £400 (~$500) or less. Like the rest of the Qwen 3 VL family, it has been specifically trained to return line-level bounding boxes ([see here](https://github.com/QwenLM/Qwen3-VL/blob/main/cookbooks/ocr.ipynb){target="_blank"}), which in combination with the word-level segmentation functionality in the [Redaction app](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}, could make this model suitable for use within a redaction workflow.
+
+All of the examples described below can be recreated on the [Document redaction VLM space on Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}. Visual outputs from each experiment are also provided below.
+
+# Example 1 - 'Easy' document OCR - where VLMs are not needed
+
+Let's start our investigation into local model OCR with a simple example - the first page of the document found [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/example_data/Partnership-Agreement-Toolkit_0_0.pdf){target="_blank"}.
+
+We don't need to use complicated solutions to extract text with word-level coordinates - often the simplest solution is perfectly adequate. This is demonstrated at the [Document Redaction VLM Hugging Face space](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}, with the Baseline 'easy' document page example. As indicated by the name, this is a clear, simple document page without noise. Here we can see that current OCR solutions are adequate for text extraction, and the additional use of VLMs is not needed.
+
+__Click on the first example - 'Baseline 'easy' document page', and click through the steps including the button 'Redact document'. Then wait a couple of seconds for the results that appear in the output box.__
+
+## 'Easy' document OCR with tesseract
+
+The default OCR option for this example is [tesseract](https://github.com/tesseract-ocr/tesseract){target="_blank"} - a small, fast OCR model that has been in development for many years. Below is the OCR output from the tesseract model. As you can see, it identifies the vast majority of the text with high confidence. The left page shows the pre-processed image with the identified bounding boxes overlaid. The right page shows the text identified by the model, with colour indicating the confidence of the analysis (green is high confidence, red is low confidence).
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/1.a%20baseline_easy_document_page_tess/Partnership-Agreement-Toolkit_0_0_page_1_tesseract_visualisations.jpg){target="_blank"}
+
+However, even with this 'simple' example, there are some issues. Tesseract extracts a few words with low confidence, and some are wrong. See 'Connect globally - Thrive locally' in the top left of the page. Can a more performant local model perform better? Let's try PaddleOCR.
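+
+As an aside, if you want to reproduce this kind of word-level tesseract output outside the app, tesseract exposes word boxes and confidences directly, for example through pytesseract. A minimal sketch, assuming tesseract and pytesseract are installed and the page has been exported as an image:
+
+```python
+from PIL import Image
+import pytesseract
+from pytesseract import Output
+
+# image_to_data returns one entry per detected word, including its bounding box
+# (left, top, width, height) and a 0-100 confidence score.
+data = pytesseract.image_to_data(Image.open("page_1.png"), output_type=Output.DICT)
+
+for text, left, top, width, height, conf in zip(
+    data["text"], data["left"], data["top"], data["width"], data["height"], data["conf"]
+):
+    if text.strip():
+        print(f"{text!r} at ({left}, {top}, {width}, {height}) confidence {conf}")
+```
+
+It is this per-word confidence that drives the green/red colouring in the visualisations shown in this post.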
+
+## 'Easy' document OCR with PaddleOCR
+
+An alternative option to try is PaddleOCR, which uses a tiny VLM (PP-OCRv5), is quite fast, and generally performs much better than tesseract at the cost of some speed and resource needs - [details here](https://huggingface.co/papers/2507.05595){target="_blank"}. Another disadvantage particular to the redaction use case is that PaddleOCR (along with almost all open source OCR solutions) only identifies bounding boxes at the line level, not the word level. To account for this, the [Redaction app](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"} uses post-OCR processing to [segment line bounding boxes into word bounding boxes](https://github.com/seanpedrick-case/doc_redaction/blob/main/tools/word_segmenter.py){target="_blank"}.
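+
+The idea behind that segmentation step can be illustrated with a naive approach: split the line-level bounding box into word boxes in proportion to each word's share of the characters in the line. This is only a sketch of the concept - the app's actual segmenter linked above is more involved.
+
+```python
+def segment_line_into_words(line_text: str, line_box: tuple):
+    """Naively split a line bounding box (x0, y0, x1, y1) into per-word boxes,
+    allocating width in proportion to character positions within the line."""
+    x0, y0, x1, y1 = line_box
+    words = line_text.split()
+    if not words:
+        return []
+
+    width_per_char = (x1 - x0) / max(len(line_text), 1)
+
+    boxes = []
+    cursor = 0  # character position within the line
+    for word in words:
+        start = line_text.index(word, cursor)
+        end = start + len(word)
+        boxes.append((word, (x0 + start * width_per_char, y0, x0 + end * width_per_char, y1)))
+        cursor = end
+    return boxes
+
+
+print(segment_line_into_words("Sister City Agreement", (100.0, 50.0, 300.0, 65.0)))
+```
+
+Real text (and especially handwriting) rarely has evenly spaced characters, which is one reason the word-level boxes in the examples below are sometimes slightly off even when the line-level box is correct.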
+
+__To test PaddleOCR with our baseline example, first click on the example at the Hugging Face space, but at step 2 (Choose text extraction (OCR) method), select 'paddle' instead of tesseract. Then run through the steps and click on 'Redact document' as before. Note that the Hugging Face spaces version of the Redaction app uses the CPU-powered version of PaddleOCR due to compatibility issues with torch and Spaces - the GPU-enabled version of PaddlePaddle run locally is much faster.__
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/1.b%20%20baseline_easy_document_page_padd/Partnership-Agreement-Toolkit_0_0_page_1_paddle_visualisations.jpg){target="_blank"}
+
+The results above are __almost__ perfect - with the exception of an 'I' missed off INTERNATIONAL in the top left, and a misplaced bounding box in the top right, where the first 'P' of 'Partnership' has been cut off from the rest of the word, caused by a relatively large gap in this title word that may have been created by page pre-processing techniques. Additionally, line to word level bounding box segmentation is still a method I am working on and could be improved - see [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/tools/word_segmenter.py){target="_blank"} for the code.
+
+## Conclusion
+
+Overall, I would say the performance on word-level OCR on this page is pretty similar between tesseract and PaddleOCR + post-OCR word segmentation. Personally, I would probably stick with tesseract for such 'easy' pages in documents without selectable text, due to the speed increase. However, for anything more difficult than this, tesseract clearly falls flat, as we will see in the following examples.
+
+**NOTE:** as the page actually contains almost exclusively selectable text, even quicker text extraction with the [pdfminer.six package](https://github.com/pdfminer/pdfminer.six){target="_blank"} can be used through the option 'Local model - selectable text'. However, the purpose of this post is to compare OCR options, so from now on I will consider document pages with the assumption that some sort of OCR solution is needed.
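+
+As a side note, a quick way to check whether a page even needs OCR is to test for an embedded text layer, for example with pdfminer.six. A minimal sketch (the character threshold is an arbitrary assumption):
+
+```python
+from pdfminer.high_level import extract_text
+
+
+def page_has_selectable_text(pdf_path: str, page_index: int, min_chars: int = 20) -> bool:
+    """Return True if the page already has an embedded text layer worth using."""
+    text = extract_text(pdf_path, page_numbers=[page_index])
+    return len(text.strip()) >= min_chars
+
+
+print(page_has_selectable_text("Partnership-Agreement-Toolkit_0_0.pdf", 0))
+```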
+
+
+# Example 2 - Scanned, noisy document page
+
+We'll now try a more difficult page from the [same document as above - page 6](https://github.com/seanpedrick-case/doc_redaction/blob/main/example_data/Partnership-Agreement-Toolkit_0_0.pdf){target="_blank"}. This page is primarily a scanned document that contains significant noise, as well as signatures. We will test tesseract, PaddleOCR, and a hybrid PaddleOCR + Qwen 3 VL approach.
+
+## Scanned, noisy document page with tesseract
+
+__Click the second example name, 'Scanned document page with signatures', and on step 2 'Choose text extraction (OCR) method', change the local OCR model option to 'tesseract'. Then run through the other steps and click 'Redact document'.__
+
+Below you can see the results of the OCR process:
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/2.a%20scanned_document_page_sig_tesseract/Partnership-Agreement-Toolkit_0_0_page_6_tesseract_visualisations.jpg){target="_blank"}
+
+As you can see from the above, the results are... pretty bad. The tesseract model seems to have become lost in the noisy text from the scanned document, frequently failing to identify individual words, and misreading the text completely. It has also missed the handwriting and signatures completely. I would say that tesseract is unusable for 'noisy' documents like these.
+
+## Scanned, noisy document page with PaddleOCR
+
+__As before, click the second example name, 'Scanned document page with signatures', and on step 2 'Choose text extraction (OCR) method', change the local OCR model option to 'paddle'. Then run through the other steps and click 'Redact document'.__
+
+The results are as follows:
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/2.b%20scanned_document_page_signatures_padd/Partnership-Agreement-Toolkit_0_0_page_6_paddle_visualisations.jpg){target="_blank"}
+
+We can see that PaddleOCR performs much better than tesseract, but still has some issues. The bounding boxes for the lines are generally good, but at the word level they are not all accurate (which may be more due to the word segmentation process). It has identified the handwriting and signatures, but the text identified is not always correct - see 'Cit of Lona Beach' and 'Sister Lity Agreement' near the top of the page as a couple of examples. The signature text detection is completely off. I would say that PaddleOCR is an ok option for 'noisy' documents with typed text, if you are not too worried about the accuracy of the handwriting or signature analysis.
+
+## Scanned, noisy document page with a hybrid PaddleOCR + Qwen 3 VL approach
+
+We will now try a hybrid approach with PaddleOCR + Qwen 3 VL. In this approach, PaddleOCR is used to identify the bounding boxes for the lines, and then Qwen 3 VL 8B Instruct is used to re-analyse words that had low confidence from PaddleOCR. Additionally, a second VLM pass is conducted specifically to identify the position of signatures.
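+
+In outline, the hybrid pass works something like the sketch below: run PaddleOCR, keep high-confidence words as they are, and send a crop of each low-confidence word to the VLM for a second opinion. This is a simplified illustration of the idea rather than the app's actual function (which is linked further below); the confidence threshold and the `ocr_words`/`vlm_read_text` inputs are assumptions for the sake of the example.
+
+```python
+from PIL import Image
+
+CONFIDENCE_THRESHOLD = 0.80  # assumed cut-off; words below this go to the VLM
+
+
+def hybrid_ocr(page_image: Image.Image, ocr_words, vlm_read_text):
+    """ocr_words: iterable of dicts like {"text": str, "conf": float, "box": (x0, y0, x1, y1)}.
+    vlm_read_text: callable that takes a cropped PIL image and returns its text."""
+    results = []
+    for word in ocr_words:
+        if word["conf"] >= CONFIDENCE_THRESHOLD:
+            results.append(word)  # trust the traditional OCR result
+            continue
+
+        # Crop a small image around the low-confidence word and ask the VLM to re-read it.
+        crop = page_image.crop(word["box"])
+        corrected = vlm_read_text(crop)
+        results.append({**word, "text": corrected, "source": "vlm"})
+    return results
+```
+
+This keeps the VLM focused on the handful of words that actually need it, at the cost of being limited by whatever the first PaddleOCR pass got confidently wrong - as we will see below.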
+
+__As before, click the second example name, 'Scanned document page with signatures', and on step 2 'Choose text extraction (OCR) method', change the local OCR model option to 'hybrid-paddle-vlm'. Then run through the other steps and click 'Redact document'.__
+
+The results are as follows:
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/2.c%20scanned_document_page_signatures_padd_hyb/Partnership-Agreement-Toolkit_0_0_page_6_hybrid_paddle_vlm_visualisations.jpg){target="_blank"}
+
+Words that the VLM had a try at re-analysing are shown in a grey box on the right hand side of the page. To do this, a small image is 'cut' from the original page image, and passed to the VLM to identify the text. Below is an example of one of these images from this page where the VLM has successfully corrected the phrase 'Cit of Lona Beach' to 'City of Long Beach':
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/2.c%20scanned_document_page_signatures_padd_hyb/Cit%20of%20Lona%20Bea_conf_92_to_City%20of%20Long%20Be_conf_100.png){target="_blank"}
+
+Prompts underlying the above 'hybrid' approach (including the signature second pass) can be found [here](https://github.com/seanpedrick-case/doc_redaction/blob/1da988af088b1357695db9b261b91fdd339439de/tools/run_vlm.py#L781C1-L793C64){target="_blank"}. The hybrid OCR function itself can be found [here](https://github.com/seanpedrick-case/doc_redaction/blob/1da988af088b1357695db9b261b91fdd339439de/tools/custom_image_analyser_engine.py#L6006).
+
+We can see that the hybrid PaddleOCR + Qwen 3 VL approach performs much better than PaddleOCR alone. It is however limited by the performance of the first PaddleOCR pass. As 'Sister Lity Agreement' from PaddleOCR was a high confidence analysis, the VLM did not have a chance to re-analyse it.
+
+The signature text detection is slightly better, but still not perfect. The second pass VLM analysis correctly identified the positions of the signatures on the page, identified as [SIGNATURE] entries in the output (see the right hand page). In terms of reading text, the first signature is identified by the VLM as 'Beperly O'Neill' - almost perfect. The second signature is read as 'Dulus' - which is incorrect, but a symptom of the fact that PaddleOCR identified only part of the signature as a valid bounding box for the word.
+
+## Scanned, noisy document page with Qwen 3 VL alone
+
+__Click the second example name, 'Scanned document page with signatures', and on step 2 'Choose text extraction (OCR) method', change the local OCR model option to 'vlm'. Then run through the other steps and click 'Redact document'.__
+
+The results are as follows:
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/2.d%20scanned_document_page_signatures_vlm/Partnership-Agreement-Toolkit_0_0_page_6_vlm_visualisations.jpg){target="_blank"}
+
+There are a number of issues with this analysis. First, the left hand side of the page shows that the bounding box locations identified by Qwen 3 VL are frequently inaccurate - generally (but not consistently) to the right of where they should be (NOTE: I don't think this is a rescaling issue, as we will see in the example below that the bounding box coordinates are generally correct). Based on the results for this page, I wouldn't recommend VLM analysis alone with this model.
+
+## Conclusion
+
+I would say that the hybrid approach is a good option for 'noisy' documents with typed text and mixed handwriting / signatures, if you are not interested in the specific text in the signatures. It is of course a slower and more resource-intensive approach than PaddleOCR alone, but the increased quality may be worth it, and it is still faster and more accurate than VLM analysis alone (33 seconds vs 84 seconds for analysis).
+
+# Example 3 - Unclear text on handwritten note
+
+There are of course many situations where it is important to accurately extract handwritten text from documents. This has long been a struggle for OCR tools, including paid options, but modern VLMs are getting better and better at this. How will a small model like Qwen 3 VL 8B Instruct perform? Let's find out.
+
+The document for this example is below. As you can see, some of the text is very difficult for a person to read.
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/Difficult%20handwritten%20note.jpg){target="_blank"}
+
+Let's first use PaddleOCR as a baseline to compare with the pure VLM approach.
+
+## Unclear text on handwritten note with PaddleOCR
+
+__Click the third example name, 'Unclear text on handwritten note', and on step 2 'Choose text extraction (OCR) method', change the local OCR model option to 'paddle'. Then run through the other steps and click 'Redact document'.__
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/3.a%20unclear_text_handwritten_note_paddle/Difficult%20handwritten%20note_page_1_paddle_visualisations.jpg){target="_blank"}
+
+The results are quite poor. PaddleOCR has low confidence for the majority of the text, and the text identified is pretty consistently wrong, even in cases where the confidence was high. This shows once again that PaddleOCR is not a good option for handwritten text. At least in general the line-level bounding boxes are well positioned, so it may be worth trying again with the hybrid PaddleOCR + Qwen 3 VL approach.
+
+## Unclear text on handwritten note with a hybrid PaddleOCR + Qwen 3 VL 8B Instruct
+
+__Click the third example name, 'Unclear text on handwritten note', and on step 2 'Choose text extraction (OCR) method', change the local OCR model option to 'hybrid-paddle-vlm'. Then run through the other steps and click 'Redact document'.__
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/3.b%20unclear_text_handwritten_note_paddle_hyb/Difficult%20handwritten%20note_page_1_hybrid_paddle_vlm_visualisations.jpg){target="_blank"}
+
+This analysis is better. The text identified by the VLM is much more accurate, but still not great. The word-level bounding boxes are ok in general, but may be limited by the performance of the first PaddleOCR pass, and the line-level images passed to the VLM as part of the hybrid approach. A number of example line images passed to the VLM can be seen from the PNG files in this folder [here](https://github.com/seanpedrick-case/document_redaction_examples/tree/main/vlm_blog/3.b%20unclear_text_handwritten_note_paddle_hyb/){target="_blank"}.
+
+I would say that this analysis is still not good enough. How well could a pure VLM approach + word-level bounding box segmentation perform?
+
+## Unclear text on handwritten note with Qwen 3 VL 8B Instruct alone
+
+__Click the third example name, 'Unclear text on handwritten note', and on step 2 'Choose text extraction (OCR) method', change the local OCR model option to 'vlm'. Then run through the other steps and click 'Redact document'.__
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/3.c%20unclear_text_handwritten_note_vlm/Difficult%20handwritten%20note_page_1_vlm_visualisations.jpg){target="_blank"}
+
+The code for this approach can be found [here](https://github.com/seanpedrick-case/doc_redaction/blob/1da988af088b1357695db9b261b91fdd339439de/tools/custom_image_analyser_engine.py#L2702){target="_blank"}.
+
+The results in terms of text identification are better. The text identified by the VLM is generally accurate, with some exceptions (e.g. 'PANDO - ANDO - REINE this plant', which I think should be 'PANDO - ANDO - REINE thus prevent'). I would say that this is pretty close to the level of accuracy that a human would achieve in transcribing from the same image.
+
+An additional issue is that the analysis has completely missed a few words on the left of the page - an issue that I think could be model-specific, and one where a larger Qwen 3 VL model would likely perform better.
+
+The bounding box locations at the line level are generally OK, and as before, word-level bounding box issues are more a symptom of the word segmentation process, rather than the VLM itself.
+
+## Conclusion
+
+I would say that Paddle + VLM hybrid analysis is the best option for handwritten text. The text extraction quality may be slightly worse than the pure VLM approach, but I think the increased general reliability in terms of identifying line-level bounding boxes, and not missing text completely, puts it ahead for me.
+
+I would also expect a bigger/more modern model in the Qwen VL family would perform better in text identification and bounding box location accuracy. Additionally, with the Qwen 3.5 series of small local models imminent, this conclusion may change very soon. I will update the analysis as soon as these models are available.
+
+# Example 4 - Bonus VLM features - face identification
+
+The use of VLMs in the redaction process gives rise to some potential bonus features that are not possible with traditional OCR tools. Let's see how the Qwen 3 VL 8B Instruct model performs at identifying faces in a document.
+
+__Choose the example 'CV with photo - face identification'. The OCR method will be 'hybrid-paddle-vlm'. Then run through the other steps and click 'Redact document'.__
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/4.%20cv_photo_face_paddle_hyb/Example-cv-university-graduaty-hr-role-with-photo-2_page_1_hybrid_paddle_vlm_visualisations.jpg){target="_blank"}
+
+The text on the page is relatively simple to analyse, and PaddleOCR performs well, with accurate text and bounding box locations. What's of particular interest here is that we have prompted the second pass VLM analysis to identify photos of people's faces in the document. You can see this in the top left of the page shown on the right, where the location of the applicant's face is identified as [FACE] in the output. The redacted PDF output shows this face blacked out [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/vlm_blog/4.%20cv_photo_face_paddle_hyb/Example-cv-university-graduaty-hr-role-with-photo-2_redacted.pdf){target="_blank"}, with an Adobe Acrobat redaction commented version [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/vlm_blog/4.%20cv_photo_face_paddle_hyb/Example-cv-university-graduaty-hr-role-with-photo-2_redactions_for_review.pdf){target="_blank"} (download it and open in a PDF viewer to see the comments added).
+
+The prompt and response for the face identification can be found [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/vlm_blog/4.%20cv_photo_face_paddle_hyb/vlm_Example-cv-university-graduaty-hr-role-with-photo-2.pdf_0_page_0000_person_vlm.txt){target="_blank"}. Of course, other visual features could be identified with the VLM with correct prompting.
+
+# Example 5 - Redaction with LLMs (Gemma 3 27B)
+
+For identifying PII in text for redaction, small language models have been used for some time, e.g. [spaCy models](https://spacy.io){target="_blank"} perform relatively well through the [Microsoft Presidio package](https://microsoft.github.io/presidio/){target="_blank"}. Higher performance comes from paid solutions such as [AWS Comprehend](https://aws.amazon.com/comprehend/). Both of these are also demonstrated at the [Document Redaction app space](https://huggingface.co/spaces/seanpedrickcase/document_redaction){target="_blank"}. However, both solutions lack contextual understanding of the text, and offer only a standard list of entity types to identify, defined in their pre-training.
+
+Recent improvements in LLM text understanding open up the possibility of using models for 'intelligent' text redaction according to specific user prompts and the broader context of the passages under review. This functionality is still very nascent in the [Redaction app](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm){target="_blank"}, however the basic ability to add custom labels to redaction tasks with local/cloud (AWS) LLMs has been added. From my experiments, I found that a local LLM of moderate size (~30GB) can perform quite well at identifying PII in open text, and gives the ability to add custom entities to the analysis. In this example I will use a 4-bit quantised [Gemma 3 27B](https://huggingface.co/unsloth/gemma-3-27b-it-bnb-4bit){target="_blank"} model, which should fit within a consumer GPU (approx 20-24GB VRAM needed).
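+
+To give a flavour of how this works, the sketch below builds a custom-entity prompt from a page of extracted text and asks the LLM for a JSON list of items to redact. The prompt format and JSON schema are illustrative assumptions and not the app's exact prompts (the real ones are linked below); `generate` stands in for however the model is being run.
+
+```python
+import json
+
+CUSTOM_INSTRUCTION = (
+    "Redact Lauren's name, email addresses, and phone numbers with the label LAUREN. "
+    "Redact university names with the label UNIVERSITY."
+)
+
+
+def find_custom_entities(page_text: str, generate) -> list:
+    """Build a custom-entity prompt and parse the LLM's JSON reply.
+
+    `generate` is any callable that takes a prompt string and returns the model's
+    text response (a local transformers model, a llama.cpp server, Bedrock, etc.).
+    """
+    prompt = (
+        "You are redacting a document. Follow these instructions exactly:\n"
+        f"{CUSTOM_INSTRUCTION}\n\n"
+        "Return JSON only, as a list of objects giving the exact text to redact and its label, "
+        'e.g. [{"text": "lauren.smith@example.com", "label": "LAUREN"}].\n\n'
+        f"Document text:\n{page_text}"
+    )
+    return json.loads(generate(prompt))
+```
+
+The matched strings then need to be mapped back to their word-level bounding boxes from the OCR step before redaction boxes can be drawn, which is where the coordinate work from the earlier examples comes back in.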
+
+The example for this is 'Example LLM PII detection' on the [Redaction app VLM Hugging Face space](https://huggingface.co/spaces/seanpedrickcase/document_redaction_vlm), and this will analyse [the emails from this simple document](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_of_emails_sent_to_a_professor_before_applying.pdf). If you look at step 3, you will see that 'Local transformers LLM' is now the PII identification method, and a custom prompt has been added to the box below:
+
+__'Redact Lauren's name, email addresses, and phone numbers with the label LAUREN. Redact university names with the label UNIVERSITY.'__.
+
+The results from the first email in the document are below.
+
+[](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/vlm_blog/5.a%20example_email_txt_extract_llm_pii_gemma_27b/University_Lauren_example.png){target="_blank"}
+
+The prompt and response for the text above can be found [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/vlm_blog/5.a%20example_email_txt_extract_llm_pii_gemma_27b/llm_example_of_emails_sent_to_a_professor_before_applying_page_0001_batch_0001.txt){target="_blank"}, and [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/vlm_blog/5.a%20example_email_txt_extract_llm_pii_gemma_27b/llm_example_of_emails_sent_to_a_professor_before_applying_page_0001_batch_0002.txt){target="_blank"}.
+
+The LLM follows the instructions, identifying only text directly related to Lauren and the university names. This is quite limited functionality compared to what a large language model could probably offer - in the near future I will do more experimentation with small, local LLMs to see what is possible.
+
+
+# Conclusion
+
+In this post, we have seen how small local VLMs (from the Qwen 3 VL family), alone or alongside existing OCR tools, can be used as part of a redaction workflow to help extract text from 'difficult' documents and accurately locate text bounding boxes.
+
+In these examples, Qwen 3 VL 8B Instruct was used alone or with PaddleOCR, and Gemma 3 27B was used for PII identification. Both models were quantised to 4-bit using bitsandbytes, resulting in a VRAM footprint of approximately 10-12GB for the VLM and 20GB for the LLM.
+
+We have seen that for 'easy' documents (typed text), tesseract or PaddleOCR are generally sufficient. For 'noisy' documents, e.g. scanned documents with typed text and small amounts of handwriting and signatures, the hybrid PaddleOCR + Qwen 3 VL approach is a good option. For difficult handwritten text, it is close between the hybrid and the pure VLM approach as to which gives the best quality, but I sided with the hybrid approach due to its increased reliability in identifying text locations and including all text. This conclusion may change in the near future with the imminent release of the small Qwen 3.5 range of VL models, after which I will update this post.
+
+Local VLMs such as Qwen 3 VL can also be used to add additional features to redaction workflows to identify photos of people's faces and signatures in the document, and accurately locate them on the page.
+
+Local LLMs of moderate size (~30GB) can be used to augment PII identification tasks, with the ability to identify custom entities in open text with some accuracy.
+
+Overall, local VLMs have the potential to be used for redaction tasks, however, they are not quite there alone. Currently, for bounding box detection, they work better with the assistance of a 'traditional' OCR model such as PaddleOCR. With rapid advancements in AI, including local models, this of course could change in the very near future.
\ No newline at end of file
diff --git a/src/user_guide.qmd b/src/user_guide.qmd
new file mode 100644
index 0000000000000000000000000000000000000000..29fdd9b47c327b81b011311c67dc08745ec1a782
--- /dev/null
+++ b/src/user_guide.qmd
@@ -0,0 +1,1218 @@
+---
+title: "User guide"
+format:
+ html:
+ toc: true # Enable the table of contents
+ toc-depth: 3 # Include headings up to level 3 (##)
+ toc-title: "On this page" # Optional: Title for your TOC
+---
+
+## Table of contents
+
+### Getting Started
+- [Quickstart - Test the app with built-in examples](#quickstart---test-the-app-with-built-in-examples)
+ - [PDF document examples](#pdf-document-examples)
+ - [CSV/Excel file examples](#csvexcel-file-examples)
+- [Basic redaction](#basic-redaction)
+ - [Upload files to the app](#upload-files-to-the-app)
+ - [Text extraction](#text-extraction)
+ - [AWS Textract signature extraction](#aws-textract-signature-extraction)
+ - [PII redaction method](#pii-redaction-method)
+ - [Duplicate page redaction](#duplicate-page-redaction)
+ - [Allow list, deny list, and whole-page redaction](#allow-list-deny-list-and-whole-page-redaction)
+ - [Cost and time estimation](#cost-and-time-estimation)
+ - [Cost code selection](#cost-code-selection)
+ - [Redact only specific pages](#redact-only-specific-pages)
+ - [Run redaction](#run-redaction)
+ - [Redaction outputs](#redaction-outputs)
+- [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
+ - [Uploading documents for review](#uploading-documents-for-review)
+ - [Page navigation](#page-navigation)
+ - [Document viewer](#document-viewer)
+ - [Modify existing redactions](#modify-existing-redactions)
+ - [Search text and redact](#search-text-and-redact)
+ - [Navigating through the document using 'View text'](#navigating-through-the-document-using-view-text)
+ - [Apply revised redactions to PDF](#apply-revised-redactions-to-pdf)
+- [Loading in previous results to continue redaction](#loading-in-previous-results-to-continue-redaction)
+ - [Loading in previous results from the redactions_for_review.pdf file](#loading-in-previous-results-from-the-redactions_for_reviewpdf-file)
+ - [Loading in OCR results to search for new redactions](#loading-in-ocr-results-to-search-for-new-redactions)
+ - [Using a previous OCR results file to skip redoing OCR for future redaction tasks](#using-a-previous-ocr-results-file-to-skip-redoing-ocr-for-future-redaction-tasks)
+- [Document summarisation](#document-summarisation)
+- [Redacting Word, tabular data files (CSV/XLSX) or open text](#redacting-word-tabular-data-files-xlsxcsv-or-open-text)
+ - [Word or tabular data files (XLSX/CSV)](#word-or-tabular-data-files-xlsxcsv)
+ - [Choosing output anonymisation format](#choosing-output-anonymisation-format)
+ - [Redacting open text](#redacting-open-text)
+
+### Advanced user guide
+- [Identifying and redacting duplicate pages with custom settings](#identifying-and-redacting-duplicate-pages-with-custom-settings)
+ - [Duplicate page detection in documents](#duplicate-page-detection-in-documents)
+ - [Duplicate detection in tabular data](#duplicate-detection-in-tabular-data)
+- [Export redacted document files to Adobe Acrobat](#export-redacted-document-files-to-adobe-acrobat)
+ - [Using redactions_for_review.pdf files with Adobe Acrobat](#using-redactions_for_reviewpdf-files-with-adobe-acrobat)
+ - [Exporting comment files to Adobe Acrobat](#exporting-comment-files-to-adobe-acrobat)
+ - [Importing comment files from Adobe Acrobat](#importing-comment-files-from-adobe-acrobat)
+- [Submit documents to the AWS Textract API service for faster OCR](#submit-documents-to-the-aws-textract-api-service-for-faster-ocr)
+- [Advanced OCR settings - Efficient OCR, overwrite existing OCR](#advanced-ocr-settings---efficient-ocr-overwrite-existing-ocr)
+- [Modifying existing redaction review files](#modifying-existing-redaction-review-files)
+
+### Features for expert users/system administrators
+- [Using AWS Textract and Comprehend when not running in an AWS environment](#using-aws-textract-and-comprehend-when-not-running-in-an-aws-environment)
+- [Advanced OCR model options (including Hybrid OCR)](#advanced-ocr-options-hybrid-ocr)
+- [PII identification with LLMs](#pii-identification-with-llms)
+- [Command Line Interface (CLI)](#command-line-interface-cli)
+
+## Quickstart - Test the app with built-in examples
+
+### PDF document examples
+
+The app provides some built-in examples so you can see how it works before trying one of your own files.
+
+**For PDF/image redaction:** On the 'Redact PDFs/images' tab, you'll see a section titled "Try an example - Click on an example below and then the 'Extract text and redact document' button". Simply click on any of the available examples to load them with pre-configured settings:
+
+- **PDF with selectable text redaction** - Uses local text extraction with standard PII detection
+- **Image redaction with local OCR** - Processes an image file using OCR
+- **PDF redaction with custom entities** - Demonstrates custom entity selection (Titles, Person, Dates)
+- **PDF redaction with AWS services and signature detection** - Shows AWS Textract with signature extraction (if AWS is enabled)
+- **PDF redaction with custom deny list and whole page redaction** - Demonstrates the use of redacting specific named terms and whole pages
+
+Once you have clicked on an example, you can click the 'Extract text and redact document' button to redact the document. You can then click the 'Review and modify redactions' button below this to review and modify suggested redactions. See the 'Basic redaction' section below for more details on redacting your own documents.
+
+### CSV/Excel file examples
+
+**For tabular data:** On the 'Word or Excel/CSV files' tab, you'll find examples for both redaction and duplicate detection:
+
+- **CSV file redaction** - Shows how to redact specific columns in tabular data
+- **Word document redaction** - Demonstrates Word document processing
+- **Excel file duplicate detection** - Shows how to find duplicate rows in spreadsheet data
+
+Once you have clicked on an example, you can click the 'Redact text/data files' button directly to redact the example file. Once done, you can click the 'Review redactions' button to review and modify suggested redaction boxes.
+
+## Basic redaction
+
+The document redaction app can detect personally-identifiable information (PII) in documents. Documents can be redacted directly, or suggested redactions can be reviewed and modified using a graphical user interface. Basic document redaction can be performed quickly using the default options.
+
+**Where to work:** All of the main redaction options and the redact button are on the **'Redact PDFs/images'** tab.
+
+
+
+### Upload files to the app
+
+On the **'Redact PDFs/images'** tab, the **'Redaction settings'** accordion at the top accepts PDFs and image files (JPG, PNG) for redaction. Click on the **'Drop files here or Click to Upload'** area, or select from one of the [examples provided](#pdf-document-examples).
+
+### Text extraction
+
+Under the **'Redaction settings'** accordion, you can see **'Change default text extraction settings'**. You may have the following options available depending on your configuration - if not, AWS Textract will likely be the default option:
+
+- **'Local model - selectable text'** - (optional) Reads text directly from PDFs that have selectable text.
+- **'Local OCR model - PDFs without selectable text'** - (optional) Uses a local OCR model to extract text from PDFs/images. Handles most typed text without selectable text but is less accurate for handwriting and signatures; use the AWS Textract option in this case.
+- **'AWS Textract service - all PDF types'** - Available when the app is configured for AWS. Textract runs in the cloud and is more capable for complex layouts, handwriting, and signatures. It incurs a (relatively small) cost per page.
+
+### AWS Textract signature extraction
+
+
+
+If you select **'AWS Textract service - all PDF types'** as the text extraction method, an accordion **'Enable AWS Textract signature detection (default is off)'** appears. Open it to turn on handwriting and/or signature detection. Enabling signatures has a cost impact (~£2.66 ($3.50) per 1,000 pages vs ~£1.14 ($1.50) per 1,000 pages without signature detection).
+
+
+
+
+
+**NOTE:** Form, layout, table extraction, or face detection can be enabled if required for specific use cases; they are off by default - please contact your system administrator if you need these features.
+
+### PII redaction method
+
+Next we need to choose our model to identify personally-identifiable information (PII) in the document. Under the **'Change PII identification method'** accordion (within **'Change default redaction settings'**) you will see **'Choose redaction method'**, a radio button with three options:
+- **'Extract text only'** runs text extraction without redaction — useful when you only need OCR output or want to review text before redacting.
+- **'Redact all PII'** (the default) uses the chosen PII detection method to find and redact personal information across a range of standard entity types, e.g. addresses, names, dates, etc.
+- **'Redact selected terms'** will focus redaction only on the specific terms in the [custom allow/deny lists](#allow-list-deny-list-and-whole-page-redaction) below.
+
+Still under **'Change default redaction settings'**, you may see the **'Change PII identification model'** section, if enabled, which lets you choose how PII is detected. You may have the choice of the following options. If not, AWS Comprehend will likely be the default option:
+
+- **'Local'** - (optional) Uses a local model (e.g. spaCy) to detect PII at no extra cost, but with less accuracy than the alternative options.
+- **'AWS Comprehend'** - Uses AWS Comprehend for PII detection when the app is configured for AWS; typically more accurate but incurs a cost (around £0.0075 ($0.01) per 10,000 characters).
+- Other options may be available depending on the app settings (e.g. AWS Bedrock, local LLM models).
+
+Under **'Select entity types to redact'** you can choose which types of PII to redact (e.g. names, emails, dates). Click in the box or near the dropdown arrow to see the full list. Any entity type that remains in the box will be searched for during the redaction process.
+
+
+
+### Duplicate page redaction
+
+Alongside the 'Change PII identification method' section, you will see 'Redact duplicate pages'. If this is enabled, following the main redaction process, the app will identify pages with duplicate text in the document and redact them in the same run. If you want to modify the duplicate page detection settings, you can do so on the **Identify duplicate pages** tab - please refer to the [Identifying and redacting duplicate pages](#identifying-and-redacting-duplicate-pages) section for more details.
+
+### Allow list, deny list, and whole-page redaction
+
+Underneath you will see **'Terms to always include or exclude in redactions, and whole page redaction'**. Here you can:
+
+- **Allow list** – Terms that, if found, will never be redacted. To use, ensure that CUSTOM is selected in the **'Select entity types to redact'** dropdown.
+- **Deny list** – Terms that, if found, will always be redacted. To use, ensure that CUSTOM is selected in the **'Select entity types to redact'** dropdown.
+- **Fully redact these pages** – Page numbers that will be fully redacted with a box that covers the entire page.
+- **Maximum spelling mistakes for matching deny list terms** – Number (0–9) used for [fuzzy matching](#fuzzy-search-and-redaction) of Deny list terms when **CUSTOM_FUZZY** is selected in the entity list. For example, if this is set to 1, each Deny list term will be matched with up to one spelling mistake (a minimal sketch of this kind of matching is shown below). Note that setting this value greater than 0 will increase the time taken to redact the document.
+
+To add items to the allow list, deny list, or fully redacted pages list, type the item into the relevant box and press Enter. You can remove items by clicking the **'x'** next to an item, or by pressing the backspace key when the box is selected.
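+
+To illustrate what the 'Maximum spelling mistakes' value means in practice, here is a minimal sketch of edit-distance matching in Python. It is an illustration of the general technique only, not the app's actual implementation, and the deny list term and threshold are made up for the example:
+
+```python
+# Minimal illustration of fuzzy (edit-distance) matching for deny list terms.
+# This is NOT the app's implementation - just a sketch of the general idea.
+
+def edit_distance(a: str, b: str) -> int:
+    """Classic Levenshtein distance via dynamic programming."""
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, start=1):
+        curr = [i]
+        for j, cb in enumerate(b, start=1):
+            cost = 0 if ca == cb else 1
+            curr.append(min(prev[j] + 1,          # deletion
+                            curr[j - 1] + 1,      # insertion
+                            prev[j - 1] + cost))  # substitution
+        prev = curr
+    return prev[-1]
+
+deny_term = "Jonathan"   # hypothetical deny list term
+max_mistakes = 1         # the 'Maximum spelling mistakes' setting
+
+for candidate in ["Jonathan", "Jonathen", "Johnathan", "Joanna"]:
+    match = edit_distance(deny_term.lower(), candidate.lower()) <= max_mistakes
+    print(candidate, "->", "redact" if match else "keep")
+```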
+
+
+
+You can add or remove terms directly in these controls. To load many terms from a file (e.g. a CSV), use the file upload areas at the top of the **Settings** tab. If you upload a CSV containing a single column of terms into one of these boxes, the terms will be loaded into the corresponding allow or deny list on the Redact PDFs/images tab. Similarly, if you upload a CSV containing a single column of page numbers into the 'Fully redact these pages' box, those pages will be fully redacted on the next redaction run, and the relevant box on the Redact PDFs/images tab will be filled with the page numbers.
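+
+As an illustration (the terms are made up), a deny list CSV need contain nothing more than a single column of terms, one per row:
+
+```
+Jane Doe
+Project Falcon
+jane.doe@example.com
+```
+
+An allow list CSV follows the same shape, and a 'Fully redact these pages' CSV is a single column of page numbers, one per row.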
+
+
+
+### Cost and time estimation
+
+If enabled, an **'Estimated costs and time taken'** section appears on the **'Redact PDFs/images'** tab. **'Existing Textract output file found'** (or **'Existing local OCR output file found'**) is ticked automatically when previous extraction output for the same document already exists in the output folder or was [uploaded by you](#aws-textract-outputs), which indicates that the text extraction process will not be repeated for future redaction tasks.
+
+
+
+### Cost code selection
+
+If cost codes are enabled, an **'Assign task to cost code'** section appears on the same tab. Choose a cost code before running redaction. You can search the table or type in the **'Choose cost code for analysis'** dropdown to filter. You can then set a default cost code for future redaction tasks with your login by clicking on the **'Set default cost code'** button.
+
+
+
+### Redact only specific pages
+
+To redact only a subset of pages (e.g. only page 1), go to the **Settings** tab and open the **'Redact only selected pages'** accordion. Set **'Lowest page to redact (set to 0 to redact from the first page)'** and **'Highest page to redact (set to 0 to redact to the last page)'** (e.g. both 1 for only page 1). The next redaction run will only process that range; output filenames will include a suffix like **'..._1_1.pdf'**.
+
+
+
+### Run redaction
+
+Once all the above settings have been prepared, at the bottom of the **'Redact PDFs/images'** tab, open the **'Extract text and redact document'** accordion and click **'Extract text and redact document'**. The app will process the document (typically around 30 pages per minute, depending on options). When finished, a message will indicate completion and output files will appear in the **'Output files'** area. Use **'Review and modify redactions'** to open the review tab.
+
+### Redaction outputs
+
+After you click **'Extract text and redact document'**, the **'Output files'** area on the **'Redact PDFs/images'** tab shows:
+
+
+
+- **'...redacted.pdf'** – The original PDF with suggested redactions applied (text removed and replaced by a black box).
+- **'...redactions_for_review.pdf'** – The original PDF with redaction boxes overlaid but text still visible. Use this in Adobe Acrobat or other PDF viewers to review suggested redactions before finalising.
+- **'...ocr_outputs.csv'** – Line-by-line extracted text from the document (useful for searching text in Excel or similar).
+- **'...ocr_outputs_with_words.csv'** – Word-level extracted text from the document with bounding boxes.
+- **'...review_file.csv'** – Details and locations of all suggested redactions; required for the [review process](#reviewing-and-modifying-suggested-redactions).
+
+#### Additional AWS Textract / local OCR outputs
+
+You may also see a **'..._textract.json'** file and/or a **'...ocr_outputs_with_words.json'** file. You can save these to your computer and upload them later alongside your input document to skip re-running text extraction for the same document:
+
+
+
+
+#### Log file outputs and other optional outputs
+
+On the **Settings** tab, open the **'Log file outputs'** accordion to access log and optional output files. You may see a **'decision_process_table.csv'** (decisions made per page) and, if enabled by your administrator, **'..._visualisations.jpg'** images showing OCR bounding boxes per page:
+
+
+
+#### Downloading output files from previous redaction tasks
+
+If you are logged in via AWS Cognito and lose the app page (e.g. after a crash or reload), you may still be able to recover output files if the server has not been restarted. When enabled, open the **Settings** tab and use **'View and download all output files from this session'** at the bottom. Click **'Refresh files in output folder'**, then tick the box next to a file to display and download it.
+
+
+
+## Reviewing and modifying suggested redactions
+
+Sometimes the app will suggest redactions that are incorrect, or will miss phrases with personal information. The app allows you to review and modify suggested redactions to compensate for this. You can do this on the 'Review redactions' tab.
+
+We will go through ways to review suggested redactions with an example. On the **'Redact PDFs/images'** tab, upload the ['Example of emails sent to a professor before applying.pdf'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_of_emails_sent_to_a_professor_before_applying.pdf) file. Keep the default 'Local model - selectable text' option and click **'Extract text and redact document'**. Once the outputs are created, go to the **'Review redactions'** tab.
+
+On the 'Review redactions' tab you have a visual interface that allows you to inspect and modify redactions suggested by the app. There are quite a few options to look at, so we'll go from top to bottom.
+
+
+
+### Uploading documents for review
+
+At the top of the tab are file upload boxes for the documents you want to review. If you have just done a redaction task, these boxes should already be filled with the relevant files. In the left box (1.), upload the original PDF file. If you have previously redacted a document, you can also upload the '...redactions_for_review.pdf' file produced by the redaction process, which will load in the previous redactions.
+
+In the second input file box to the right (2.), you can upload a '..._ocr_result_with_words' file, which allows you to search through the text and easily [add new redactions based on word search](#searching-and-adding-custom-redactions). You can also upload an '..._ocr_output.csv' file here (produced by a redaction task) so that you can navigate the extracted text from the document. Click the button '2. Upload Review or OCR csv files' to load in these files.
+
+Now you can review and modify the suggested redactions using the interface described below.
+
+
+
+### Page navigation
+
+You can change the page viewed either by clicking 'Previous page' or 'Next page', or by typing a specific page number in the 'Current page' box and pressing Enter on your keyboard. Each time you switch pages, the app saves the redactions you have made on the page you are moving from, so you will not lose your changes.
+
+You can also navigate to different pages by clicking on rows in the tables under 'Search suggested redactions' to the right, or 'search all extracted text' (if enabled) beneath that.
+
+### Document viewer
+
+In the centre left of the tab, you will see the first page of the document. On the selected page, each redaction is highlighted with a box next to its suggested redaction label (e.g. person, email). To zoom in and out of the page, use your mouse wheel. When the document viewer is selected, press the spacebar to return to the default zoom.
+
+
+
+There are a number of options for adding and modifying redaction boxes on the document viewer pane. At the top and bottom of the document viewer you will see the relevant controls: a box icon, a hand icon, and two arrows pointing counter-clockwise and clockwise (details below).
+
+
+
+
+#### Modify existing redactions (hand icon)
+
+After clicking on the hand icon (or pressing 'd' on your keyboard when the document viewer is selected), you can modify existing redaction boxes. In this mode, click and hold an existing box to move it, or click one of the small handles at its edges to resize it. Double-click on a box to open a menu where you can change its label or colour, or remove it.
+
+
+
+To delete a box, click on it to highlight it, then press Delete on your keyboard. Alternatively, double-click on a box and click 'Remove' in the menu that appears.
+
+
+
+#### Add new redaction boxes (box icon)
+
+To change to 'add redaction boxes' mode, scroll to the bottom of the page. Click on the box icon, and your cursor will change into a crosshair. Now you can add new redaction boxes where you wish. A popup will appear when you create a new box so you can select a label and colour for the new box.
+
+#### 'Locking in' new redaction box format
+
+It is possible to lock in a chosen format for new redaction boxes so that you don't have the popup appearing each time. When you make a new box, select the options for your 'locked' format, and then click on the lock icon on the left side of the popup, which should turn blue.
+
+
+
+You can now add new redaction boxes without a popup appearing. If you want to change or 'unlock' your chosen box format, click on the new icon that has appeared at the bottom of the document viewer pane, which looks a little like a gift tag. You can then change the defaults, or click on the lock icon again to 'unlock' the new box format - popups will then appear again each time you create a new box.
+
+
+
+#### Saving new page redactions on the Document viewer pane
+
+If you are working on a page and you have created boxes manually or modified existing boxes, you are advised to click on the 'Save changes on current page to file' button to the right to ensure that they are saved to your output files.
+
+### Modify existing redactions
+
+To the right of the Document viewer pane you should see a heading above a table called 'Modify redactions' (see below). The table shows a list of all the suggested redactions in the document alongside the page, label, and text (if available).
+
+
+
+If you click on one of the rows in this table, you will be taken to the page of the redaction. Clicking on a redaction row for the current page will change the colour of that redaction box to blue to help you locate it in the document viewer (only within the app, not in redacted output PDFs).
+
+
+
+You can choose a specific entity type to see which pages the entity is present on. If you want to go to the page specified in the table, you can click on a cell in the table and the review page will be changed to that page.
+
+To filter the 'Search suggested redactions' table you can:
+1. Click on one of the dropdowns (Redaction category, Page, Text), and select an option, or
+2. Write text in the 'Filter' box just above the table. Click the blue box to apply the filter to the table.
+
+Once you have filtered the table, or selected a row from the table, you have a few options underneath on what you can do with the filtered rows:
+
+- Click the **Exclude all redactions in table** button to remove all redactions visible in the table from the document. **Important:** ensure that you have clicked the blue tick icon next to the search box before doing this to first filter the table, or you will remove all redactions from the document. If you do end up doing this, click the 'Undo last element removal' button below to restore the redactions.
+- Click the **Exclude specific redaction row** button to remove only the redaction from the last row you clicked on from the document. The currently selected row is visible below.
+- Click the **Exclude all redactions with the same text as selected row** button to remove all redactions from the document that are exactly the same as the selected row text.
+
+**NOTE**: After excluding redactions using any of the above options, click the 'Reset filters' button below to ensure that the dropdowns and table return to seeing all remaining redactions in the document.
+
+If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
+
+### Search text and redact
+
+After a document has been processed, you may need to redact specific terms, names, or phrases that the automatic PII detection might have missed. The **"Search text and redact"** tab gives you the power to find and redact any text within your document manually.
+
+#### **Step 1: Search for Text**
+
+
+
+
+
+1. Navigate to the **"Search text and redact"** tab to the right of the **'Review redactions'** tab, under the 'Apply redactions to PDF' and 'Save changes on current page to file' buttons.
+2. The main table will initially be populated with all the text extracted from the document, broken down by word. Use the page selector to view a specific page, or 'ALL' to see every word in the document.
+3. To find specific words or phrases to redact, use the **"Multi-word text search"** box to type the word or phrase you want to find (this will search the whole document).
+4. If you want to do a regex-based search, tick the 'Enable regex pattern matching' box under 'Search options' below (some illustrative patterns are shown after this list).
+5. Click the **"Search"** button or press Enter.
+6. The table below will update to show only the rows containing text that matches your search query.
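+
+When regex matching is enabled, standard regular expression syntax can be used in the search box. The patterns below are purely illustrative examples, not patterns built into the app:
+
+```
+\b\d{2}/\d{2}/\d{4}\b     dates such as 01/02/2024
+\b07\d{9}\b               11-digit numbers starting 07 (e.g. UK mobile numbers)
+REF-\d{4,6}               reference codes such as REF-12345
+```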
+
+Below the search button you can customise the appearance and label of the new redactions under the **"Search options"** accordion:
+
+* **Label for new redactions:** Change the text that appears on the redaction box (default is "Redaction"). You could change this to "CONFIDENTIAL" or "CUSTOM".
+* **Colour for labels:** Set a custom colour for the redaction boxes. You can use the colour picker interface that pops up to select a colour.
+
+
+
+
+
+> **Tip:** If you make a mistake during the search, to clear all filters and see the full text again, click the **"Reset table to original state"** button. You can also click the 'Undo latest redaction' button to remove the latest redaction item.
+
+#### **Step 2: Select and Review a Match**
+
+Your redaction search results will appear in the table underneath the search options, with each found word on a different row.
+
+
+
+
+
+When you click on any row in the search results table:
+
+* The document preview on the left will automatically jump to that page, allowing you to see the word in its original context.
+* The details of your selection will appear in the smaller **"Selected row"** table for confirmation.
+
+#### **Step 3: Choose Your Redaction Method**
+
+You have several options for redacting the text you've found:
+
+* **Redact a single, specific instance of the phrase:**
+ * Click on the exact row in the table you want to redact.
+ * Click the **`Redact specific text row`** button.
+ * Only that single instance will be redacted.
+
+* **Redact all instances of a word/phrase:**
+ * Let's say you want to redact the name "John Smith" everywhere it appears.
+ * Find and select one instance of "John Smith" in the table.
+ * Click the **`Redact all words with same text as selected row`** button.
+ * The application will find and redact every single occurrence of "John Smith" throughout the entire document.
+
+
+
+
+
+* **Redact all current search results:**
+ * Perform a search (e.g., for a specific person's name).
+ * If you are confident that every result shown in the filtered table should be redacted, click the **`Redact all text in table`** button.
+ * This will apply a redaction to all currently visible items in the table in one go, across all relevant pages in the document.
+
+An example of the outputs you can see in the document view pane is shown below.
+
+
+
+
+#### **Undoing a Mistake**
+
+If you make a mistake, you can reverse the last redaction action you performed on this tab.
+
+* Click the **`Undo latest redaction`** button. This will revert the last set of redactions you added (whether it was a single row, all of a certain text, or all search results).
+
+> **Important:** This undo button only works for the *most recent* action. It maintains a single backup state, so it cannot undo actions that are two or more steps in the past.
+
+### Navigating through the document using 'View text'
+
+The 'View text' table shows the text extracted from the document so you can check the correctness of the OCR process. If you have just completed a redaction task, you should see this table already filled in. It will also be populated if you upload a '..._ocr_output.csv' file alongside a document file on the Review redactions tab as [described above](#uploading-documents-for-review).
+
+
+
+You can search through the extracted text by using the search bar just above the table. You can also filter specific columns by clicking on the three dots next to the column header and clicking 'Filter'. When you click on a row, the Document viewer pane to the left will change to the selected page. To return the table to its original content, click the button below the table 'Reset OCR output table filter'.
+
+
+
+
+
+### Apply revised redactions to PDF
+
+Once you have reviewed all the redactions in your document and you are happy with the outputs, you can click 'Apply revised redactions to PDF' to create new '_redacted.pdf' and '_redactions_for_review.pdf' outputs.
+
+
+
+## Loading in previous results to continue redaction
+
+You may want to return to the same document later to modify existing redactions and add new ones. In this case, especially for large documents, you do not want to waste the time and expense of redoing the redaction task from scratch. The Redaction app allows you to load in previous redaction results so you can continue from where you left off.
+
+### Loading in previous results from the _redactions_for_review.pdf file
+
+The redaction process produces a file with the suffix '_redactions_for_review.pdf', which contains all the redaction information needed to reuse in the app, and is also suitable for continuing with redactions in Adobe Acrobat.
+
+
+
+To continue with a file previously redacted, import your 'redactions_for_review.pdf' file on the 'Review redactions' tab in the [first file input box](#uploading-documents-for-review). Once your file is uploaded, you should see the first page of the document appear in the Document view pane below, and under the 'Modify redactions' tab to the right, you should see the table filled in with details of the previous redactions.
+
+### Merging previous redaction review files
+
+Say you have done multiple redaction tasks with the same file, and you want to merge all the redaction boxes together into one combined review document. You can merge multiple output 'redactions_for_review.pdf' files using the interface on the Settings page under 'Combine multiple review PDFs or CSV files'.
+
+
+
+In the file input box, upload your multiple 'redactions_for_review.pdf' files, then click 'Combine multiple review PDFs into one'. You should then get a combined file that contains all the redaction boxes from the input files. You can then upload this file into the file input on the 'Review redactions' tab to review and modify redactions [as detailed above](#loading-in-previous-results-from-the-_redactions_for_review.pdf-file).
+
+
+
+### Loading in OCR results to search for new redactions
+
+When loading in results following the above method, you will notice that the table under 'Search text and redact' is still empty. To use this feature we need to load in the OCR results from your previous redaction task.
+
+When you redact a document, one of the outputs has the suffix 'ocr_results_with_words.csv'. It's this file that we need to upload to be able to search for new redactions. To do this, find the relevant ocr_results_with_words.csv file, and upload it into the second upload box on the 'Review redactions' tab in the top right (labelled 2.). When you do this, you should see the 'Search text and redact' tab table filled in with data. The table under the 'View text' tab should also be filled in.
+
+
+
+### Using a previous OCR results file to skip redoing OCR for future redaction tasks
+
+If you have a large document that you want to redact a second time in future, you can save time and money by retaining output files from the first run, and uploading them alongside the document next time you use the app for redaction.
+
+Every time you redact, a file with the suffix '.json' is produced. This file contains all the OCR results from your analysis. To use this file next time and skip waiting and paying for the OCR process, upload it alongside your document. On the 'Redact PDFs/images' tab, the file input area allows you to upload multiple files at the same time. With your document file and the .json file in the same folder, click on 'Drop files here or click to upload', and select both the PDF file and the .json file from the previous analysis output. The app should load both in.
+
+To check the previous OCR result upload has been successful, lower down the page you should see two checkboxes, called 'Existing Textract output file found' and 'Existing local OCR output found'. If one of these has become checked, then the OCR output has been successfully uploaded. When you next redact the document with the given method, it should load in the existing OCR results to skip re-analysis.
+
+## Document summarisation
+
+When summarisation is enabled, a **Document summarisation** tab is shown in the app. It lets you generate LLM-based summaries from PDFs, or from OCR output CSVs (e.g. from a previous redaction run).
+
+
+
+Here is how you can summarise a document:
+
+1. **Upload files:**
+   * **1.a Upload PDF files:** In the summarisation tab, use "Upload one or multiple PDF files to summarise" to attach one or more PDF files, or
+   * **1.b Upload OCR output files:** In the summarisation tab, use "Upload one or multiple 'ocr_output.csv' files to summarise" to attach one or more `*_ocr_output.csv` files.
+2. **Summarisation settings (accordion):**
+ * **Choose LLM inference method for summarisation**: Choose from the LLM options available in the app settings.
+ * **Max pages per page-group summary**: Limits how many pages are summarised together.
+ * **Summary format**: **Concise** or **Detailed**.
+ * **Additional summary instructions (optional)**: e.g. "Focus on key obligations."
+3. **Generate summary:** Click **"Generate summary"** to run the summarisation.
+4. **Outputs:** When finished, you can download summary files and view the summary that appears below.
+
+
+
+## Redacting Word, tabular data files (XLSX/CSV) or open text
+
+### Word or tabular data files (XLSX/CSV)
+
+The app can be used to redact Word (.docx) files, or tabular data files such as XLSX or CSV files. For redaction of tabular files to work properly, your data file (CSV, Excel) needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. For XLSX files, each sheet that you want to redact should follow this same simple format.
+
+To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
+
+
+
+If you were instead to upload an XLSX file, you would also see a list of all the sheets in the XLSX file that can be redacted. The 'Select columns' area underneath will suggest a list of all columns in the file across all sheets.
+
+
+
+Once you have chosen your input file and data, you can choose the redaction method. If visible, 'Local' will use a small local model, which is the same as that used for documents on the **'Redact PDFs/images'** tab. 'AWS Comprehend' will give better results, at a slight cost.
+
+When you click Redact text/data files, you will see the progress of the redaction task by file and sheet, and you will receive a CSV output with the redacted data.
+
+### Choosing output anonymisation format
+
+You can also choose the anonymisation format of your output results. Open the tab 'Anonymisation output format' to see the options. By default, any detected PII will be replaced with the word 'REDACTED' in the cell. You can choose one of the following options as the form of replacement for the redacted text:
+
+- **replace with 'REDACTED'**: Replaced by the word 'REDACTED' (default)
+- **replace with entity type**: Replaced by the entity type name, e.g. 'PERSON' for people, 'EMAIL_ADDRESS' for emails, etc.
+- **redact completely**: Text is removed completely and replaced by nothing.
+- **hash**: Replaced by a unique long ID code that is consistent for a given entity text, i.e. a particular name will always map to the same ID code (a minimal sketch is shown below).
+- **mask**: Replaced with stars '*'.
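+
+As a rough sketch of how the 'hash' option behaves (not the app's exact algorithm), a consistent ID can be derived from the entity text itself, so the same name always produces the same code:
+
+```python
+# Sketch of consistent hashing for anonymisation - illustrative only,
+# not the app's actual hash scheme.
+import hashlib
+
+def hash_entity(text: str) -> str:
+    return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
+
+print(hash_entity("John Smith"))  # the same input always gives the same ID
+print(hash_entity("John Smith"))  # identical to the line above
+print(hash_entity("Jane Doe"))    # a different name gives a different ID
+```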
+
+
+
+### Redacting open text
+You can also write open text into an input box and redact that using the same methods as described above. To do this, write or paste text into the 'Enter open text' box that appears when you open the 'Redact open text' tab. Then select a redaction method, and an anonymisation output format as described above. The redacted text will be printed in the output textbox, and will also be saved to a simple CSV file in the output file box.
+
+You can test this by copying and pasting the text "John Smith is a person" into the input box, and selecting the **replace with 'REDACTED'** option. After pressing Redact text/data files, you should see the name replaced by "REDACTED" in the output textbox, and a CSV file with the redacted text in the output file box.
+
+
+
+#### Redaction log outputs
+A list of the suggested redactions from tabular data, Word, or open text redaction is also provided in the output box with the name '..._log.csv'. This file gives a tabular breakdown of the specific redactions applied, including the entity type, position, and underlying text.
+
+# Advanced user guide
+
+If the default settings and options for redaction are not sufficient for your needs, you can adjust the settings following the guide below.
+
+## Identifying and redacting duplicate pages with custom settings
+
+Simple redaction of duplicate pages using the checkbox on the 'Redact PDFs/images' tab during initial redaction is described [here](#duplicate-page-redaction). If the default settings are not sufficient for your needs, or if you want to identify duplicate lines of text or passages, you can adjust the settings following the guide below.
+
+The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/).
+
+Some redaction tasks involve removing duplicate pages of text that may exist across multiple documents. This feature helps you find and remove duplicate content that may exist in single or multiple documents. It can identify everything from single identical pages to multi-page sections (subdocuments). The process involves three main steps: configuring the analysis, reviewing the results in the interactive interface, and then using the generated files to perform the redactions.
+
+### Duplicate page detection in documents
+
+This section covers finding duplicate pages across PDF documents using OCR output files. For a quick introduction to this functionality, you could run one of the following examples:
+
+**For duplicate page detection:** On the 'Identify duplicate pages' tab, you'll find examples for finding duplicate content in documents:
+
+- **Find duplicate pages of text in document OCR outputs** - Uses page-level analysis with a similarity threshold of 0.95 and minimum word count of 10
+- **Find duplicate text lines in document OCR outputs** - Uses line-level analysis with a similarity threshold of 0.95 and minimum word count of 3
+
+Once you have clicked on an example, you can click the 'Identify duplicate pages/subdocuments' button to load the example into the app and find duplicate content.
+
+
+
+**Step 1: Upload and Configure the Analysis**
+First, navigate to the **'Identify duplicate pages'** tab. In **'Step 1: Configure and run analysis'**, upload all the ocr_output.csv files you wish to compare. These files are generated every time you run a redaction task and contain the text for each page of a document.
+
+For our example, you can upload the four 'ocr_output.csv' files provided in the example folder. Open the **'Duplicate matching parameters'** accordion to set:
+
+- **Similarity threshold:** A score from 0 to 1. Pages or sequences with text similarity above this value are considered a match (default 0.95).
+- **Minimum word count:** Pages or lines with fewer words than this are ignored (default 10).
+- **Duplicate matching mode:** **'Find duplicates by page'** compares full-page text; **'Find duplicates by text line'** compares individual lines.
+
+**Matching strategy** (below the parameters):
+- **'Combine consecutive matches into a single match (subdocument match)'** (default: checked): Finds the longest possible sequence of matching pages (subdocuments). Uncheck to use the next option.
+- **Minimum consecutive matches** (slider, shown when subdocument matching is unchecked): Only report sequences of at least this many consecutive matches (e.g. 3 for at least 3-page runs). Set to 1 for single-page matching.
+
+Once your parameters are set, click **'Identify duplicate pages/subdocuments'**.
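+
+To give a feel for what the similarity threshold means, here is a minimal sketch of comparing two pages' text with a score between 0 and 1. It uses Python's standard-library `difflib` purely as an illustration; it is not necessarily the same similarity measure the app uses:
+
+```python
+# Illustration of a 0-1 text similarity score between two pages.
+# Not necessarily the measure used by the app - a sketch only.
+from difflib import SequenceMatcher
+
+page_1 = "The committee met on 4 March to discuss the annual budget."
+page_2 = "The committee met on 4 March to discuss the annual budget and staffing."
+
+score = SequenceMatcher(None, page_1.lower(), page_2.lower()).ratio()
+print(f"Similarity: {score:.2f}")  # compare against a threshold such as 0.95
+print("Duplicate" if score >= 0.95 else "Not a duplicate")
+```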
+
+In case you want to see the original PDFs for the example, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
+
+
+
+**Step 2: Review Results in the Interface**
+After the analysis is complete, the results will be displayed directly in the interface.
+
+*Analysis Summary:* A table will appear showing a summary of all the matches found. The columns will change depending on the matching strategy you chose. For subdocument matches, it will show the start and end pages of the matched sequence.
+
+*Interactive Preview:* This is the most important part of the review process. Click on any row in the summary table. The full text of the matching page(s) will appear side-by-side in the "Full Text Preview" section below, allowing you to instantly verify the accuracy of the match.
+
+
+
+**Step 3: Download and Use the Output Files**
+The analysis also generates a set of downloadable files for your records and for performing redactions.
+
+- page_similarity_results.csv: This is a detailed report of the analysis you just ran. It shows a breakdown of the pages from each file that are most similar to each other above the similarity threshold. You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'. For single-page matches, it will list each pair of matching pages. For subdocument matches, it will list the start and end pages of each matched sequence, along with the total length of the match.
+
+
+
+- [Original_Filename]_pages_to_redact.csv: For each input document that was found to contain duplicate content, a separate redaction list is created. This is a simple, one-column CSV file containing a list of all page numbers that should be removed. To use these files, you can either upload the original document (i.e. the PDF) on the 'Review redactions' tab, and then click on the 'Apply relevant duplicate page output to document currently under review' button. You should see the whole pages suggested for redaction on the 'Review redactions' tab. Alternatively, you can reupload the file into the whole page redaction section as described in the ['Full page redaction list example' section](#full-page-redaction-list-example).
+
+
+
+If you want to combine the results from this redaction process with previous redaction tasks for the same PDF, you could merge review file outputs following the steps described in [Merging previous redaction review files](#merging-previous-redaction-review-files) above.
+
+**Redact duplicate pages on the Redact PDFs/images tab**
+
+On the **'Redact PDFs/images'** tab, in the **'Redaction settings'** area (accordion **'Terms to always include or exclude in redactions, and whole page redaction'**), there is a **'Redact duplicate pages'** checkbox. When this is enabled, the app will identify duplicate pages and apply whole-page redaction to them as part of the same redaction run. This option is shown when PII identification options are enabled. You can use it instead of (or in addition to) running duplicate detection on the **'Identify duplicate pages'** tab and then applying the output to a document on **'Review redactions'**.
+
+### Duplicate detection in tabular data
+
+The app also includes functionality to find duplicate cells or rows in CSV, Excel, or Parquet files. This is particularly useful for cleaning datasets where you need to identify and remove duplicate entries.
+
+**Step 1: Upload files and configure analysis**
+
+Navigate to the 'Word or Excel/CSV files' tab and scroll down to the "Find duplicate cells in tabular data" section. Upload your tabular files (CSV, Excel, or Parquet) and configure the analysis parameters:
+
+- **Similarity threshold**: Score (0-1) to consider cells a match. 1 = perfect match
+- **Minimum word count**: Cells with fewer words than this value are ignored
+- **Do initial clean of text**: Remove URLs, HTML tags, and non-ASCII characters
+- **Remove duplicate rows**: Automatically remove duplicate rows from deduplicated files
+- **Select Excel sheet names**: Choose which sheets to analyze (for Excel files)
+- **Select text columns**: Choose which columns contain text to analyze
+
+
+
+**Step 2: Review results**
+
+After clicking "Find duplicate cells/rows", the results will be displayed in a table showing:
+- File1, Row1, File2, Row2
+- Similarity_Score
+- Text1, Text2 (the actual text content being compared)
+
+Click on any row to see more details about the duplicate match in the preview boxes below.
+
+**Step 3: Remove duplicates**
+
+The deduplicated output files will be available for download in the output box. If you have selected 'Remove duplicate rows from deduplicated files', the duplicate rows will be removed from the deduplicated files. If not, then the duplicate rows will be indicated in the output files in the column named 'duplicated', which can be TRUE or FALSE.
+
+
+
+## Export redacted document files to Adobe Acrobat
+
+Files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/export_to_adobe/).
+
+The Document Redaction app has enhanced features for working with Adobe Acrobat. You can now export suggested redactions to Adobe, import Adobe comment files into the app, and use the new `redactions_for_review.pdf` files directly in Adobe Acrobat.
+
+### Using redactions_for_review.pdf files with Adobe Acrobat
+
+The app now generates `redactions_for_review.pdf` files that contain the original PDF with redaction boxes overlaid but the original text still visible underneath. These files are specifically designed for use in Adobe Acrobat and other PDF viewers where you can:
+
+- See the suggested redactions without the text being permanently removed
+- Review redactions before finalising them
+- Use Adobe Acrobat's built-in redaction tools to modify or apply the redactions
+- Export the final redacted version directly from Adobe
+
+Simply open the `redactions_for_review.pdf` file in Adobe Acrobat to begin reviewing and modifying the suggested redactions.
+
+### Exporting comment files to Adobe Acrobat
+
+To convert suggested redactions to Adobe's xfdf comment file format, you need to have the original PDF and a review file csv in the input box at the top of the Review redactions page.
+
+
+
+Then, you can find the export to Adobe option at the bottom of the Review redactions tab. Adobe comment files will be output here.
+
+
+
+Once the input files are ready, you can click on the 'Convert review file to Adobe comment format' button. You should see a file appear in the output box with a '.xfdf' file type. To use this in Adobe, after downloading it to your computer, you should be able to double-click on it, and a pop-up box will appear asking you to find the PDF file associated with it. Select the original PDF file used for your redaction task, and the file should open in Adobe Acrobat with the suggested redactions.
+
+
+
+### Importing comment files from Adobe Acrobat
+
+The app also allows you to import .xfdf files from Adobe Acrobat. To do this, go to the same Adobe import/export area as described above at the bottom of the Review Redactions tab. In this box, you need to upload a .xfdf Adobe comment file, along with the relevant original PDF for redaction.
+
+
+
+When you click the 'convert .xfdf comment file to review_file.csv' button, the app should take you up to the top of the screen where the new review file has been created and can be downloaded.
+
+
+
+## Submit documents to the AWS Textract API service for faster OCR
+
+If enabled by your administrator, an accordion **'Submit whole document to AWS Textract API (quickest text extraction for large documents)'** appears on the **'Redact PDFs/images'** tab. This sends whole documents to Textract for fast extraction, separate from the main redaction flow. For very large documents, this can be a significant time saver in getting your OCR results, which you can then use for redaction tasks in the app.
+
+
+
+### Starting a new Textract API job
+
+To use this feature, first upload a document file [in the usual way](#upload-files-to-the-app) on the **'Redact PDFs/images'** tab. Under **'Enable AWS Textract signature detection'** (inside **'Change default redaction settings'**) you can choose whether to analyse signatures or not (with a [cost implication](#aws-textract-signature-extraction)).
+
+Then, open the section under the heading 'Submit whole document to AWS Textract API...'.
+
+
+
+Click 'Analyse document with AWS Textract API call'. After a few seconds, the job should be submitted to the AWS Textract service. The 'Job ID to check status' box should now contain an ID, and the table should gain a row with details of the new API job (it may already list previous jobs from up to seven days ago).
+
+Click the button underneath, 'Check status of Textract job and download', to see progress on the job. Processing will continue in the background until the job is ready, so it is worth periodically clicking this button to see if the outputs are ready. In testing, and as a rough estimate, it seems like this process takes about five seconds per page. However, this has not been tested with very large documents. Once ready, the '_textract.json' output should appear below.
+
+### Textract API job outputs
+
+The '_textract.json' output can be used to speed up further redaction tasks as [described previously](#cost-and-time-estimation); the 'Existing Textract output file found' flag should now be ticked.
+
+
+
+You can now easily get the '..._ocr_output.csv' redaction output based on this '_textract.json' (described in [Redaction outputs](#redaction-outputs)) by clicking on the 'Convert Textract job outputs to OCR results' button. You can then use this file, e.g. for [identifying duplicate pages](#identifying-and-redacting-duplicate-pages) or for redaction review.
+
+## Advanced OCR settings - Efficient OCR, overwrite existing OCR
+
+On the Settings tab you may see a section called 'Advanced OCR settings'. Here you may see a few options.
+
+### Efficient OCR
+
+When checked, redaction tasks will first check all pages for text that can be extracted simply without needing to analyse with the OCR model. If there are more words on the page than the value shown in 'Minimum words for efficient OCR', the app will follow this simple extraction process, and only run the more expensive OCR process for the remaining pages.
+
+The field 'Min. page-area fraction for an embedded image to force OCR' is shown when Efficient OCR is enabled. It sets the smallest fraction of the page area that a **single** embedded image must cover for that page to be sent through the full OCR step, **even if** the page already has enough extractable words for the text-only path. That way, pages with plenty of selectable text but a large image (for example a photo or embedded scan) are still analysed by OCR, which can pick up text that exists only inside the image and is not separate selectable text.
+
+Values are typically small decimals (for example `0.01` for 1% of the page area). Enter **0** to turn off this rule: routing then depends only on the minimum word count (the label in the app notes this as “word count only”). To disable Efficient OCR entirely, uncheck the 'Efficient OCR' checkbox and run your redaction task.
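+
+As a rough sketch of the routing decision described above (illustrative only, not the app's actual code), each page is sent to full OCR unless it has enough extractable words and no sufficiently large embedded image. The default thresholds match the documented `EFFICIENT_OCR_MIN_WORDS` and `EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION` settings:
+
+```python
+# Sketch of the Efficient OCR routing rule described above - illustrative only.
+
+def needs_full_ocr(extractable_words: int,
+                   largest_image_fraction: float,
+                   min_words: int = 20,
+                   min_image_fraction: float = 0.03) -> bool:
+    """Return True if the page should go through the full OCR step."""
+    if extractable_words < min_words:
+        return True                       # too little selectable text
+    if min_image_fraction > 0 and largest_image_fraction >= min_image_fraction:
+        return True                       # a large embedded image forces OCR
+    return False                          # enough text and no large image
+
+print(needs_full_ocr(extractable_words=150, largest_image_fraction=0.0))   # False
+print(needs_full_ocr(extractable_words=150, largest_image_fraction=0.25))  # True
+print(needs_full_ocr(extractable_words=3,   largest_image_fraction=0.0))   # True
+```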
+
+**Note:** *AWS Textract* with *Extract signatures* still analyses **all** pages and disables Efficient OCR for that workflow.
+
+### Overwrite existing OCR results
+
+When checked, redaction tasks will overwrite any existing OCR results with the new results. This is useful if you have already run a redaction task and you want to re-run it with a different method or settings. To disable this feature, simply uncheck the 'Overwrite existing OCR results' checkbox and run your redaction task.
+
+### High-quality Textract OCR
+
+This option may not be visible unless your system administrator has enabled it. When checked, redaction tasks will use a hybrid approach to OCR. This first uses the relatively efficient Textract OCR process to extract text, and then uses the more expensive Vision Language Model (VLM) OCR process to extract any text that was missed. This is useful if you have a document that is a mix of text and images and you want to ensure that all text is extracted. To disable this feature, simply uncheck the 'High quality Textract OCR' checkbox and run your redaction task.
+
+### Save page OCR visualisations
+When checked, redaction tasks will save page OCR visualisations to the output folder. This is useful for debugging and to see the OCR results for each page. The visualisations will be saved as '_page_ocr_visualisations.png' files in the output folder. To disable this feature, simply uncheck the 'Save page OCR visualisations' checkbox and run your redaction task.
+
+## Modifying existing redaction review files
+You can find the folder containing the files discussed in this section [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/).
+
+As well as serving as an input to the document redaction app's review function, the 'review_file.csv' output can be modified inside or outside of the app. This gives you the flexibility to change redaction details outside of the app.
+
+If you open a 'review_file' csv output using spreadsheet software such as Microsoft Excel, you can easily modify redaction properties. Open the file '[Partnership-Agreement-Toolkit_0_0_redacted.pdf_review_file_local.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv)', and you should see a spreadsheet with just four suggested redactions (see below). The following instructions are for using Excel.
+
+
+
+The first thing we can do is remove the first row - 'et' is suggested as a person, but is obviously not a genuine instance of personal information. Right-click on the row number and select 'Delete' from the menu. Next, let's imagine that what the app identified as a 'phone number' was in fact another type of number, and we want to change the label. Simply click on the relevant label cell and change it to 'SECURITY_NUMBER'. You could also use 'Find & Select' -> 'Replace' from the top ribbon menu if you wanted to change a number of labels simultaneously.
+
+Suppose we want to change the colour of the 'email address' entry as shown on the redaction review tab of the app. The colours in a review file are based on an RGB scale with three numbers ranging from 0-255. [You can find suitable colours here](https://rgbcolorpicker.com). Using this scale, if I want my review box to be pure blue, I can change the cell value to (0,0,255).
+
+Imagine that a redaction box was slightly too small, and I didn't want to use the in-app options to change the size. In the review file csv, we can modify e.g. the ymin and ymax values for any box to increase the extent of the redaction box. For the 'email address' entry, let's decrease ymin by 5, and increase ymax by 5.
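+
+If you prefer to script these kinds of edits rather than use Excel, a short pandas sketch could look like the following. The column names ('text', 'label', 'color', 'ymin', 'ymax') and label values are assumptions based on the edits described above; check the header row and contents of your own review file before running anything like this:
+
+```python
+# Sketch of editing a review_file.csv outside the app with pandas.
+# Column names and label values are assumed from the walkthrough above -
+# verify them against your own file first.
+import pandas as pd
+
+df = pd.read_csv("Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv")
+
+# Drop a row that is not genuine personal information (e.g. the 'et' entry)
+df = df[df["text"] != "et"]
+
+# Relabel phone numbers as a different entity type
+df.loc[df["label"] == "PHONE_NUMBER", "label"] = "SECURITY_NUMBER"
+
+# Change the colour of email address boxes to pure blue (RGB 0,0,255)
+df.loc[df["label"] == "EMAIL_ADDRESS", "color"] = "(0,0,255)"
+
+# Enlarge the email address boxes slightly in the vertical direction
+mask = df["label"] == "EMAIL_ADDRESS"
+df.loc[mask, "ymin"] -= 5
+df.loc[mask, "ymax"] += 5
+
+df.to_csv("Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv", index=False)
+```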
+
+I have saved an output file following the above steps as '[Partnership-Agreement-Toolkit_0_0_redacted.pdf_review_file_local_mod.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv)' in the same folder that the original was found. Let's upload this file to the app along with the original pdf to see how the redactions look now.
+
+
+
+We can see from the above that we have successfully removed a redaction box, changed labels, colours, and redaction box sizes.
+
+# Features for expert users/system administrators
+This advanced user guide covers features that require system administration access or command-line usage. These options are not enabled by default but can be configured by your system administrator, and are not available to users who are just using the graphical user interface. These features are typically used by system administrators or advanced users who need more control over the redaction process.
+
+## Using AWS Textract and Comprehend when not running in an AWS environment
+
+AWS Textract and Comprehend give much better results for text extraction and document redaction than the local model options in the app. The most secure way to access them in the Redaction app is to run the app in a secure AWS environment with relevant permissions. Alternatively, you could run the app on your own system while logged in to AWS SSO with relevant permissions.
+
+However, it is possible to access these services directly via API from outside an AWS environment by creating IAM users and access keys with relevant permissions to access AWS Textract and Comprehend services. Please check with your IT and data security teams that this approach is acceptable for your data before trying the following approaches.
+
+To do the following, in your AWS environment you will need to create a new user with permissions for "textract:AnalyzeDocument", "textract:DetectDocumentText", and "comprehend:DetectPiiEntities". Under security credentials, create new access keys - note down the access key and secret key.
+
+### Direct access by passing AWS access keys through app
+The Redaction Settings tab now has boxes for entering the AWS access key and secret key. If you paste the relevant keys into these boxes before performing redaction, you should be able to use these services in the app.
+
+### Picking up AWS access keys through an .env file
+The app can also pick up AWS access key details from a .env file, located by default at '/config/aws_config.env', or at an alternative location specified by the environment variable AWS_CONFIG_PATH. The file should contain just two lines:
+
+```
+AWS_ACCESS_KEY= your-access-key
+AWS_SECRET_KEY= your-secret-key
+```
+
+The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
+
+## Advanced OCR model options
+
+The app supports advanced OCR options that combine multiple OCR engines for improved accuracy. These options are not enabled by default but can be configured by changing the app_config.env file in your '/config' folder, or system environment variables in your system.
+
+### Available OCR models
+
+A range of local and cloud OCR models are available for text extraction. The options shown in the app depend on which environment variables are enabled in your `app_config.env` file (see [Enabling advanced OCR options](#enabling-advanced-ocr-options) below). Once enabled, the models appear under **'Change default redaction settings'** on the **'Redact PDFs/images'** tab.
+
+
+
+- **Tesseract** (`tesseract`, default, requires `SHOW_LOCAL_OCR_MODEL_OPTIONS=True`): Best for documents with clear, well-formatted text. Provides a good balance of speed and accuracy with precise word-level bounding boxes, but struggles with handwriting or noisy/scanned documents.
+- **PaddleOCR** (`paddle`, requires `SHOW_PADDLE_MODEL_OPTIONS=True`): More powerful than Tesseract and handles unclear typed text on scanned documents reasonably well, but is slower. Word-level bounding boxes are estimated from line-level output so may be less precise.
+- **VLM** (`vlm`, requires `SHOW_VLM_MODEL_OPTIONS=True`): Uses a Vision Language Model locally (recommended: Qwen 3.5, configurable via the `SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL` environment variable). Excellent for difficult handwriting and noisy documents, but significantly slower than the options above. Inference runs via the `transformers` package and can be quantised with `bitsandbytes` if `QUANTISE_VLM_MODELS=True`; for faster inference, use an [inference server](#inference-server-options) instead.
+- **Inference server** (`inference-server`, requires `SHOW_INFERENCE_SERVER_VLM_OPTIONS=True`): Uses an OpenAI-compatible API endpoint such as [llama-cpp (llama-server)](https://github.com/ggml-org/llama.cpp) or [vLLM](https://docs.vllm.ai/en/stable). Produces results comparable to the in-app VLM but is much faster, and supports GGUF or AWQ/GPTQ quantised models. Requires a separately configured server. See [below for details](#inference-server-options).
+- **AWS Bedrock VLM** (`AWS Bedrock VLM OCR - all PDF types`, requires `SHOW_BEDROCK_VLM_MODELS=True`): Cloud-based OCR using a Bedrock vision model (e.g. Qwen VL). Extracts text and optionally detects people and signatures from PDFs and images without a local GPU. Requires AWS credentials and Bedrock model access, and is subject to Bedrock quotas and pricing.
+
+#### Hybrid options
+
+Hybrid models (enabled with `SHOW_HYBRID_MODELS=True`) use PaddleOCR first to identify bounding boxes and extract text, then pass low-confidence regions to a more powerful model for correction. **Note:** In practice, results are not significantly better than using Paddle or VLM/inference server analysis alone (particularly with Qwen 3 VL), but the options are provided for comparison.
+
+- **Hybrid-paddle-vlm** (`hybrid-paddle-vlm`): Combines PaddleOCR's bounding box detection with an in-app VLM for low-confidence regions. PaddleOCR's stronger bounding box identification makes this the most practical hybrid option, provided both Paddle and the VLM model can run in the same environment.
+- **Hybrid-paddle-inference-server** (`hybrid-paddle-inference-server`): The same as above but uses an inference server instead of an in-app VLM, allowing the use of GGUF or AWQ/GPTQ quantised models via llama.cpp or vLLM. See [below for details](#inference-server-options).
+
+### Enabling advanced OCR options
+
+To enable these options, you need to change your system environment variables, or modify the app_config.env file in your '/config' folder and set the following environment variables:
+
+**Basic OCR model selection:**
+```
+SHOW_LOCAL_OCR_MODEL_OPTIONS=True
+```
+
+**To enable PaddleOCR options (paddle):**
+```
+SHOW_PADDLE_MODEL_OPTIONS=True
+```
+
+**To enable Vision Language Model options (vlm):**
+```
+SHOW_VLM_MODEL_OPTIONS=True
+```
+
+**To enable AWS Bedrock VLM OCR (cloud-based VLM text extraction):**
+```
+SHOW_BEDROCK_VLM_MODELS=True
+```
+
+**To enable Inference Server options (inference-server):**
+```
+SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
+```
+
+**To enable Hybrid OCR options (hybrid-paddle-vlm, hybrid-paddle-inference-server):**
+```
+SHOW_HYBRID_MODELS=True
+```
+
+See the [app settings documentation](https://seanpedrick-case.github.io/doc_redaction/src/app_settings.html), or below, for more details on these options. Once enabled, when running the app you will see a "Change default local OCR model" (or text extraction method) section in the redaction settings, where you can choose between the available models based on what has been enabled, including Bedrock VLM analysis when configured.
+
+### OCR configuration parameters
+
+The following parameters can be configured to fine-tune OCR behaviour. Set them in your `app_config.env` file or as system environment variables. See the [app settings documentation](https://seanpedrick-case.github.io/doc_redaction/src/app_settings.html) for a full reference.
+
+#### General OCR settings
+
+- **DEFAULT_LOCAL_OCR_MODEL** (default: `"tesseract"`): Sets the default OCR engine without requiring a UI selection. Valid values: `"tesseract"`, `"paddle"`, `"vlm"`, `"inference-server"`, `"hybrid-paddle-vlm"`, `"hybrid-paddle-inference-server"`.
+- **SHOW_LOCAL_OCR_MODEL_OPTIONS** (default: False): If enabled, users can select the local OCR model from the UI.
+- **SHOW_OCR_GUI_OPTIONS** (default: True): If enabled, OCR-related options (e.g. model selection, Paddle options) are shown in the UI.
+- **EFFICIENT_OCR** (default: False): If enabled, uses a two-step process for PDFs — tries selectable text extraction per page first and only runs OCR on pages where no text could be extracted. Saves time and cost.
+- **EFFICIENT_OCR_MIN_WORDS** (default: 20): Minimum extractable words on a page to use the text-only route when `EFFICIENT_OCR` is enabled; pages below this threshold go through OCR.
+- **EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION** (default: 0.03): When `EFFICIENT_OCR` is enabled, the minimum fraction of the page area a single embedded image must cover for that page to be routed through OCR regardless of word count. Set to `0` to rely on word count only.
+- **MERGE_BOUNDING_BOXES** (default: True): When enabled, merges nearby bounding boxes in OCR/Textract results (reconstruction, grouping, and horizontal merge).
+- **MODEL_CACHE_PATH** (default: `"./model_cache"`): Directory where OCR models are cached locally.
+- **OVERWRITE_EXISTING_OCR_RESULTS** (default: False): If enabled, always runs OCR fresh instead of loading from existing JSON result files.
+- **HF_TOKEN** (default: `''`): Hugging Face token. Required for downloading gated or private VLM models from the Hub.
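+
+For example, a minimal `app_config.env` snippet that enables Efficient OCR with the documented defaults and keeps Tesseract as the default engine might look like this (values shown are illustrative):
+
+```
+DEFAULT_LOCAL_OCR_MODEL=tesseract
+SHOW_LOCAL_OCR_MODEL_OPTIONS=True
+EFFICIENT_OCR=True
+EFFICIENT_OCR_MIN_WORDS=20
+EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION=0.03
+OVERWRITE_EXISTING_OCR_RESULTS=False
+```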
+
+#### Tesseract settings
+
+- **TESSERACT_SEGMENTATION_LEVEL** (default: 11): Tesseract PSM (Page Segmentation Mode) level. Valid values are 0–13. Higher values provide more detailed segmentation but may be slower.
+- **TESSERACT_WORD_LEVEL_OCR** (default: True): If enabled, uses Tesseract word-level OCR rather than line-level.
+- **TESSERACT_MAX_WORKERS** (default: 4): Maximum number of worker threads for running Tesseract on multiple pages in parallel. Keep lower than `MAX_WORKERS` to avoid saturating CPU/RAM.
+- **TESSERACT_FOLDER** (default: `''`): Path to the local Tesseract installation folder. On Windows, install Tesseract 5.5.0 from [UB-Mannheim/tesseract](https://github.com/UB-Mannheim/tesseract/wiki) and point this variable at the folder (e.g. `tesseract/`).
+- **TESSERACT_DATA_FOLDER** (default: `"/usr/share/tessdata"`): Path to the Tesseract trained data files (`tessdata`).
+
+#### PaddleOCR settings
+
+- **SHOW_PADDLE_MODEL_OPTIONS** (default: False): If enabled, PaddleOCR options will be shown in the UI.
+- **PADDLE_USE_TEXTLINE_ORIENTATION** (default: False): If enabled, PaddleOCR will detect and correct text line orientation.
+- **PADDLE_DET_DB_UNCLIP_RATIO** (default: 1.2): Controls the expansion ratio of detected text regions. Higher values expand the detection area more.
+- **CONVERT_LINE_TO_WORD_LEVEL** (default: False): If enabled, converts PaddleOCR line-level results to approximate word-level bounding boxes for better precision.
+- **LOAD_PADDLE_AT_STARTUP** (default: False): If enabled, loads the PaddleOCR model when the application starts, reducing latency for first use but increasing startup time.
+- **PADDLE_MAX_WORKERS** (default: 2): Maximum number of worker threads for running PaddleOCR on multiple pages in parallel. Paddle is often GPU-bound; keep this low (e.g. 1–2) to avoid VRAM contention.
+- **PADDLE_MODEL_PATH** (default: `''`): Custom directory for PaddleOCR model storage, useful for environments like AWS Lambda.
+- **PADDLE_FONT_PATH** (default: `''`): Custom font path for PaddleOCR. If empty, the app uses system fonts.
+- **POPPLER_FOLDER** (default: `''`): Path to the local Poppler `bin` folder, required for PDF-to-image conversion. On Windows, install from [oschwartz10612/poppler-windows](https://github.com/oschwartz10612/poppler-windows) and point this at the `bin` folder (e.g. `poppler/poppler-24.02.0/Library/bin/`).
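+
+For example, to use PaddleOCR with word-level bounding boxes on a Windows machine with a local Poppler install (paths are illustrative):
+
+```
+DEFAULT_LOCAL_OCR_MODEL=paddle
+SHOW_PADDLE_MODEL_OPTIONS=True
+CONVERT_LINE_TO_WORD_LEVEL=True
+PADDLE_MAX_WORKERS=2
+POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/
+```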
+
+#### Image preprocessing
+
+- **PREPROCESS_LOCAL_OCR_IMAGES** (default: True): If enabled, images are preprocessed before local OCR. Testing has shown this doesn't always improve results and can slow processing — consider setting to `False` if speed is a priority.
+- **SAVE_PREPROCESS_IMAGES** (default: False): If enabled, saves the preprocessed images for debugging.
+- **SAVE_PAGE_OCR_VISUALISATIONS** (default: False): If enabled, saves images with detected bounding boxes overlaid on page images for debugging.
+- **INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES** (default: False): If enabled, includes OCR visualisation files (created when `SAVE_PAGE_OCR_VISUALISATIONS` is True) in the final output file list shown in the Gradio interface.
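+
+When debugging poor OCR results, a snippet like the following saves the preprocessed images and bounding-box overlays alongside the other outputs (illustrative; disable again once finished, as it slows processing):
+
+```
+PREPROCESS_LOCAL_OCR_IMAGES=True
+SAVE_PREPROCESS_IMAGES=True
+SAVE_PAGE_OCR_VISUALISATIONS=True
+INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
+```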
+
+#### Vision Language Model (VLM) settings
+
+- **SHOW_VLM_MODEL_OPTIONS** (default: False): If enabled, VLM OCR options will be shown in the UI.
+- **SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL** (default: `"Qwen3-VL-8B-Instruct"`): The local transformers VLM model to use for OCR. Available options: `"Nanonets-OCR2-3B"`, `"Dots.OCR"`, `"Qwen3-VL-2B-Instruct"`, `"Qwen3-VL-4B-Instruct"`, `"Qwen3-VL-8B-Instruct"`, `"Qwen3-VL-30B-A3B-Instruct"`, `"Qwen3-VL-235B-A22B-Instruct"`, `"PaddleOCR-VL"`. Generally the Qwen 3 VL range offers the best accuracy.
+- **OVERRIDE_VLM_REPO_ID** (default: `''`): If non-empty, overrides the Hugging Face repo ID or local path used for the selected VLM. Useful for pointing at a custom checkpoint or local folder.
+- **LOAD_TRANSFORMERS_VLM_MODEL_AT_START** (default: True): If enabled, loads the VLM weights at application startup. If disabled, weights load on the first VLM OCR call — reducing startup memory at the cost of a slower first run.
+- **QUANTISE_VLM_MODELS** (default: False): If enabled, quantises VLM models to 4-bit using `bitsandbytes` to reduce VRAM requirements.
+- **MAX_NEW_TOKENS** (default: 4096): Maximum number of tokens the VLM may generate per response.
+- **MAX_INPUT_TOKEN_LENGTH** (default: 8192): Maximum number of tokens that can be input to the VLM.
+- **VLM_MAX_IMAGE_SIZE** (default: 819200): Upper bound on total pixels (width × height) for images sent to the VLM. Larger images are downscaled while preserving aspect ratio.
+- **VLM_MIN_IMAGE_SIZE** (default: 614400): Minimum total pixels for full-page VLM OCR. Smaller pages are upscaled.
+- **VLM_HYBRID_MIN_IMAGE_SIZE** (default: 153600): Minimum total pixels for hybrid VLM line/crop passes. Smaller crops are upscaled.
+- **VLM_MIN_DPI** (default: 200.0): Minimum effective DPI after image preparation. Images below this DPI are upscaled.
+- **VLM_MAX_DPI** (default: 300.0): Maximum effective DPI after image preparation. High-DPI scans are downscaled accordingly.
+- **VLM_MAX_ASPECT_RATIO** (default: 10.0): Upper bound on image aspect ratio sent to the VLM. Very long/thin crops (e.g. hybrid line regions) are white-padded to stay within this limit.
+- **USE_FLASH_ATTENTION** (default: False): If enabled, uses flash attention for improved VLM performance. Not available on Windows.
+- **VLM_DISABLE_QWEN3_5_THINKING** (default: False): If enabled, disables the Qwen3.5 "thinking" chain for local transformers VLM calls, making responses faster by skipping the reasoning step.
+- **MAX_SPACES_GPU_RUN_TIME** (default: 60): Maximum seconds to run GPU operations on Hugging Face Spaces.
+- **ADD_VLM_BOUNDING_BOX_RULES** (default: False): If enabled, adds bounding box rules to the VLM prompt (e.g. coordinate format constraints for OCR output).
+- **REPORT_VLM_OUTPUTS_TO_GUI** (default: False): If enabled, reports VLM outputs to the GUI with info boxes as they are processed — useful for monitoring long OCR jobs.
+- **SAVE_VLM_INPUT_IMAGES** (default: False): If enabled, saves input images sent to the VLM for debugging.
+- **VLM_SEED** (default: `''`): Random seed for VLM generation. If empty, generation is non-deterministic.
+- **VLM generation parameters** (`VLM_DEFAULT_TEMPERATURE`, `VLM_DEFAULT_TOP_P`, `VLM_DEFAULT_MIN_P`, `VLM_DEFAULT_TOP_K`, `VLM_DEFAULT_REPETITION_PENALTY`, `VLM_DEFAULT_DO_SAMPLE`, `VLM_DEFAULT_PRESENCE_PENALTY`): Control sampling behaviour for VLM generation. All default to `''` (model-specific defaults are used when empty).
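+
+For example, to run a smaller quantised Qwen model locally with the weights loaded lazily on first use (illustrative values; the defaults listed above apply to anything omitted):
+
+```
+SHOW_VLM_MODEL_OPTIONS=True
+SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL=Qwen3-VL-4B-Instruct
+QUANTISE_VLM_MODELS=True
+LOAD_TRANSFORMERS_VLM_MODEL_AT_START=False
+```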
+
+#### Inference server settings
+
+- **SHOW_INFERENCE_SERVER_VLM_OPTIONS** (default: False): If enabled, inference server OCR options will be shown in the UI.
+- **INFERENCE_SERVER_API_URL** (default: `"http://localhost:8080"`): Base URL of the inference server API for remote VLM OCR processing.
+- **INFERENCE_SERVER_MODEL_NAME** (default: `''`): Optional model name to send in inference server API requests. If empty, the server's default model is used.
+- **DEFAULT_INFERENCE_SERVER_VLM_MODEL** (default: `"qwen_3_vl_30b_a3b_it"`): Default model name for inference server VLM OCR calls.
+- **INFERENCE_SERVER_TIMEOUT** (default: 300): Timeout in seconds for inference server API requests.
+- **INFERENCE_SERVER_DISABLE_THINKING** (default: False): If enabled, disables chain-of-thought "thinking" for inference server VLM calls (adds `{"enable_thinking": false}` to request payloads). This is useful when running a Qwen3.5 reasoning model where thinking tokens are unnecessary, such as in hybrid line-crop OCR: it eliminates the overhead and avoids needing a large `HYBRID_OCR_MAX_NEW_TOKENS` budget.
+- **SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS** (default: False): If enabled, allows users to select from available inference server VLM models in the UI.
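+
+For example, to point the app at an inference server running locally (the URL and model name are placeholders for your own setup):
+
+```
+SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
+INFERENCE_SERVER_API_URL=http://localhost:8080
+DEFAULT_INFERENCE_SERVER_VLM_MODEL=qwen_3_vl_30b_a3b_it
+INFERENCE_SERVER_TIMEOUT=300
+```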
+
+#### Hybrid OCR settings
+
+- **SHOW_HYBRID_MODELS** (default: False): If enabled, hybrid OCR options (`hybrid-paddle-vlm`, `hybrid-paddle-inference-server`) will be shown in the UI.
+- **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 95): Confidence score below which the secondary OCR engine (VLM or inference server) will be used for re-extraction. Lower values mean more text will be re-extracted by the secondary model.
+- **HYBRID_OCR_PADDING** (default: 1): Padding in pixels added to word bounding boxes before re-extraction with the secondary engine.
+- **HYBRID_OCR_MAX_NEW_TOKENS** (default: 1024): Maximum tokens the inference server (or local VLM) may generate per hybrid line-crop OCR call. For reasoning models like Qwen3.5, thinking tokens count against this budget — increase to 2048 or higher if you see "Inference server returned no results" when using `hybrid-paddle-inference-server` with a reasoning model.
+- **HYBRID_OCR_MAX_WORDS** (default: 50): Maximum words allowed in a hybrid OCR result for a single text line. Results exceeding this are discarded as likely hallucinations, and the original OCR result is kept instead.
+- **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): If enabled, saves comparison images showing Tesseract vs. secondary engine results when using hybrid modes.
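+
+A hybrid setup that re-extracts low-confidence PaddleOCR words via an inference server might look like the following (illustrative; tune the threshold and token budget to your model):
+
+```
+SHOW_HYBRID_MODELS=True
+DEFAULT_LOCAL_OCR_MODEL=hybrid-paddle-inference-server
+HYBRID_OCR_CONFIDENCE_THRESHOLD=90
+HYBRID_OCR_MAX_NEW_TOKENS=2048
+```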
+
+### Inference server options
+
+If using a local inference server, I would suggest [llama.cpp](https://github.com/ggml-org/llama.cpp) or [vLLM](https://docs.vllm.ai/en/stable/), as they are much faster than the transformers/torch inference provided in the app through the 'vlm' OCR route or the 'Local transformers LLM' route. llama.cpp is more flexible than vLLM on low-VRAM systems, as it will automatically offload to CPU/system RAM rather than failing as vLLM tends to do.
+
+To help with running the redaction app alongside llama.cpp or vLLM, I have created Docker Compose files for both, available in the doc_redaction repo: [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) for llama.cpp and [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) for vLLM. Docker / Docker Desktop must be installed; the commands to start each server are suggested at the top of the respective file.
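+
+As a sketch (the authoritative run commands are noted at the top of each Compose file), bringing up the llama.cpp server and pointing the app at it might look like:
+
+```bash
+# Start the llama.cpp inference server defined in the Compose file
+docker compose -f docker-compose_llama.yml up -d
+
+# Then, in app_config.env, point the app at the server (default port shown in the settings above):
+# INFERENCE_SERVER_API_URL=http://localhost:8080
+```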
+
+You will need roughly 30–50 GB of disk space to run everything, depending on the model chosen in the Compose file. The vLLM server needs 24 GB of VRAM. The llama.cpp server also needs 24 GB of VRAM to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Compose file can be adjusted to fit your system; I would suggest 8 GB of VRAM as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
+
+### Identifying people and signatures with VLMs
+
+If VLM or inference server options are enabled, you can also use the VLM to identify photos of people's faces and signatures in the document, and redact them accordingly.
+
+On the 'Redaction Settings' tab, select the CUSTOM_VLM_FACES and CUSTOM_VLM_SIGNATURE entities. When you run an OCR task with the VLM or inference server, it will identify bounding boxes for photos of people's faces and for signatures, and redact them if a redaction option is selected.
+
+With **Efficient OCR** enabled, **CUSTOM_VLM_FACES** follows the same page rules as OCR (VLM only on OCR-classified pages); **CUSTOM_VLM_SIGNATURE** still scans all pages. Textract **Extract signatures** uses full-document Textract OCR and disables Efficient OCR.
+
+### PII identification with LLMs
+
+In addition to rule-based (Local) and AWS Comprehend PII detection, the app can use **Large Language Models (LLMs)** to identify and label personal information. This is useful for entity types that are context-dependent (e.g. job titles, organisation names) or when you want custom instructions (e.g. "do not redact the name of the university"). LLM-based PII can be run via **AWS Bedrock**, a **local transformers** model, or a **local inference server** (e.g. llama.cpp, vLLM).
+
+
+
+**Options (when enabled by your administrator):**
+
+- **LLM (AWS Bedrock)**: Uses a Bedrock model (e.g. Claude, Nova) for PII detection. Requires AWS credentials and Bedrock model access. No local GPU needed.
+- **Local transformers LLM**: Runs a Hugging Face transformers model on your machine for PII detection. Requires sufficient RAM/VRAM. The default model is Qwen 3.5 9B; other supported models include Qwen 3.5 (0.8B–122B), Gemma 3 (12B/27B), GPT-OSS 20B, Ministral 3 14B, and NVIDIA Nemotron 3 30B (see the [config.py file for the updated list](https://github.com/seanpedrick-case/doc_redaction/blob/main/tools/config.py)).
+- **Local inference server**: Sends text to an OpenAI-compatible API (e.g. llama.cpp, vLLM) for PII detection. You run the server separately; the app only calls the API. Note: this uses `INFERENCE_SERVER_API_URL`, the same URL endpoint used for VLM OCR. See [Inference server options](#inference-server-options) for installation, as calls to the Local inference server will use the same model as the OCR inference-server model by default.
+
+**Using LLM PII detection**
+
+On the **'Redact PDFs/images'** tab, under **'Redaction settings'**, choose the desired **PII identification** method. You can then:
+
+- Select which **LLM entities** to detect (e.g. NAME, EMAIL_ADDRESS, PHONE_NUMBER, ADDRESS, CUSTOM). Custom entity types can also be added directly in the dropdown box.
+- Optionally add **custom instructions** to guide the LLM — for example: `"Do not redact company names"` or `"Redact all organisation names with the label ORGANISATION"`. When custom instructions are provided, the app can optionally use a more capable model (see `CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE` below).
+
+The LLM processes text page by page. Results include entity type, text span, and confidence score, which are then used to place redaction boxes in the same way as the local or Comprehend methods.
+
+**Enabling LLM PII options**
+
+Visibility of these methods is controlled by environment variables in `app_config.env`:
+
+- **SHOW_AWS_BEDROCK_LLM_MODELS** (default: False): Show "LLM (AWS Bedrock)" in the PII identification dropdown.
+- **SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS** (default: False): Show "Local transformers LLM".
+- **SHOW_INFERENCE_SERVER_PII_OPTIONS** (default: False): Show "Local inference server".
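+
+For example, to expose the Bedrock and inference-server PII methods while keeping the local transformers option hidden (illustrative):
+
+```
+SHOW_AWS_BEDROCK_LLM_MODELS=True
+SHOW_INFERENCE_SERVER_PII_OPTIONS=True
+SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS=False
+```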
+
+#### LLM PII configuration variables
+
+##### Model selection
+
+- **CLOUD_LLM_PII_MODEL_CHOICE** (default: `"amazon.nova-pro-v1:0"`): Default Bedrock (or cloud) model ID for LLM-based PII detection.
+- **CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE** (default: `"amazon.nova-pro-v1:0"`): If non-empty, overrides `CLOUD_LLM_PII_MODEL_CHOICE` when custom instructions are provided. Leave empty to always use `CLOUD_LLM_PII_MODEL_CHOICE`.
+- **LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE** (default: `"Qwen 3.5 9B"`): The local transformers model for PII detection. Supported values: `"Qwen 3.5 0.8B"`, `"Qwen 3.5 2B"`, `"Qwen 3.5 4B"`, `"Qwen 3.5 9B"`, `"Qwen 3.5 27B"`, `"Qwen 3.5 35B-A3B"`, `"Qwen 3.5 122B-A10B"`, `"Gemma 3 12B"`, `"Gemma 3 27B"`, `"GPT-OSS 20B"`, `"Ministral 3 14B Instruct"`, `"NVIDIA Nemotron 3 Nano 30B A3B NVFP4"`.
+- **USE_TRANSFORMERS_VLM_MODEL_AS_LLM** (default: False): If enabled, reuses the already-loaded VLM model (`SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL`) for LLM PII detection instead of downloading a separate model. The VLM weights must already be loaded (e.g. from a prior VLM OCR run, or `LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True`).
+- **LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START** (default: False): If enabled, loads the local transformers PII model at application startup to reduce latency on first use.
+- **CHOSEN_LLM_PII_INFERENCE_METHOD** (default: `"aws-bedrock"`): Sets the default inference method programmatically. Options: `"aws-bedrock"`, `"local"`, `"inference-server"`, `"azure-openai"`, `"gemini"`.
+- **DEFAULT_INFERENCE_SERVER_PII_MODEL** (default: `"gemma_3_12b"`): Default model name for inference-server LLM PII detection calls.
+- **INFERENCE_SERVER_API_URL** (default: `"http://localhost:8080"`): Base URL for the inference server used for LLM PII detection. This is the same `INFERENCE_SERVER_API_URL` endpoint used for VLM OCR (see the inference server settings above).
+- **DEFAULT_PII_DETECTION_MODEL**: The PII detection method pre-selected in the UI on startup. Automatically defaults to AWS Comprehend if `SHOW_AWS_PII_DETECTION_OPTIONS` is enabled, otherwise defaults to the local model.
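+
+For example, to default to local transformers PII detection with a mid-sized Qwen model loaded at startup (illustrative; quote values containing spaces if your environment requires it):
+
+```
+CHOSEN_LLM_PII_INFERENCE_METHOD=local
+LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE="Qwen 3.5 9B"
+LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START=True
+```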
+
+##### Entity configuration
+
+- **CHOSEN_LLM_ENTITIES**: Default entity types pre-selected for LLM detection (e.g. `EMAIL_ADDRESS`, `ADDRESS`, `NAME`, `PHONE_NUMBER`, `CUSTOM`). Configurable as a comma-separated list.
+- **FULL_LLM_ENTITY_LIST**: Full list of entity types available in the LLM entity selection dropdown.
+
+##### Generation parameters
+
+- **LLM_TEMPERATURE** (default: 0.1): Sampling temperature for LLM generation. Lower values produce more deterministic output — recommended for PII detection.
+- **LLM_MAX_NEW_TOKENS** (default: 8192): Maximum tokens the LLM may generate per PII detection call.
+- **LLM_SEED** (default: 42): Random seed for reproducible results.
+- **LLM_CONTEXT_LENGTH** (default: 32768): Maximum context length for the local transformers LLM.
+- **REASONING_SUFFIX**: Suffix appended to prompts for reasoning-capable models to control chain-of-thought behaviour. Examples: `"/nothink"` for Qwen 3.5 (disables thinking), `"Reasoning: low"` for GPT-OSS 20B, `""` for models without reasoning modes. Set automatically based on `LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE`.
+- **MULTIMODAL_PROMPT_FORMAT** (default: False): If enabled, uses a multimodal prompt format. Auto-set to True for some model choices (e.g. Gemma 3 12B/27B).
+- **PRINT_TRANSFORMERS_USER_PROMPT** (default: False): If enabled, prints the system and user prompts to the console when calling the local transformers LLM. Useful for debugging; avoid in production if logs are sensitive.
+- **Additional LLM generation parameters**: `LLM_TOP_K` (default: 64), `LLM_MIN_P` (default: 0), `LLM_TOP_P` (default: 0.95), `LLM_REPETITION_PENALTY` (default: 1.0), `LLM_STREAM` (default: True), `LLM_RESET` (default: False), `SPECULATIVE_DECODING` (default: False), `ASSISTANT_MODEL` (empty), `LLM_MODEL_DTYPE` (default: `"bfloat16"`).
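+
+For example, a conservative, reproducible setup for PII detection might mirror the documented defaults (illustrative):
+
+```
+LLM_TEMPERATURE=0.1
+LLM_SEED=42
+LLM_MAX_NEW_TOKENS=8192
+LLM_CONTEXT_LENGTH=32768
+```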
+
+## Command Line Interface (CLI)
+
+The app includes a comprehensive command-line interface (`cli_redact.py`) that allows you to perform redaction, deduplication, AWS Textract batch operations, and document summarisation directly from the terminal. This is particularly useful for batch processing, automation, and integration with other systems.
+
+### Getting started with the CLI
+
+To use the CLI, you need to:
+
+1. Open a terminal window
+2. Navigate to the app folder containing `cli_redact.py`
+3. Activate your virtual environment (conda or venv)
+4. Run commands using `python cli_redact.py` followed by your options
+
+### Basic CLI syntax
+
+```bash
+python cli_redact.py --task [redact|deduplicate|textract|summarise] --input_file [file_path] [additional_options]
+```
+
+The default task is `redact` if `--task` is omitted.
+
+### Redaction examples
+
+**Basic PDF redaction with default settings:**
+```bash
+python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+```
+
+**Text extraction only (no PII detection), with whole-page redaction of listed pages:**
+```bash
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None
+```
+
+**Redact with custom entities and allow list:**
+```bash
+python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME
+```
+
+**Redact with fuzzy matching and custom deny list:**
+```bash
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --fuzzy_mistakes 3
+```
+
+**Redact with AWS services:**
+```bash
+python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend"
+```
+
+**Redact specific pages with signature extraction:**
+```bash
+python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
+```
+
+**Redact with LLM PII (entity subset and custom instructions):**
+When your deployment uses an LLM-based PII method (e.g. via config/defaults), you can pass LLM entities and instructions:
+```bash
+python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --llm_redact_entities NAME EMAIL_ADDRESS PHONE_NUMBER ADDRESS CUSTOM --custom_llm_instructions "Do not redact the name of the university."
+```
+
+### Tabular data redaction
+
+**Anonymise CSV file with specific columns:**
+```bash
+python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted
+```
+
+**Anonymise Excel file:**
+```bash
+python cli_redact.py --input_file example_data/combined_case_notes.xlsx --text_columns "Case Note" "Client" --excel_sheets combined_case_notes --anon_strategy redact
+```
+
+**Anonymise Word document:**
+```bash
+python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted
+```
+
+### Duplicate detection
+
+**Find duplicate pages in OCR files:**
+```bash
+python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
+```
+
+**Find duplicates at line level:**
+```bash
+python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3
+```
+
+**Find duplicate rows in tabular data:**
+```bash
+python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95
+```
+
+### AWS Textract operations
+
+**Submit document for analysis:**
+```bash
+python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+```
+
+**Submit with signature extraction:**
+```bash
+python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures
+```
+
+**Retrieve results by job ID:**
+```bash
+python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012
+```
+
+**List recent jobs:**
+```bash
+python cli_redact.py --task textract --textract_action list
+```
+
+### Document summarisation
+
+**Summarise OCR output CSV(s) with AWS Bedrock:**
+```bash
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "LLM (AWS Bedrock)"
+```
+
+**Summarise with local LLM and detailed format:**
+```bash
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "Local transformers LLM" --summarisation_format detailed
+```
+
+**Summarise with context and extra instructions (concise format):**
+```bash
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_context "This is a partnership agreement" --summarisation_additional_instructions "Focus on key obligations and termination clauses" --summarisation_format concise
+```
+
+**Summarise multiple OCR CSV files:**
+```bash
+python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv --summarisation_inference_method "LLM (AWS Bedrock)"
+```
+
+### Common CLI options
+
+#### General options
+
+- `--task`: Task to perform: "redact", "deduplicate", "textract", or "summarise" (default: redact)
+- `--input_file`: Path to input file(s); multiple files separated by spaces
+- `--output_dir`: Directory for output files (default: output/)
+- `--input_dir`: Directory for input files (default: input/)
+- `--language`: Language of document content (e.g. "en", "es", "fr")
+- `--username`: Username for session tracking
+- `--save_to_user_folders`: Save outputs under username-based subfolders (True/False)
+- `--allow_list`: Path to CSV of terms to exclude from redaction (default from config)
+- `--pii_detector`: PII detection method: "Local", "AWS Comprehend", or "None"
+- `--local_redact_entities`: Local entities to redact (space-separated list)
+- `--aws_redact_entities`: AWS Comprehend entities to redact (space-separated list)
+- `--aws_access_key` / `--aws_secret_key`: AWS credentials for cloud services
+- `--aws_region`: AWS region for cloud services
+- `--s3_bucket`: S3 bucket name for cloud operations
+- `--cost_code`: Cost code for tracking usage
+- `--save_outputs_to_s3`: Upload output files to S3 after processing (True/False)
+- `--s3_outputs_folder`: S3 key prefix for output files
+- `--s3_outputs_bucket`: S3 bucket for outputs (defaults to --s3_bucket if not set)
+- `--do_initial_clean`: Perform initial text cleaning for tabular data (True/False)
+- `--save_logs_to_csv`: Save processing logs to CSV (True/False)
+- `--save_logs_to_dynamodb`: Save processing logs to DynamoDB (True/False)
+- `--display_file_names_in_logs`: Include file names in log output (True/False)
+- `--upload_logs_to_s3`: Upload log files to S3 after processing (True/False)
+- `--s3_logs_prefix`: S3 prefix for usage log files
+- `--feedback_logs_folder`: Directory for feedback log files
+- `--access_logs_folder`: Directory for access log files
+- `--usage_logs_folder`: Directory for usage log files
+- `--paddle_model_path`: Directory for PaddleOCR model storage
+- `--spacy_model_path`: Directory for spaCy model storage
+
+#### PDF/Image redaction options
+
+- `--ocr_method`: Text extraction method: "AWS Textract", "Local OCR", or "Local text"
+- `--chosen_local_ocr_model`: Local OCR model (e.g. "tesseract", "paddle", "vlm", "inference-server", "hybrid-paddle-vlm", "hybrid-paddle-inference-server")
+- `--page_min` / `--page_max`: Page range to process (0 for page_max means all pages)
+- `--images_dpi`: DPI for image processing (default: 300.0)
+- `--preprocess_local_ocr_images`: Preprocess images before OCR (True/False)
+- `--compress_redacted_pdf`: Compress the final redacted PDF (True/False)
+- `--return_pdf_end_of_redaction`: Return PDF at end of redaction (True/False)
+- `--allow_list_file` / `--deny_list_file`: Paths to custom allow/deny list CSV files
+- `--redact_whole_page_file`: Path to CSV listing pages to redact completely
+- `--handwrite_signature_extraction`: Textract options ("Extract handwriting", "Extract signatures")
+- `--extract_forms`: Extract forms during Textract analysis (flag)
+- `--extract_tables`: Extract tables during Textract analysis (flag)
+- `--extract_layout`: Extract layout during Textract analysis (flag)
+- `--vlm_model_choice`: VLM model for OCR (e.g. Bedrock model ID when using cloud VLM)
+- `--inference_server_vlm_model`: Inference server VLM model name for OCR
+- `--inference_server_api_url`: Inference server API URL for VLM OCR
+- `--gemini_api_key`: Google Gemini API key for VLM OCR
+- `--azure_openai_api_key`: Azure OpenAI API key for VLM OCR
+- `--azure_openai_endpoint`: Azure OpenAI endpoint URL for VLM OCR
+- `--efficient_ocr`: Use efficient OCR: try selectable text first per page, run OCR only when needed (flag)
+- `--no_efficient_ocr`: Disable efficient OCR (flag)
+- `--efficient_ocr_min_words`: Min words on page to use text-only route; below this use OCR (default from config)
+
+#### LLM PII detection options
+
+Used when the PII method is LLM-based (e.g. via config/defaults). Model used depends on inference method.
+
+- `--llm_model_choice`: LLM model for PII (e.g. Bedrock model ID); defaults to config CLOUD_LLM_PII_MODEL_CHOICE for Bedrock
+- `--llm_inference_method`: "aws-bedrock", "local", "inference-server", "azure-openai", or "gemini"
+- `--inference_server_pii_model`: Inference server PII detection model name
+- `--llm_temperature`: Temperature for LLM PII (lower = more deterministic)
+- `--llm_max_tokens`: Max tokens in LLM response for PII detection
+- `--llm_redact_entities`: LLM entities to detect (space-separated, e.g. NAME EMAIL_ADDRESS PHONE_NUMBER ADDRESS CUSTOM)
+- `--custom_llm_instructions`: Custom instructions for LLM entity detection (e.g. "Do not redact company names")
+
+#### Tabular/Word anonymisation options
+
+- `--anon_strategy`: One of "redact", "redact completely", "replace_redacted", "entity_type", "encrypt", "hash", "replace with 'REDACTED'", "replace with ", "mask", "fake_first_name"
+- `--text_columns`: Column names to anonymise or use for deduplication (space-separated)
+- `--excel_sheets`: Excel sheet names to process (space-separated)
+- `--fuzzy_mistakes`: Allowed spelling mistakes for fuzzy matching (default: 0)
+- `--match_fuzzy_whole_phrase_bool`: Match fuzzy whole phrase (True/False)
+- `--do_initial_clean`: Initial text cleaning for tabular data (True/False)
+
+#### Duplicate detection options
+
+- `--duplicate_type`: Type of duplicate detection ("pages" for OCR files or "tabular" for CSV/Excel)
+- `--similarity_threshold`: Similarity threshold (0-1) to consider content as duplicates (default: 0.95)
+- `--min_word_count`: Minimum word count for text to be considered (default: 10)
+- `--min_consecutive_pages`: Minimum number of consecutive pages to consider as a match (default: 1)
+- `--greedy_match`: Use greedy matching strategy for consecutive pages (True/False)
+- `--combine_pages`: Combine text from same page number within a file (True/False)
+- `--remove_duplicate_rows`: Remove duplicate rows from output (True/False)
+
+#### Document summarisation options
+
+- `--summarisation_inference_method`: "LLM (AWS Bedrock)", "Local transformers LLM", or "Local inference server"
+- `--summarisation_temperature`: Temperature for summarisation (0.0–2.0; default: 0.6)
+- `--summarisation_max_pages_per_group`: Max pages per page-group summary (default: 30)
+- `--summarisation_api_key`: API key if required by the chosen LLM
+- `--summarisation_context`: Additional context (e.g. "This is a consultation response document")
+- `--summarisation_format`: "concise" (key themes) or "detailed" (default)
+- `--summarisation_additional_instructions`: Extra instructions (e.g. "Focus on key decisions and recommendations")
+
+#### Textract batch operations options
+
+- `--textract_action`: "submit", "retrieve", or "list"
+- `--job_id`: Textract job ID for retrieve action
+- `--extract_signatures`: Extract signatures during Textract analysis (flag)
+- `--textract_bucket`: S3 bucket for Textract operations
+- `--textract_input_prefix`: S3 prefix for input files in Textract operations
+- `--textract_output_prefix`: S3 prefix for output files in Textract operations
+- `--s3_textract_document_logs_subfolder`: S3 prefix for Textract job logs
+- `--local_textract_document_logs_subfolder`: Local path for Textract job logs
+- `--poll_interval`: Polling interval in seconds for job status (default: 30)
+- `--max_poll_attempts`: Max polling attempts before timeout (default: 120)
+
+### Output files
+
+The CLI generates the same output files as the GUI:
+- `...redacted.pdf`: Final redacted document
+- `...redactions_for_review.pdf`: Document with redaction boxes for review
+- `...review_file.csv`: Detailed redaction information
+- `...ocr_results.csv`: Extracted text results
+- `..._textract.json`: AWS Textract results (if applicable)
+
+For more advanced options and configuration, refer to the help text by running:
+```bash
+python cli_redact.py --help
+```
\ No newline at end of file
diff --git a/test/GITHUB_ACTIONS.md b/test/GITHUB_ACTIONS.md
new file mode 100644
index 0000000000000000000000000000000000000000..caf7ed49053f013e78c478c5cd5e85e849fe726b
--- /dev/null
+++ b/test/GITHUB_ACTIONS.md
@@ -0,0 +1,254 @@
+# GitHub Actions Integration Guide
+
+This guide explains how to use your test suite with GitHub Actions for automated CI/CD.
+
+## 🚀 Quick Start
+
+### 1. **Choose Your Workflow**
+
+I've created multiple workflow options for you:
+
+#### **Option A: Simple Test Run** (Recommended for beginners)
+```yaml
+# File: .github/workflows/simple-test.yml
+# - Basic test execution
+# - Ubuntu Latest
+# - Python 3.11
+# - Minimal setup
+```
+
+#### **Option B: Comprehensive CI/CD** (Recommended for production)
+```yaml
+# File: .github/workflows/ci.yml
+# - Full pipeline with linting, security, coverage
+# - Multiple Python versions
+# - Integration tests
+# - Package building
+```
+
+#### **Option C: Multi-OS Testing** (For cross-platform compatibility)
+```yaml
+# File: .github/workflows/multi-os-test.yml
+# - Tests on Ubuntu, Windows, macOS
+# - Multiple Python versions
+# - Cross-platform compatibility
+```
+
+### 2. **Enable GitHub Actions**
+
+1. **Push your code to GitHub**
+2. **Go to your repository → Actions tab**
+3. **Select a workflow and click "Run workflow"**
+4. **Watch the tests run automatically!**
+
+## 📋 What Each Workflow Does
+
+### **Simple Test Run** (`.github/workflows/simple-test.yml`)
+```yaml
+✅ Installs system dependencies (tesseract, poppler, OpenGL)
+✅ Installs Python dependencies from requirements.txt
+✅ Downloads spaCy model
+✅ Creates dummy test data automatically
+✅ Runs your CLI tests
+✅ Runs pytest with coverage
+```
+
+### **Comprehensive CI/CD** (`.github/workflows/ci.yml`)
+```yaml
+✅ Linting (Ruff, Black)
+✅ Unit tests (Python 3.10, 3.11, 3.12)
+✅ Integration tests
+✅ Security scanning (Safety, Bandit)
+✅ Coverage reporting
+✅ Package building (on main branch)
+✅ Artifact uploads
+```
+
+### **Multi-OS Testing** (`.github/workflows/multi-os-test.yml`)
+```yaml
+✅ Tests on Ubuntu, Windows, macOS
+✅ Python 3.10, 3.11, 3.12
+✅ Cross-platform compatibility
+✅ OS-specific dependency handling
+```
+
+## 🔧 How It Works
+
+### **Automatic Test Data Creation**
+The workflows automatically create dummy test files when your example data is missing:
+
+```python
+# .github/scripts/setup_test_data.py creates:
+- example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+- example_data/combined_case_notes.csv
+- example_data/Bold minimalist professional cover letter.docx
+- example_data/example_complaint_letter.jpg
+- example_data/test_allow_list_*.csv
+- example_data/partnership_toolkit_redact_*.csv
+- example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv
+```
+
+### **System Dependencies**
+Each OS gets the right dependencies:
+
+**Ubuntu:**
+```bash
+sudo apt-get install tesseract-ocr poppler-utils libgl1-mesa-glx
+```
+
+**macOS:**
+```bash
+brew install tesseract poppler
+```
+
+**Windows:**
+```bash
+# Handled by Python packages
+```
+
+### **Python Dependencies**
+```bash
+pip install -r requirements.txt
+pip install pytest pytest-cov reportlab pillow
+```
+
+## 🎯 Triggers
+
+### **When Tests Run:**
+- ✅ **Push to main/dev branches**
+- ✅ **Pull requests to main/dev**
+- ✅ **Daily at 2 AM UTC** (scheduled)
+- ✅ **Manual trigger** from GitHub UI
+
+### **What Happens:**
+1. **Checkout code**
+2. **Install dependencies**
+3. **Create test data**
+4. **Run tests**
+5. **Generate reports**
+6. **Upload artifacts**
+
+## 📊 Test Results
+
+### **Success Criteria:**
+- ✅ All tests pass
+- ✅ No linting errors
+- ✅ Security checks pass
+- ✅ Coverage reports generated
+
+### **Failure Handling:**
+- ✅ Tests skip gracefully if files missing
+- ✅ AWS tests expected to fail without credentials
+- ✅ System dependency failures handled with fallbacks
+
+## 🔍 Monitoring
+
+### **GitHub Actions Tab:**
+- View workflow runs
+- See test results
+- Download artifacts
+- View logs
+
+### **Artifacts Generated:**
+- `test-results.xml` - JUnit test results
+- `coverage.xml` - Coverage data
+- `htmlcov/` - HTML coverage report
+- `bandit-report.json` - Security scan results
+
+### **Coverage Reports:**
+- Uploaded to Codecov automatically
+- Available in GitHub Actions artifacts
+- HTML reports for detailed analysis
+
+## 🛠️ Customization
+
+### **Adding New Tests:**
+1. Add test methods to `test/test.py`
+2. Update `setup_test_data.py` if needed
+3. Tests run automatically in all workflows
+
+### **Modifying Workflows:**
+1. Edit the `.yml` file
+2. Test locally first
+3. Push to trigger workflow
+
+### **Environment Variables:**
+```yaml
+env:
+ PYTHON_VERSION: "3.11"
+ # Add your custom variables here
+```
+
+## 🚨 Troubleshooting
+
+### **Common Issues:**
+
+1. **"Example file not found"**
+ - ✅ **Solution**: Test data is created automatically
+ - ✅ **Check**: `setup_test_data.py` runs in workflow
+
+2. **"AWS credentials not configured"**
+ - ✅ **Expected**: AWS tests fail without credentials
+ - ✅ **Solution**: Tests are designed to handle this
+
+3. **"System dependency error"**
+ - ✅ **Check**: OS-specific installation commands
+ - ✅ **Solution**: Dependencies are installed automatically
+
+4. **"Test timeout"**
+ - ✅ **Default**: 10-minute timeout per test
+ - ✅ **Solution**: Tests are designed to be fast
+
+### **Debug Mode:**
+Add `--verbose` to pytest commands for detailed output:
+```bash
+pytest test/test.py -v --tb=short
+```
+
+## 📈 Performance
+
+### **Optimizations:**
+- ✅ **Parallel execution** where possible
+- ✅ **Dependency caching** for faster builds
+- ✅ **Minimal system packages** installed
+- ✅ **Efficient test data creation**
+
+### **Build Times:**
+- **Simple Test**: ~5-10 minutes
+- **Comprehensive CI**: ~15-20 minutes
+- **Multi-OS**: ~20-30 minutes
+
+## 🔒 Security
+
+### **Security Features:**
+- ✅ **Dependency scanning** with Safety
+- ✅ **Code scanning** with Bandit
+- ✅ **No secrets exposed** in logs
+- ✅ **Temporary test data** cleaned up
+
+### **Secrets Management:**
+- Use GitHub Secrets for sensitive data
+- Never hardcode credentials in workflows
+- Test data is dummy data only
+
+## 🎉 Success!
+
+Once set up, your GitHub Actions will:
+
+1. **Automatically test** every push and PR
+2. **Generate reports** and coverage data
+3. **Catch issues** before they reach production
+4. **Ensure compatibility** across platforms
+5. **Provide confidence** in your code quality
+
+## 📚 Next Steps
+
+1. **Choose a workflow** that fits your needs
+2. **Push to GitHub** to trigger the first run
+3. **Monitor the Actions tab** for results
+4. **Customize** as needed for your project
+5. **Enjoy** automated testing! 🎉
+
+---
+
+**Need help?** Check the workflow logs in the GitHub Actions tab for detailed error messages and troubleshooting information.
diff --git a/test/GUI_TEST_README.md b/test/GUI_TEST_README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5ce12087b8b4feeefa28f3df69790a5a9d56c953
--- /dev/null
+++ b/test/GUI_TEST_README.md
@@ -0,0 +1,111 @@
+# GUI Testing for Document Redaction App
+
+This directory contains tests specifically for verifying that the GUI application (`app.py`) loads correctly.
+
+## Test Files
+
+### `test_gui_only.py`
+A standalone script that tests only the GUI functionality. This is useful for:
+- Quick verification that the Gradio interface loads without errors
+- CI/CD pipelines where you want to test GUI separately from CLI functionality
+- Development testing when you only want to check GUI components
+
+**Usage:**
+
+Option 1 - Manual activation:
+```bash
+conda activate redaction
+cd test
+python test_gui_only.py
+```
+
+Option 2 - Using helper scripts (Windows):
+```bash
+cd test
+# For Command Prompt:
+run_gui_test.bat
+
+# For PowerShell:
+.\run_gui_test.ps1
+```
+
+### `test.py` (Updated)
+The main test suite now includes both CLI and GUI tests. The GUI tests are in the `TestGUIApp` class.
+
+**Usage:**
+
+Option 1 - Manual activation:
+```bash
+conda activate redaction
+cd test
+python test.py
+```
+
+Option 2 - Using helper scripts (Windows):
+```bash
+cd test
+# For Command Prompt:
+run_gui_test.bat
+
+# For PowerShell:
+.\run_gui_test.ps1
+```
+
+## What the GUI Tests Check
+
+1. **App Import and Initialization** (`test_app_import_and_initialization`)
+ - Verifies that `app.py` can be imported without errors
+ - Checks that the Gradio `app` object is created successfully
+ - Ensures the app is a proper Gradio Blocks instance
+
+2. **App Launch in Headless Mode** (`test_app_launch_headless`)
+ - Tests that the app can be launched without opening a browser
+ - Verifies the Gradio server starts successfully
+ - Uses threading to prevent blocking the test execution
+
+3. **Configuration Loading** (`test_app_configuration_loading`)
+ - Verifies that configuration variables are loaded correctly
+ - Checks key settings like server port, file size limits, language settings
+ - Ensures the app has access to all required configuration
+
+## Test Requirements
+
+- **Conda environment 'redaction' must be activated** before running tests
+- Python environment with all dependencies installed
+- Access to the `tools.config` module
+- Gradio and related GUI dependencies (including `gradio_image_annotation`)
+- The `app.py` file in the parent directory
+
+### Prerequisites
+
+Before running the GUI tests, ensure you have activated the conda environment:
+
+```bash
+conda activate redaction
+```
+
+The `gradio_image_annotation` package is already installed in the 'redaction' environment.
+
+## Expected Behavior
+
+- All tests should pass if the GUI loads correctly
+- Tests will fail if there are import errors, missing dependencies, or configuration issues
+- The headless launch test may take up to 10 seconds to complete
+
+## Troubleshooting
+
+If tests fail:
+1. Check that all dependencies are installed (`pip install -r requirements.txt`)
+2. Verify that `app.py` exists in the parent directory
+3. Ensure configuration files are properly set up
+4. Check for any missing environment variables or configuration issues
+
+## Integration with CI/CD
+
+These tests are designed to run in headless environments and are suitable for:
+- GitHub Actions
+- Jenkins pipelines
+- Docker containers
+- Any automated testing environment
+
+The tests do not require a display or browser to be available.
diff --git a/test/INSTALL.md b/test/INSTALL.md
new file mode 100644
index 0000000000000000000000000000000000000000..f9c5042aa2c8e2b4377cc2606856786cea4a886b
--- /dev/null
+++ b/test/INSTALL.md
@@ -0,0 +1,138 @@
+# Test Suite Installation Guide
+
+This guide explains how to install the dependencies needed to run the CLI redaction test suite.
+
+## Quick Start
+
+### Option 1: Install test dependencies with pip
+```bash
+# Install main application dependencies
+pip install -r requirements.txt
+
+# Install test dependencies
+pip install -r test/requirements.txt
+```
+
+### Option 2: Install with pyproject.toml
+```bash
+# Install with test dependencies
+pip install -e ".[test]"
+```
+
+### Option 3: Install everything at once
+```bash
+# Install main dependencies
+pip install -r requirements.txt
+
+# Install test dependencies
+pip install pytest pytest-cov pytest-html pytest-xdist
+```
+
+## Detailed Requirements
+
+### Core Dependencies (Already in your requirements.txt)
+The test suite uses your existing application dependencies:
+- All the packages in your main `requirements.txt`
+- Standard Python libraries (unittest, tempfile, shutil, os, subprocess)
+
+### Additional Test Dependencies
+
+#### Required for Testing:
+- **pytest** (>=7.0.0): Modern test framework with better discovery and reporting
+- **pytest-cov** (>=4.0.0): Coverage reporting for tests
+
+#### Optional for Enhanced Testing:
+- **pytest-html** (>=3.1.0): Generate HTML test reports
+- **pytest-xdist** (>=3.0.0): Run tests in parallel for faster execution
+
+## Installation Commands
+
+### Minimal Installation (Required)
+```bash
+pip install pytest pytest-cov
+```
+
+### Full Installation (Recommended)
+```bash
+pip install pytest pytest-cov pytest-html pytest-xdist
+```
+
+### Development Installation
+```bash
+# Install in development mode with test dependencies
+pip install -e ".[test]"
+```
+
+## Verification
+
+After installation, verify everything works:
+
+```bash
+# Check pytest is installed
+pytest --version
+
+# Run a simple test to verify the test suite works
+cd test
+python test.py
+```
+
+## Running Tests
+
+### Method 1: Using the test script (Recommended)
+```bash
+cd test
+python test.py
+```
+
+### Method 2: Using pytest
+```bash
+# Run all tests
+pytest test/test.py -v
+
+# Run with coverage
+pytest test/test.py --cov=. --cov-report=html
+
+# Run in parallel (faster)
+pytest test/test.py -n auto
+```
+
+### Method 3: Using unittest directly
+```bash
+cd test
+python -m unittest test.test.TestCLIRedactExamples -v
+```
+
+## Troubleshooting
+
+### Common Issues:
+
+1. **Missing example data files**
+ - Ensure you have the example data in `example_data/` directory
+ - Tests will skip gracefully if files are missing
+
+2. **AWS credentials not configured**
+ - AWS-related tests may fail but this is expected
+ - Tests are designed to handle missing credentials gracefully
+
+3. **Import errors**
+ - Make sure you're in the correct directory
+ - Ensure all main application dependencies are installed first
+
+4. **Permission errors**
+ - Ensure you have write permissions for temporary directories
+ - The test suite creates and cleans up temporary files automatically
+
+### Getting Help:
+
+If you encounter issues:
+1. Check that all main application dependencies are installed
+2. Verify you're running from the correct directory
+3. Ensure example data files are present
+4. Check the test output for specific error messages
+
+## Notes
+
+- The test suite is designed to be robust and will skip tests if required files are missing
+- All temporary files are automatically cleaned up
+- Tests have a 10-minute timeout to prevent hanging
+- AWS tests are expected to fail if credentials aren't configured
diff --git a/test/README.md b/test/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b1301d6447aa16a0418bdc46f3bb31e32302ce1e
--- /dev/null
+++ b/test/README.md
@@ -0,0 +1,120 @@
+# CLI Redaction Test Suite
+
+This test suite provides comprehensive testing for the `cli_redact.py` script based on all the examples shown in the CLI epilog.
+
+## Overview
+
+The test suite includes tests for:
+
+1. **PDF Redaction Examples**
+ - Default settings (local OCR)
+ - Text extraction only (no redaction)
+ - Text extraction with whole page redaction
+ - Redaction with allow lists
+ - Limited pages with custom fuzzy matching
+ - Custom deny/allow/whole page lists
+ - Image redaction
+
+2. **Tabular Anonymisation Examples**
+ - CSV anonymisation with specific columns
+ - Different anonymisation strategies
+ - Word document anonymisation
+
+3. **AWS Services Examples**
+ - Textract and Comprehend redaction
+ - Signature extraction
+ - Layout extraction
+
+4. **Duplicate Detection Examples**
+ - Duplicate pages in OCR files
+ - Line-level duplicate detection
+ - Tabular duplicate detection
+
+5. **Textract Batch Operations**
+ - Submit documents for analysis
+ - Retrieve results by job ID
+ - List recent jobs
+
+## Running the Tests
+
+### Method 1: Run the test suite directly
+```bash
+cd test
+python test.py
+```
+
+### Method 2: Use the convenience script
+```bash
+cd test
+python run_tests.py
+```
+
+### Method 3: Run with unittest
+```bash
+cd test
+python -m unittest test.test.TestCLIRedactExamples -v
+```
+
+## Test Behavior
+
+- **File Dependencies**: Tests will be skipped if required example files are not found in the `example_data/` directory
+- **AWS Tests**: AWS-related tests may fail if credentials are not configured, but this is expected
+- **Temporary Output**: All tests use temporary output directories that are cleaned up automatically
+- **Timeout**: Each test has a 10-minute timeout to prevent hanging
+
+## Test Structure
+
+The test suite uses Python's `unittest` framework with the following structure:
+
+- `TestCLIRedactExamples`: Main test class containing all test methods
+- `run_cli_redact()`: Helper function that executes the CLI script with specified parameters
+- `run_all_tests()`: Main function that runs all tests and provides a summary
+
+## Example Output
+
+```
+================================================================================
+DOCUMENT REDACTION CLI TEST SUITE
+================================================================================
+This test suite runs through all the examples from the CLI epilog.
+Tests will be skipped if required example files are not found.
+AWS-related tests may fail if credentials are not configured.
+================================================================================
+
+Test setup complete. Script: /path/to/cli_redact.py
+Example data directory: /path/to/example_data
+Temp output directory: /tmp/test_output_xyz
+
+=== Testing PDF redaction with default settings ===
+✅ PDF redaction with default settings passed
+
+=== Testing PDF text extraction only ===
+✅ PDF text extraction only passed
+
+...
+
+================================================================================
+TEST SUMMARY
+================================================================================
+Tests run: 20
+Failures: 0
+Errors: 0
+Skipped: 2
+
+Overall result: ✅ PASSED
+================================================================================
+```
+
+## Requirements
+
+- Python 3.6+
+- All dependencies for the main CLI script
+- Example data files in the `example_data/` directory (for full test coverage)
+- AWS credentials (for AWS-related tests)
+
+## Notes
+
+- Tests are designed to be robust and will skip gracefully if files are missing
+- AWS tests are marked as completed even if they fail due to missing credentials
+- The test suite provides detailed output for debugging
+- All temporary files are cleaned up automatically
diff --git a/test/demo_single_test.py b/test/demo_single_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1faa6f8b47da54405ab864b002e62c5dbf90114
--- /dev/null
+++ b/test/demo_single_test.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+Demonstration script showing how to run a single test example.
+
+This script shows how to use the run_cli_redact function directly
+to test a specific CLI example.
+"""
+
+import os
+import shutil
+import sys
+import tempfile
+
+# Add the parent directory to the path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test import run_cli_redact
+
+
+def demo_pdf_redaction():
+ """Demonstrate how to run a single PDF redaction test."""
+ print("=== Demo: PDF Redaction with Default Settings ===")
+
+ # Set up paths
+ script_path = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)), "cli_redact.py"
+ )
+ input_file = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)),
+ "example_data",
+ "example_of_emails_sent_to_a_professor_before_applying.pdf",
+ )
+ output_dir = tempfile.mkdtemp(prefix="demo_output_")
+
+ print(f"Script: {script_path}")
+ print(f"Input: {input_file}")
+ print(f"Output: {output_dir}")
+
+ # Check if files exist
+ if not os.path.isfile(script_path):
+ print(f"❌ Script not found: {script_path}")
+ return False
+
+ if not os.path.isfile(input_file):
+ print(f"❌ Input file not found: {input_file}")
+ print(
+ "Make sure you have the example data files in the example_data/ directory"
+ )
+ return False
+
+ try:
+ # Run the test
+ print("\nRunning PDF redaction with default settings...")
+ result = run_cli_redact(
+ script_path=script_path, input_file=input_file, output_dir=output_dir
+ )
+
+ if result:
+ print("✅ Test completed successfully!")
+ print(f"Check the output directory for results: {output_dir}")
+ else:
+ print("❌ Test failed!")
+
+ return result
+
+ finally:
+ # Clean up
+ if os.path.exists(output_dir):
+ shutil.rmtree(output_dir)
+ print(f"Cleaned up: {output_dir}")
+
+
+def demo_csv_anonymisation():
+ """Demonstrate how to run a CSV anonymisation test."""
+ print("\n=== Demo: CSV Anonymisation ===")
+
+ # Set up paths
+ script_path = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)), "cli_redact.py"
+ )
+ input_file = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)),
+ "example_data",
+ "combined_case_notes.csv",
+ )
+ output_dir = tempfile.mkdtemp(prefix="demo_output_")
+
+ print(f"Script: {script_path}")
+ print(f"Input: {input_file}")
+ print(f"Output: {output_dir}")
+
+ # Check if files exist
+ if not os.path.isfile(script_path):
+ print(f"❌ Script not found: {script_path}")
+ return False
+
+ if not os.path.isfile(input_file):
+ print(f"❌ Input file not found: {input_file}")
+ print(
+ "Make sure you have the example data files in the example_data/ directory"
+ )
+ return False
+
+ try:
+ # Run the test
+ print("\nRunning CSV anonymisation...")
+ result = run_cli_redact(
+ script_path=script_path,
+ input_file=input_file,
+ output_dir=output_dir,
+ text_columns=["Case Note", "Client"],
+ anon_strategy="replace_redacted",
+ )
+
+ if result:
+ print("✅ Test completed successfully!")
+ print(f"Check the output directory for results: {output_dir}")
+ else:
+ print("❌ Test failed!")
+
+ return result
+
+ finally:
+ # Clean up
+ if os.path.exists(output_dir):
+ shutil.rmtree(output_dir)
+ print(f"Cleaned up: {output_dir}")
+
+
+if __name__ == "__main__":
+ print("CLI Redaction Test Demo")
+ print("=" * 50)
+ print("This script demonstrates how to run individual tests.")
+ print("=" * 50)
+
+ # Run the demos
+ success1 = demo_pdf_redaction()
+ success2 = demo_csv_anonymisation()
+
+ print("\n" + "=" * 50)
+ print("Demo Summary")
+ print("=" * 50)
+ print(f"PDF Redaction: {'✅ PASSED' if success1 else '❌ FAILED'}")
+ print(f"CSV Anonymisation: {'✅ PASSED' if success2 else '❌ FAILED'}")
+
+ overall_success = success1 and success2
+ print(f"\nOverall: {'✅ PASSED' if overall_success else '❌ FAILED'}")
+
+ sys.exit(0 if overall_success else 1)
diff --git a/test/requirements.txt b/test/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c79d8b1cba03a63751677278a29be6951c5158c8
--- /dev/null
+++ b/test/requirements.txt
@@ -0,0 +1,13 @@
+# Test-specific dependencies for the CLI redaction test suite
+# These are in addition to the main application dependencies
+
+# Test framework
+pytest>=7.0.0
+pytest-cov>=4.0.0
+
+# Optional: For more detailed test reporting
+pytest-html>=3.1.0
+pytest-xdist>=3.0.0 # For parallel test execution
+
+# Note: The test suite uses unittest (standard library) but pytest provides
+# better test discovery and reporting capabilities
diff --git a/test/run_gui_test.bat b/test/run_gui_test.bat
new file mode 100644
index 0000000000000000000000000000000000000000..96c7121983c9a01760050bff3fe1c4efcebdfce2
--- /dev/null
+++ b/test/run_gui_test.bat
@@ -0,0 +1,26 @@
+@echo off
+REM Batch script to run GUI tests with conda environment activated
+REM This script activates the 'redaction' conda environment and runs the GUI tests
+
+echo Activating conda environment 'redaction'...
+call conda activate redaction
+
+if %errorlevel% neq 0 (
+ echo Failed to activate conda environment 'redaction'
+ echo Please ensure conda is installed and the 'redaction' environment exists
+ pause
+ exit /b 1
+)
+
+echo Running GUI tests...
+python test_gui_only.py
+
+if %errorlevel% neq 0 (
+ echo GUI tests failed
+ pause
+ exit /b 1
+) else (
+ echo GUI tests passed successfully
+)
+
+pause
diff --git a/test/run_gui_test.ps1 b/test/run_gui_test.ps1
new file mode 100644
index 0000000000000000000000000000000000000000..74e773e831436f74e7233924e3a9f27e9d5eea43
--- /dev/null
+++ b/test/run_gui_test.ps1
@@ -0,0 +1,34 @@
+# PowerShell script to run GUI tests with conda environment activated
+# This script activates the 'redaction' conda environment and runs the GUI tests
+
+Write-Host "Activating conda environment 'redaction'..." -ForegroundColor Green
+
+try {
+ # Try to activate the conda environment
+ conda activate redaction
+
+ if ($LASTEXITCODE -ne 0) {
+ Write-Host "Failed to activate conda environment 'redaction'" -ForegroundColor Red
+ Write-Host "Please ensure conda is installed and the 'redaction' environment exists" -ForegroundColor Red
+ Read-Host "Press Enter to exit"
+ exit 1
+ }
+
+ Write-Host "Running GUI tests..." -ForegroundColor Green
+ python test_gui_only.py
+
+ if ($LASTEXITCODE -ne 0) {
+ Write-Host "GUI tests failed" -ForegroundColor Red
+ Read-Host "Press Enter to exit"
+ exit 1
+ } else {
+ Write-Host "GUI tests passed successfully" -ForegroundColor Green
+ }
+
+} catch {
+ Write-Host "An error occurred: $_" -ForegroundColor Red
+ Read-Host "Press Enter to exit"
+ exit 1
+}
+
+Read-Host "Press Enter to exit"
diff --git a/test/run_tests.py b/test/run_tests.py
new file mode 100644
index 0000000000000000000000000000000000000000..7025e6ead7ff8f197a6de772ef1abd5693d9391c
--- /dev/null
+++ b/test/run_tests.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+"""
+Simple script to run the CLI redaction test suite.
+
+This script demonstrates how to run the comprehensive test suite
+that covers all the examples from the CLI epilog.
+"""
+
+import os
+import sys
+
+# Add the parent directory to the path so we can import the test module
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test import run_all_tests
+
+if __name__ == "__main__":
+ print("Starting CLI Redaction Test Suite...")
+ print("This will test all examples from the CLI epilog.")
+ print("=" * 60)
+
+ success = run_all_tests()
+
+ if success:
+ print("\n🎉 All tests passed successfully!")
+ sys.exit(0)
+ else:
+ print("\n❌ Some tests failed. Check the output above for details.")
+ sys.exit(1)
diff --git a/test/test.py b/test/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1521664c742b3c94085d77d5bdeb2780bb7fc146
--- /dev/null
+++ b/test/test.py
@@ -0,0 +1,1215 @@
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import threading
+import unittest
+from typing import List, Optional
+
+
+def run_cli_redact(
+ script_path: str,
+ input_file: str,
+ output_dir: str,
+ task: str = "redact",
+ timeout: int = 600, # 10-minute timeout
+ # --- General Arguments ---
+ input_dir: Optional[str] = None,
+ language: Optional[str] = None,
+ allow_list: Optional[str] = None,
+ pii_detector: Optional[str] = None,
+ username: Optional[str] = None,
+ save_to_user_folders: Optional[bool] = None,
+ local_redact_entities: Optional[List[str]] = None,
+ aws_redact_entities: Optional[List[str]] = None,
+ aws_access_key: Optional[str] = None,
+ aws_secret_key: Optional[str] = None,
+ cost_code: Optional[str] = None,
+ aws_region: Optional[str] = None,
+ s3_bucket: Optional[str] = None,
+ do_initial_clean: Optional[bool] = None,
+ save_logs_to_csv: Optional[bool] = None,
+ save_logs_to_dynamodb: Optional[bool] = None,
+ display_file_names_in_logs: Optional[bool] = None,
+ upload_logs_to_s3: Optional[bool] = None,
+ s3_logs_prefix: Optional[str] = None,
+ # --- PDF/Image Redaction Arguments ---
+ ocr_method: Optional[str] = None,
+ page_min: Optional[int] = None,
+ page_max: Optional[int] = None,
+ images_dpi: Optional[float] = None,
+ chosen_local_ocr_model: Optional[str] = None,
+ preprocess_local_ocr_images: Optional[bool] = None,
+ compress_redacted_pdf: Optional[bool] = None,
+ return_pdf_end_of_redaction: Optional[bool] = None,
+ deny_list_file: Optional[str] = None,
+ allow_list_file: Optional[str] = None,
+ redact_whole_page_file: Optional[str] = None,
+ handwrite_signature_extraction: Optional[List[str]] = None,
+ extract_forms: Optional[bool] = None,
+ extract_tables: Optional[bool] = None,
+ extract_layout: Optional[bool] = None,
+ # --- Word/Tabular Anonymisation Arguments ---
+ anon_strategy: Optional[str] = None,
+ text_columns: Optional[List[str]] = None,
+ excel_sheets: Optional[List[str]] = None,
+ fuzzy_mistakes: Optional[int] = None,
+ match_fuzzy_whole_phrase_bool: Optional[bool] = None,
+ # --- Duplicate Detection Arguments ---
+ duplicate_type: Optional[str] = None,
+ similarity_threshold: Optional[float] = None,
+ min_word_count: Optional[int] = None,
+ min_consecutive_pages: Optional[int] = None,
+ greedy_match: Optional[bool] = None,
+ combine_pages: Optional[bool] = None,
+ remove_duplicate_rows: Optional[bool] = None,
+ # --- Textract Batch Operations Arguments ---
+ textract_action: Optional[str] = None,
+ job_id: Optional[str] = None,
+ extract_signatures: Optional[bool] = None,
+ textract_bucket: Optional[str] = None,
+ textract_input_prefix: Optional[str] = None,
+ textract_output_prefix: Optional[str] = None,
+ s3_textract_document_logs_subfolder: Optional[str] = None,
+ local_textract_document_logs_subfolder: Optional[str] = None,
+ poll_interval: Optional[int] = None,
+ max_poll_attempts: Optional[int] = None,
+) -> bool:
+ """
+ Executes the cli_redact.py script with specified arguments using a subprocess.
+
+ Args:
+ script_path (str): The path to the cli_redact.py script.
+ input_file (str): The path to the input file to process.
+ output_dir (str): The path to the directory for output files.
+ task (str): The main task to perform ('redact', 'deduplicate', or 'textract').
+ timeout (int): Timeout in seconds for the subprocess.
+
+ # General Arguments
+ input_dir (str): Directory for all input files.
+ language (str): Language of the document content.
+ allow_list (str): Path to a CSV file with words to exclude from redaction.
+ pii_detector (str): Core PII detection method (Local, AWS Comprehend, or None).
+ username (str): Username for the session.
+ save_to_user_folders (bool): Whether to save to user folders or not.
+ local_redact_entities (List[str]): Local redaction entities to use.
+ aws_redact_entities (List[str]): AWS redaction entities to use.
+ aws_access_key (str): Your AWS Access Key ID.
+ aws_secret_key (str): Your AWS Secret Access Key.
+ cost_code (str): Cost code for tracking usage.
+ aws_region (str): AWS region for cloud services.
+ s3_bucket (str): S3 bucket name for cloud operations.
+ do_initial_clean (bool): Perform initial text cleaning for tabular data.
+ save_logs_to_csv (bool): Save processing logs to CSV files.
+ save_logs_to_dynamodb (bool): Save processing logs to DynamoDB.
+ display_file_names_in_logs (bool): Include file names in log outputs.
+ upload_logs_to_s3 (bool): Upload log files to S3 after processing.
+ s3_logs_prefix (str): S3 prefix for usage log files.
+
+ # PDF/Image Redaction Arguments
+ ocr_method (str): OCR method for text extraction from images.
+ page_min (int): First page to redact.
+ page_max (int): Last page to redact.
+ images_dpi (float): DPI for image processing.
+ chosen_local_ocr_model (str): Local OCR model to use.
+ preprocess_local_ocr_images (bool): Preprocess images before OCR.
+ compress_redacted_pdf (bool): Compress the final redacted PDF.
+ return_pdf_end_of_redaction (bool): Return PDF at end of redaction process.
+ deny_list_file (str): Custom words file to recognize for redaction.
+ allow_list_file (str): Custom words file with terms to exclude from redaction.
+ redact_whole_page_file (str): File for pages to redact completely.
+ handwrite_signature_extraction (List[str]): Handwriting and signature extraction options.
+ extract_forms (bool): Extract forms during Textract analysis.
+ extract_tables (bool): Extract tables during Textract analysis.
+ extract_layout (bool): Extract layout during Textract analysis.
+
+ # Word/Tabular Anonymisation Arguments
+ anon_strategy (str): The anonymisation strategy to apply.
+ text_columns (List[str]): A list of column names to anonymise or deduplicate.
+ excel_sheets (List[str]): Specific Excel sheet names to process.
+ fuzzy_mistakes (int): Number of allowed spelling mistakes for fuzzy matching.
+ match_fuzzy_whole_phrase_bool (bool): Match fuzzy whole phrase boolean.
+
+ # Duplicate Detection Arguments
+ duplicate_type (str): Type of duplicate detection (pages or tabular).
+ similarity_threshold (float): Similarity threshold (0-1) to consider content as duplicates.
+ min_word_count (int): Minimum word count for text to be considered.
+ min_consecutive_pages (int): Minimum number of consecutive pages to consider as a match.
+ greedy_match (bool): Use greedy matching strategy for consecutive pages.
+ combine_pages (bool): Combine text from the same page number within a file.
+ remove_duplicate_rows (bool): Remove duplicate rows from the output.
+
+ # Textract Batch Operations Arguments
+ textract_action (str): Textract action to perform (submit, retrieve, or list).
+ job_id (str): Textract job ID for retrieve action.
+ extract_signatures (bool): Extract signatures during Textract analysis.
+ textract_bucket (str): S3 bucket name for Textract operations.
+ textract_input_prefix (str): S3 prefix for input files in Textract operations.
+ textract_output_prefix (str): S3 prefix for output files in Textract operations.
+ s3_textract_document_logs_subfolder (str): S3 prefix for logs in Textract operations.
+ local_textract_document_logs_subfolder (str): Local prefix for logs in Textract operations.
+ poll_interval (int): Polling interval in seconds for Textract job status.
+ max_poll_attempts (int): Maximum number of polling attempts for Textract job completion.
+
+ Returns:
+ bool: True if the script executed successfully, False otherwise.
+ """
+ # 1. Get absolute paths and perform pre-checks
+ script_abs_path = os.path.abspath(script_path)
+ output_abs_dir = os.path.abspath(output_dir)
+
+ # Handle input file based on task and action
+ if task == "textract" and textract_action in ["retrieve", "list"]:
+ # For retrieve and list actions, input file is not required
+ input_abs_path = None
+ else:
+ # For all other cases, input file is required
+ if input_file is None:
+ raise ValueError("Input file is required for this task")
+ input_abs_path = os.path.abspath(input_file)
+ if not os.path.isfile(input_abs_path):
+ raise FileNotFoundError(f"Input file not found: {input_abs_path}")
+
+ if not os.path.isfile(script_abs_path):
+ raise FileNotFoundError(f"Script not found: {script_abs_path}")
+
+ if not os.path.isdir(output_abs_dir):
+ # Create the output directory if it doesn't exist
+ print(f"Output directory not found. Creating: {output_abs_dir}")
+ os.makedirs(output_abs_dir)
+
+ script_folder = os.path.dirname(script_abs_path)
+
+ # 2. Dynamically build the command list
+ command = [
+ "python",
+ script_abs_path,
+ "--output_dir",
+ output_abs_dir,
+ "--task",
+ task,
+ ]
+
+ # Add input_file only if it's not None
+ if input_abs_path is not None:
+ command.extend(["--input_file", input_abs_path])
+
+ # Add general arguments
+ if input_dir:
+ command.extend(["--input_dir", input_dir])
+ if language:
+ command.extend(["--language", language])
+ if allow_list and os.path.isfile(allow_list):
+ command.extend(["--allow_list", os.path.abspath(allow_list)])
+ if pii_detector:
+ command.extend(["--pii_detector", pii_detector])
+ if username:
+ command.extend(["--username", username])
+ if save_to_user_folders is not None:
+ command.extend(["--save_to_user_folders", str(save_to_user_folders)])
+ if local_redact_entities:
+ command.append("--local_redact_entities")
+ command.extend(local_redact_entities)
+ if aws_redact_entities:
+ command.append("--aws_redact_entities")
+ command.extend(aws_redact_entities)
+ if aws_access_key:
+ command.extend(["--aws_access_key", aws_access_key])
+ if aws_secret_key:
+ command.extend(["--aws_secret_key", aws_secret_key])
+ if cost_code:
+ command.extend(["--cost_code", cost_code])
+ if aws_region:
+ command.extend(["--aws_region", aws_region])
+ if s3_bucket:
+ command.extend(["--s3_bucket", s3_bucket])
+ if do_initial_clean is not None:
+ command.extend(["--do_initial_clean", str(do_initial_clean)])
+ if save_logs_to_csv is not None:
+ command.extend(["--save_logs_to_csv", str(save_logs_to_csv)])
+ if save_logs_to_dynamodb is not None:
+ command.extend(["--save_logs_to_dynamodb", str(save_logs_to_dynamodb)])
+ if display_file_names_in_logs is not None:
+ command.extend(
+ ["--display_file_names_in_logs", str(display_file_names_in_logs)]
+ )
+ if upload_logs_to_s3 is not None:
+ command.extend(["--upload_logs_to_s3", str(upload_logs_to_s3)])
+ if s3_logs_prefix:
+ command.extend(["--s3_logs_prefix", s3_logs_prefix])
+
+ # Add PDF/Image redaction arguments
+ if ocr_method:
+ command.extend(["--ocr_method", ocr_method])
+ if page_min is not None:
+ command.extend(["--page_min", str(page_min)])
+ if page_max is not None:
+ command.extend(["--page_max", str(page_max)])
+ if images_dpi is not None:
+ command.extend(["--images_dpi", str(images_dpi)])
+ if chosen_local_ocr_model:
+ command.extend(["--chosen_local_ocr_model", chosen_local_ocr_model])
+ if preprocess_local_ocr_images is not None:
+ command.extend(
+ ["--preprocess_local_ocr_images", str(preprocess_local_ocr_images)]
+ )
+ if compress_redacted_pdf is not None:
+ command.extend(["--compress_redacted_pdf", str(compress_redacted_pdf)])
+ if return_pdf_end_of_redaction is not None:
+ command.extend(
+ ["--return_pdf_end_of_redaction", str(return_pdf_end_of_redaction)]
+ )
+ if deny_list_file and os.path.isfile(deny_list_file):
+ command.extend(["--deny_list_file", os.path.abspath(deny_list_file)])
+ if allow_list_file and os.path.isfile(allow_list_file):
+ command.extend(["--allow_list_file", os.path.abspath(allow_list_file)])
+ if redact_whole_page_file and os.path.isfile(redact_whole_page_file):
+ command.extend(
+ ["--redact_whole_page_file", os.path.abspath(redact_whole_page_file)]
+ )
+ if handwrite_signature_extraction:
+ command.append("--handwrite_signature_extraction")
+ command.extend(handwrite_signature_extraction)
+ if extract_forms:
+ command.append("--extract_forms")
+ if extract_tables:
+ command.append("--extract_tables")
+ if extract_layout:
+ command.append("--extract_layout")
+
+ # Add Word/Tabular anonymisation arguments
+ if anon_strategy:
+ command.extend(["--anon_strategy", anon_strategy])
+ if text_columns:
+ command.append("--text_columns")
+ command.extend(text_columns)
+ if excel_sheets:
+ command.append("--excel_sheets")
+ command.extend(excel_sheets)
+ if fuzzy_mistakes is not None:
+ command.extend(["--fuzzy_mistakes", str(fuzzy_mistakes)])
+ if match_fuzzy_whole_phrase_bool is not None:
+ command.extend(
+ ["--match_fuzzy_whole_phrase_bool", str(match_fuzzy_whole_phrase_bool)]
+ )
+
+ # Add duplicate detection arguments
+ if duplicate_type:
+ command.extend(["--duplicate_type", duplicate_type])
+ if similarity_threshold is not None:
+ command.extend(["--similarity_threshold", str(similarity_threshold)])
+ if min_word_count is not None:
+ command.extend(["--min_word_count", str(min_word_count)])
+ if min_consecutive_pages is not None:
+ command.extend(["--min_consecutive_pages", str(min_consecutive_pages)])
+ if greedy_match is not None:
+ command.extend(["--greedy_match", str(greedy_match)])
+ if combine_pages is not None:
+ command.extend(["--combine_pages", str(combine_pages)])
+ if remove_duplicate_rows is not None:
+ command.extend(["--remove_duplicate_rows", str(remove_duplicate_rows)])
+
+ # Add Textract batch operations arguments
+ if textract_action:
+ command.extend(["--textract_action", textract_action])
+ if job_id:
+ command.extend(["--job_id", job_id])
+ if extract_signatures:
+ command.append("--extract_signatures")
+ if textract_bucket:
+ command.extend(["--textract_bucket", textract_bucket])
+ if textract_input_prefix:
+ command.extend(["--textract_input_prefix", textract_input_prefix])
+ if textract_output_prefix:
+ command.extend(["--textract_output_prefix", textract_output_prefix])
+ if s3_textract_document_logs_subfolder:
+ command.extend(
+ [
+ "--s3_textract_document_logs_subfolder",
+ s3_textract_document_logs_subfolder,
+ ]
+ )
+ if local_textract_document_logs_subfolder:
+ command.extend(
+ [
+ "--local_textract_document_logs_subfolder",
+ local_textract_document_logs_subfolder,
+ ]
+ )
+ if poll_interval is not None:
+ command.extend(["--poll_interval", str(poll_interval)])
+ if max_poll_attempts is not None:
+ command.extend(["--max_poll_attempts", str(max_poll_attempts)])
+
+ # Filter out None values before joining
+ command_str = " ".join(str(arg) for arg in command if arg is not None)
+ print(f"Executing command: {command_str}")
+
+ # 3. Execute the command using subprocess
+ try:
+ # Set environment variable to ensure UTF-8 encoding in the subprocess
+ env = os.environ.copy()
+ env["PYTHONIOENCODING"] = "utf-8"
+
+ result = subprocess.Popen(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ encoding="utf-8",
+ errors="replace",
+ env=env, # Pass the environment with UTF-8 encoding
+ cwd=script_folder, # Important for relative paths within the script
+ )
+
+ # Communicate with the process to get output and handle timeout
+ stdout, stderr = result.communicate(timeout=timeout)
+
+ print("--- SCRIPT STDOUT ---")
+ if stdout:
+ print(stdout)
+ print("--- SCRIPT STDERR ---")
+ if stderr:
+ print(stderr)
+ print("---------------------")
+
+ # Analyze the output for errors and success indicators
+ analysis = analyze_test_output(stdout, stderr)
+
+ if analysis["has_errors"]:
+ print("❌ Errors detected in output:")
+ for i, error_type in enumerate(analysis["error_types"]):
+ print(f" {i+1}. {error_type}")
+ if analysis["error_messages"]:
+ print(" Error messages:")
+ for msg in analysis["error_messages"][
+ :3
+ ]: # Show first 3 error messages
+ print(f" - {msg}")
+ return False
+ elif result.returncode == 0:
+ success_msg = "✅ Script executed successfully."
+ if analysis["success_indicators"]:
+ success_msg += f" (Success indicators: {', '.join(analysis['success_indicators'][:3])})"
+ print(success_msg)
+ return True
+ else:
+ print(f"❌ Command failed with return code {result.returncode}")
+ return False
+
+ except subprocess.TimeoutExpired:
+ result.kill()
+ print(f"❌ Subprocess timed out after {timeout} seconds.")
+ return False
+ except Exception as e:
+ print(f"❌ An unexpected error occurred: {e}")
+ return False
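+
+# Illustrative sketch (not executed on import): the paths below are assumptions about
+# the repository layout; adjust them to match your checkout.
+#
+#   ok = run_cli_redact(
+#       script_path="cli_redact.py",
+#       input_file="example_data/example_complaint_letter.jpg",
+#       output_dir="test_output",
+#       task="redact",
+#   )
+#   print("Redaction succeeded" if ok else "Redaction failed")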
+
+
+def analyze_test_output(stdout: str, stderr: str) -> dict:
+ """
+ Analyze test output to provide detailed error information.
+
+ Args:
+ stdout (str): Standard output from the test
+ stderr (str): Standard error from the test
+
+ Returns:
+ dict: Analysis results with error details
+ """
+ combined_output = (stdout or "") + (stderr or "")
+
+ analysis = {
+ "has_errors": False,
+ "error_types": [],
+ "error_messages": [],
+ "success_indicators": [],
+ "warning_indicators": [],
+ }
+
+ # Error patterns
+ error_patterns = {
+ "An error occurred": "General error message",
+ "Error:": "Error prefix",
+ "Exception:": "Exception occurred",
+ "Traceback": "Python traceback",
+ "Failed to": "Operation failure",
+ "Cannot": "Operation not possible",
+ "Unable to": "Operation not possible",
+ "KeyError:": "Missing key/dictionary error",
+ "AttributeError:": "Missing attribute error",
+ "TypeError:": "Type mismatch error",
+ "ValueError:": "Invalid value error",
+ "FileNotFoundError:": "File not found",
+ "ImportError:": "Import failure",
+ "ModuleNotFoundError:": "Module not found",
+ }
+
+ # Success indicators
+ success_patterns = [
+ "Successfully",
+ "Completed",
+ "Finished",
+ "Processed",
+ "Redacted",
+ "Extracted",
+ ]
+
+ # Warning indicators
+ warning_patterns = ["Warning:", "WARNING:", "Deprecated", "DeprecationWarning"]
+
+ # Check for errors
+ for pattern, description in error_patterns.items():
+ if pattern.lower() in combined_output.lower():
+ analysis["has_errors"] = True
+ analysis["error_types"].append(description)
+
+ # Extract the actual error message
+ lines = combined_output.split("\n")
+ for line in lines:
+ if pattern.lower() in line.lower():
+ analysis["error_messages"].append(line.strip())
+
+ # Check for success indicators
+ for pattern in success_patterns:
+ if pattern.lower() in combined_output.lower():
+ analysis["success_indicators"].append(pattern)
+
+ # Check for warnings
+ for pattern in warning_patterns:
+ if pattern.lower() in combined_output.lower():
+ analysis["warning_indicators"].append(pattern)
+
+ return analysis
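+
+# Illustrative sketch of the returned analysis (sample strings only):
+#
+#   report = analyze_test_output("Processed 3 files\nWarning: low DPI", "")
+#   # report["has_errors"] is False, "Processed" appears in success_indicators,
+#   # and "Warning:" appears in warning_indicators.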
+
+
+class TestCLIRedactExamples(unittest.TestCase):
+ """Test suite for CLI redaction examples from the epilog."""
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up test environment before running tests."""
+ cls.script_path = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)), "cli_redact.py"
+ )
+ cls.example_data_dir = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)), "example_data"
+ )
+ cls.temp_output_dir = tempfile.mkdtemp(prefix="test_output_")
+
+ # Verify script exists
+ if not os.path.isfile(cls.script_path):
+ raise FileNotFoundError(f"CLI script not found: {cls.script_path}")
+
+ print(f"Test setup complete. Script: {cls.script_path}")
+ print(f"Example data directory: {cls.example_data_dir}")
+ print(f"Temp output directory: {cls.temp_output_dir}")
+
+ # Debug: Check if example data directory exists and list contents
+ if os.path.exists(cls.example_data_dir):
+ print("Example data directory exists. Contents:")
+ for item in os.listdir(cls.example_data_dir):
+ item_path = os.path.join(cls.example_data_dir, item)
+ if os.path.isfile(item_path):
+ print(f" File: {item} ({os.path.getsize(item_path)} bytes)")
+ else:
+ print(f" Directory: {item}")
+ else:
+ print(f"Example data directory does not exist: {cls.example_data_dir}")
+
+ @classmethod
+ def tearDownClass(cls):
+ """Clean up test environment after running tests."""
+ if os.path.exists(cls.temp_output_dir):
+ shutil.rmtree(cls.temp_output_dir)
+ print(f"Cleaned up temp directory: {cls.temp_output_dir}")
+
+ def test_pdf_redaction_default_settings(self):
+ """Test: Redact a PDF with default settings (local OCR)"""
+ print("\n=== Testing PDF redaction with default settings ===")
+ input_file = os.path.join(
+ self.example_data_dir,
+ "example_of_emails_sent_to_a_professor_before_applying.pdf",
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ )
+
+ self.assertTrue(result, "PDF redaction with default settings should succeed")
+ print("✅ PDF redaction with default settings passed")
+
+ def test_pdf_text_extraction_only(self):
+ """Test: Extract text from a PDF only (i.e. no redaction), using local OCR"""
+ print("\n=== Testing PDF text extraction only ===")
+ input_file = os.path.join(
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
+ )
+ whole_page_file = os.path.join(
+ self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+ if not os.path.isfile(whole_page_file):
+ self.skipTest(f"Whole page file not found: {whole_page_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ redact_whole_page_file=whole_page_file,
+ pii_detector="None",
+ )
+
+ self.assertTrue(result, "PDF text extraction should succeed")
+ print("✅ PDF text extraction only passed")
+
+ def test_pdf_text_extraction_with_whole_page_redaction(self):
+ """Test: Extract text from a PDF only with a whole page redaction list"""
+ print("\n=== Testing PDF text extraction with whole page redaction ===")
+ input_file = os.path.join(
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
+ )
+ whole_page_file = os.path.join(
+ self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+ if not os.path.isfile(whole_page_file):
+ self.skipTest(f"Whole page file not found: {whole_page_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ redact_whole_page_file=whole_page_file,
+ pii_detector="Local",
+ local_redact_entities=["CUSTOM"],
+ )
+
+ self.assertTrue(
+ result, "PDF text extraction with whole page redaction should succeed"
+ )
+ print("✅ PDF text extraction with whole page redaction passed")
+
+ def test_pdf_redaction_with_allow_list(self):
+ """Test: Redact a PDF with allow list (local OCR) and custom list of redaction entities"""
+ print("\n=== Testing PDF redaction with allow list ===")
+ input_file = os.path.join(
+ self.example_data_dir, "graduate-job-example-cover-letter.pdf"
+ )
+ allow_list_file = os.path.join(
+ self.example_data_dir, "test_allow_list_graduate.csv"
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+ if not os.path.isfile(allow_list_file):
+ self.skipTest(f"Allow list file not found: {allow_list_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ allow_list_file=allow_list_file,
+ local_redact_entities=["TITLES", "PERSON", "DATE_TIME"],
+ )
+
+ self.assertTrue(result, "PDF redaction with allow list should succeed")
+ print("✅ PDF redaction with allow list passed")
+
+ def test_pdf_redaction_limited_pages_with_custom_fuzzy(self):
+ """Test: Redact a PDF with limited pages and text extraction method with custom fuzzy matching"""
+ print("\n=== Testing PDF redaction with limited pages and fuzzy matching ===")
+ input_file = os.path.join(
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
+ )
+ deny_list_file = os.path.join(
+ self.example_data_dir,
+ "Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+ if not os.path.isfile(deny_list_file):
+ self.skipTest(f"Deny list file not found: {deny_list_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ deny_list_file=deny_list_file,
+ local_redact_entities=["CUSTOM_FUZZY"],
+ page_min=1,
+ page_max=3,
+ ocr_method="Local text",
+ fuzzy_mistakes=3,
+ )
+
+ self.assertTrue(
+ result, "PDF redaction with limited pages and fuzzy matching should succeed"
+ )
+ print("✅ PDF redaction with limited pages and fuzzy matching passed")
+
+ def test_pdf_redaction_with_custom_lists(self):
+ """Test: Redaction with custom deny list, allow list, and whole page redaction list"""
+ print("\n=== Testing PDF redaction with custom lists ===")
+ input_file = os.path.join(
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
+ )
+ deny_list_file = os.path.join(
+ self.example_data_dir, "partnership_toolkit_redact_custom_deny_list.csv"
+ )
+ whole_page_file = os.path.join(
+ self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
+ )
+ allow_list_file = os.path.join(
+ self.example_data_dir, "test_allow_list_partnership.csv"
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+ if not os.path.isfile(deny_list_file):
+ self.skipTest(f"Deny list file not found: {deny_list_file}")
+ if not os.path.isfile(whole_page_file):
+ self.skipTest(f"Whole page file not found: {whole_page_file}")
+ if not os.path.isfile(allow_list_file):
+ self.skipTest(f"Allow list file not found: {allow_list_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ deny_list_file=deny_list_file,
+ redact_whole_page_file=whole_page_file,
+ allow_list_file=allow_list_file,
+ )
+
+ self.assertTrue(result, "PDF redaction with custom lists should succeed")
+ print("✅ PDF redaction with custom lists passed")
+
+ def test_image_redaction(self):
+ """Test: Redact an image"""
+ print("\n=== Testing image redaction ===")
+ input_file = os.path.join(self.example_data_dir, "example_complaint_letter.jpg")
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ )
+
+ self.assertTrue(result, "Image redaction should succeed")
+ print("✅ Image redaction passed")
+
+ def test_csv_anonymisation_specific_columns(self):
+ """Test: Anonymise csv file with specific columns"""
+ print("\n=== Testing CSV anonymisation with specific columns ===")
+ input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ text_columns=["Case Note", "Client"],
+ anon_strategy="replace_redacted",
+ )
+
+ self.assertTrue(
+ result, "CSV anonymisation with specific columns should succeed"
+ )
+ print("✅ CSV anonymisation with specific columns passed")
+
+ def test_csv_anonymisation_different_strategy(self):
+ """Test: Anonymise csv file with a different strategy (remove text completely)"""
+ print("\n=== Testing CSV anonymisation with different strategy ===")
+ input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ text_columns=["Case Note", "Client"],
+ anon_strategy="redact",
+ )
+
+ self.assertTrue(
+ result, "CSV anonymisation with different strategy should succeed"
+ )
+ print("✅ CSV anonymisation with different strategy passed")
+
+ def test_word_document_anonymisation(self):
+ """Test: Anonymise a word document"""
+ print("\n=== Testing Word document anonymisation ===")
+ input_file = os.path.join(
+ self.example_data_dir, "Bold minimalist professional cover letter.docx"
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ anon_strategy="replace_redacted",
+ )
+
+ self.assertTrue(result, "Word document anonymisation should succeed")
+ print("✅ Word document anonymisation passed")
+
+ def test_aws_textract_comprehend_redaction(self):
+ """Test: Use Textract and Comprehend for redaction"""
+ print("\n=== Testing AWS Textract and Comprehend redaction ===")
+ input_file = os.path.join(
+ self.example_data_dir,
+ "example_of_emails_sent_to_a_professor_before_applying.pdf",
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+
+ # This test calls AWS services and may not work in every environment; the
+ # return value is deliberately not asserted so missing credentials do not
+ # fail the suite.
+ run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ ocr_method="AWS Textract",
+ pii_detector="AWS Comprehend",
+ )
+
+ # Note: This test may fail if AWS credentials are not configured
+ # We'll mark it as passed if it runs without crashing
+ print("✅ AWS Textract and Comprehend redaction test completed")
+
+ def test_aws_textract_signature_extraction(self):
+ """Test: Redact specific pages with AWS OCR and signature extraction"""
+ print("\n=== Testing AWS Textract with signature extraction ===")
+ input_file = os.path.join(
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+
+ # This test calls AWS services and may fail without credentials; the return
+ # value is deliberately not asserted.
+ run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ page_min=6,
+ page_max=7,
+ ocr_method="AWS Textract",
+ handwrite_signature_extraction=[
+ "Extract handwriting",
+ "Extract signatures",
+ ],
+ )
+
+ # Note: This test may fail if AWS credentials are not configured
+ print("✅ AWS Textract with signature extraction test completed")
+
+ def test_duplicate_pages_detection(self):
+ """Test: Find duplicate pages in OCR files"""
+ print("\n=== Testing duplicate pages detection ===")
+ input_file = os.path.join(
+ self.example_data_dir,
+ "example_outputs",
+ "doubled_output_joined.pdf_ocr_output.csv",
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example OCR file not found: {input_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ task="deduplicate",
+ duplicate_type="pages",
+ similarity_threshold=0.95,
+ )
+
+ self.assertTrue(result, "Duplicate pages detection should succeed")
+ print("✅ Duplicate pages detection passed")
+
+ def test_duplicate_line_level_detection(self):
+ """Test: Find duplicate in OCR files at the line level"""
+ print("\n=== Testing duplicate line level detection ===")
+ input_file = os.path.join(
+ self.example_data_dir,
+ "example_outputs",
+ "doubled_output_joined.pdf_ocr_output.csv",
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example OCR file not found: {input_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ task="deduplicate",
+ duplicate_type="pages",
+ similarity_threshold=0.95,
+ combine_pages=False,
+ min_word_count=3,
+ )
+
+ self.assertTrue(result, "Duplicate line level detection should succeed")
+ print("✅ Duplicate line level detection passed")
+
+ def test_duplicate_tabular_detection(self):
+ """Test: Find duplicate rows in tabular data"""
+ print("\n=== Testing duplicate tabular detection ===")
+ input_file = os.path.join(
+ self.example_data_dir, "Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example CSV file not found: {input_file}")
+
+ result = run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ task="deduplicate",
+ duplicate_type="tabular",
+ text_columns=["text"],
+ similarity_threshold=0.95,
+ )
+
+ self.assertTrue(result, "Duplicate tabular detection should succeed")
+ print("✅ Duplicate tabular detection passed")
+
+ def test_textract_submit_document(self):
+ """Test: Submit document to Textract for basic text analysis"""
+ print("\n=== Testing Textract document submission ===")
+ input_file = os.path.join(
+ self.example_data_dir,
+ "example_of_emails_sent_to_a_professor_before_applying.pdf",
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+
+ # Skip this test if AWS credentials are not available
+ try:
+ run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ task="textract",
+ textract_action="submit",
+ )
+ except Exception as e:
+ print(f"Textract test failed (expected without AWS credentials): {e}")
+
+ # Note: This test may fail if AWS credentials are not configured
+ print("✅ Textract document submission test completed")
+
+ def test_textract_submit_with_signatures(self):
+ """Test: Submit document to Textract for analysis with signature extraction"""
+ print("\n=== Testing Textract submission with signature extraction ===")
+ input_file = os.path.join(
+ self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
+ )
+
+ if not os.path.isfile(input_file):
+ self.skipTest(f"Example file not found: {input_file}")
+
+ # Skip this test if AWS credentials are not available
+ try:
+ run_cli_redact(
+ script_path=self.script_path,
+ input_file=input_file,
+ output_dir=self.temp_output_dir,
+ task="textract",
+ textract_action="submit",
+ extract_signatures=True,
+ )
+ except Exception as e:
+ print(f"Textract test failed (expected without AWS credentials): {e}")
+
+ # Note: This test may fail if AWS credentials are not configured
+ print("✅ Textract submission with signature extraction test completed")
+
+ def test_textract_retrieve_results(self):
+ """Test: Retrieve Textract results by job ID"""
+ print("\n=== Testing Textract results retrieval ===")
+
+ # Skip this test if AWS credentials are not available
+ # This would require a valid job ID from a previous submission
+ # For retrieve and list actions, we don't need a real input file
+ try:
+ run_cli_redact(
+ script_path=self.script_path,
+ input_file=None, # No input file needed for retrieve action
+ output_dir=self.temp_output_dir,
+ task="textract",
+ textract_action="retrieve",
+ job_id="12345678-1234-1234-1234-123456789012", # Dummy job ID
+ )
+ except Exception as e:
+ print(f"Textract test failed (expected without AWS credentials): {e}")
+
+ # Note: This test will likely fail with a dummy job ID, but that's expected
+ print("✅ Textract results retrieval test completed")
+
+ def test_textract_list_jobs(self):
+ """Test: List recent Textract jobs"""
+ print("\n=== Testing Textract jobs listing ===")
+
+ # Skip this test if AWS credentials are not available
+ # For list action, we don't need a real input file
+ try:
+ run_cli_redact(
+ script_path=self.script_path,
+ input_file=None, # No input file needed for list action
+ output_dir=self.temp_output_dir,
+ task="textract",
+ textract_action="list",
+ )
+ except Exception as e:
+ print(f"Textract test failed (expected without AWS credentials): {e}")
+
+ # Note: This test may fail if AWS credentials are not configured
+ print("✅ Textract jobs listing test completed")
+
+
+class TestGUIApp(unittest.TestCase):
+ """Test suite for GUI application loading and basic functionality."""
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up test environment for GUI tests."""
+ cls.app_path = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)), "app.py"
+ )
+
+ # Verify app.py exists
+ if not os.path.isfile(cls.app_path):
+ raise FileNotFoundError(f"App file not found: {cls.app_path}")
+
+ print(f"GUI test setup complete. App: {cls.app_path}")
+
+ def test_app_import_and_initialization(self):
+ """Test: Import app.py and check if the Gradio app object is created successfully."""
+ print("\n=== Testing GUI app import and initialization ===")
+
+ try:
+ # Add the parent directory to the path so we can import app
+ parent_dir = os.path.dirname(os.path.dirname(__file__))
+ if parent_dir not in sys.path:
+ sys.path.insert(0, parent_dir)
+
+ # Import the app module
+ import app
+
+ # Check if the app object exists and is a Gradio Blocks object
+ self.assertTrue(
+ hasattr(app, "blocks"), "App object should exist in the module"
+ )
+
+ # Check if it's a Gradio Blocks instance
+ import gradio as gr
+
+ self.assertIsInstance(
+ app.app, gr.Blocks, "App should be a Gradio Blocks instance"
+ )
+
+ print("✅ GUI app import and initialisation passed")
+
+ except ImportError as e:
+ error_msg = f"Failed to import app module: {e}"
+ if "gradio_image_annotation" in str(e):
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
+ error_msg += "\nPlease run: conda activate redaction"
+ error_msg += "\nThen run this test again."
+ self.fail(error_msg)
+ except Exception as e:
+ self.fail(f"Unexpected error during app initialization: {e}")
+
+ def test_app_launch_headless(self):
+ """Test: Launch the app in headless mode to verify it starts without errors."""
+ print("\n=== Testing GUI app launch in headless mode ===")
+
+ try:
+ # Add the parent directory to the path
+ parent_dir = os.path.dirname(os.path.dirname(__file__))
+ if parent_dir not in sys.path:
+ sys.path.insert(0, parent_dir)
+
+ # Import the app module
+
+ import app
+
+ # Set up a flag to track if the app launched successfully
+ app_launched = threading.Event()
+ launch_error = None
+
+ def launch_app():
+ nonlocal launch_error  # allow this thread to report launch failures
+ try:
+ # Launch the app in headless mode with a short timeout
+ app.app.launch(
+ show_error=True,
+ inbrowser=False, # Don't open browser
+ server_port=0, # Use any available port
+ quiet=True, # Suppress output
+ prevent_thread_lock=True, # Don't block the main thread
+ )
+ app_launched.set()
+ except Exception as e:
+ # Record the failure so the waiting test can report it
+ launch_error = e
+ app_launched.set()
+
+ # Start the app in a separate thread
+ launch_thread = threading.Thread(target=launch_app)
+ launch_thread.daemon = True
+ launch_thread.start()
+
+ # Wait for the app to launch (with timeout)
+ if app_launched.wait(timeout=10): # 10 second timeout
+ if launch_error:
+ self.fail(f"App launch failed: {launch_error}")
+ else:
+ print("✅ GUI app launch in headless mode passed")
+ else:
+ self.fail("App launch timed out after 10 seconds")
+
+ except Exception as e:
+ error_msg = f"Unexpected error during app launch test: {e}"
+ if "gradio_image_annotation" in str(e):
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
+ error_msg += "\nPlease run: conda activate redaction"
+ error_msg += "\nThen run this test again."
+ self.fail(error_msg)
+
+ def test_app_configuration_loading(self):
+ """Test: Verify that the app can load its configuration without errors."""
+ print("\n=== Testing GUI app configuration loading ===")
+
+ try:
+ # Add the parent directory to the path
+ parent_dir = os.path.dirname(os.path.dirname(__file__))
+ if parent_dir not in sys.path:
+ sys.path.insert(0, parent_dir)
+
+ # Importing the app module is not required here; configuration is read directly from tools.config.
+ # import app
+
+ # Check if key configuration variables are accessible
+ # These should be imported from tools.config
+ from tools.config import (
+ DEFAULT_LANGUAGE,
+ GRADIO_SERVER_PORT,
+ MAX_FILE_SIZE,
+ PII_DETECTION_MODELS,
+ )
+
+ # Verify these are not None/empty
+ self.assertIsNotNone(
+ GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured"
+ )
+ self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
+ self.assertIsNotNone(
+ DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured"
+ )
+ self.assertIsNotNone(
+ PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured"
+ )
+
+ print("✅ GUI app configuration loading passed")
+
+ except ImportError as e:
+ error_msg = f"Failed to import configuration: {e}"
+ if "gradio_image_annotation" in str(e):
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
+ error_msg += "\nPlease run: conda activate redaction"
+ error_msg += "\nThen run this test again."
+ self.fail(error_msg)
+ except Exception as e:
+ error_msg = f"Unexpected error during configuration test: {e}"
+ if "gradio_image_annotation" in str(e):
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
+ error_msg += "\nPlease run: conda activate redaction"
+ error_msg += "\nThen run this test again."
+ self.fail(error_msg)
+
+
+def run_all_tests():
+ """Run all test examples and report results."""
+ print("=" * 80)
+ print("DOCUMENT REDACTION TEST SUITE")
+ print("=" * 80)
+ print("This test suite includes:")
+ print("- CLI examples from the epilog")
+ print("- GUI application loading and initialization tests")
+ print("Tests will be skipped if required example files are not found.")
+ print("AWS-related tests may fail if credentials are not configured.")
+ print("=" * 80)
+
+ # Create test suite
+ loader = unittest.TestLoader()
+ suite = unittest.TestSuite()
+
+ # Add CLI tests
+ cli_suite = loader.loadTestsFromTestCase(TestCLIRedactExamples)
+ suite.addTests(cli_suite)
+
+ # Add GUI tests
+ gui_suite = loader.loadTestsFromTestCase(TestGUIApp)
+ suite.addTests(gui_suite)
+
+ # Run tests with detailed output
+ runner = unittest.TextTestRunner(verbosity=2, stream=None)
+ result = runner.run(suite)
+
+ # Print summary
+ print("\n" + "=" * 80)
+ print("TEST SUMMARY")
+ print("=" * 80)
+ print(f"Tests run: {result.testsRun}")
+ print(f"Failures: {len(result.failures)}")
+ print(f"Errors: {len(result.errors)}")
+ print(f"Skipped: {len(result.skipped) if hasattr(result, 'skipped') else 0}")
+
+ if result.failures:
+ print("\nFAILURES:")
+ for test, traceback in result.failures:
+ print(f"- {test}: {traceback}")
+
+ if result.errors:
+ print("\nERRORS:")
+ for test, traceback in result.errors:
+ print(f"- {test}: {traceback}")
+
+ success = len(result.failures) == 0 and len(result.errors) == 0
+ print(f"\nOverall result: {'✅ PASSED' if success else '❌ FAILED'}")
+ print("=" * 80)
+
+ return success
+
+
+if __name__ == "__main__":
+ # Run the test suite
+ success = run_all_tests()
+ exit(0 if success else 1)
diff --git a/test/test_gui_only.py b/test/test_gui_only.py
new file mode 100644
index 0000000000000000000000000000000000000000..b97d17bf20a7b19f5b978fa55dfcde6362ce1374
--- /dev/null
+++ b/test/test_gui_only.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+"""
+Standalone GUI test script for the document redaction application.
+
+This script tests only the GUI functionality of app.py to ensure it loads correctly.
+Run this script to verify that the Gradio interface can be imported and initialized.
+"""
+
+import os
+import sys
+import threading
+import unittest
+
+# Add the parent directory to the path so we can import the app
+parent_dir = os.path.dirname(os.path.dirname(__file__))
+if parent_dir not in sys.path:
+ sys.path.insert(0, parent_dir)
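+
+# Illustrative invocation sketch (assumes the 'redaction' conda environment is active
+# and the repository root is the working directory):
+#   python test/test_gui_only.py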
+
+
+class TestGUIAppOnly(unittest.TestCase):
+ """Test suite for GUI application loading and basic functionality."""
+
+ @classmethod
+ def setUpClass(cls):
+ """Set up test environment for GUI tests."""
+ cls.app_path = os.path.join(parent_dir, "app.py")
+
+ # Verify app.py exists
+ if not os.path.isfile(cls.app_path):
+ raise FileNotFoundError(f"App file not found: {cls.app_path}")
+
+ print(f"GUI test setup complete. App: {cls.app_path}")
+
+ def test_app_import_and_initialization(self):
+ """Test: Import app.py and check if the Gradio app object is created successfully."""
+ print("\n=== Testing GUI app import and initialization ===")
+
+ try:
+ # Import the app module
+ import app
+
+ # Check if the app object exists and is a Gradio Blocks object
+ self.assertTrue(
+ hasattr(app, "app"), "App object should exist in the module"
+ )
+
+ # Check if it's a Gradio Blocks instance
+ import gradio as gr
+
+ self.assertIsInstance(
+ app.app, gr.Blocks, "App should be a Gradio Blocks instance"
+ )
+
+ print("✅ GUI app import and initialization passed")
+
+ except ImportError as e:
+ error_msg = f"Failed to import app module: {e}"
+ if "gradio_image_annotation" in str(e):
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
+ error_msg += "\nPlease run: conda activate redaction"
+ error_msg += "\nThen run this test again."
+ self.fail(error_msg)
+ except Exception as e:
+ self.fail(f"Unexpected error during app initialization: {e}")
+
+ def test_app_launch_headless(self):
+ """Test: Launch the app in headless mode to verify it starts without errors."""
+ print("\n=== Testing GUI app launch in headless mode ===")
+
+ try:
+ # Import the app module
+
+ import app
+
+ # Set up a flag to track if the app launched successfully
+ app_launched = threading.Event()
+ launch_error = None
+
+ def launch_app():
+ nonlocal launch_error  # allow this thread to report launch failures
+ try:
+ # Launch the app in headless mode with a short timeout
+ app.app.launch(
+ show_error=True,
+ inbrowser=False, # Don't open browser
+ server_port=0, # Use any available port
+ quiet=True, # Suppress output
+ prevent_thread_lock=True, # Don't block the main thread
+ )
+ app_launched.set()
+ except Exception as e:
+ # Record the failure so the waiting test can report it
+ launch_error = e
+ app_launched.set()
+
+ # Start the app in a separate thread
+ launch_thread = threading.Thread(target=launch_app)
+ launch_thread.daemon = True
+ launch_thread.start()
+
+ # Wait for the app to launch (with timeout)
+ if app_launched.wait(timeout=10): # 10 second timeout
+ if launch_error:
+ self.fail(f"App launch failed: {launch_error}")
+ else:
+ print("✅ GUI app launch in headless mode passed")
+ else:
+ self.fail("App launch timed out after 10 seconds")
+
+ except Exception as e:
+ error_msg = f"Unexpected error during app launch test: {e}"
+ if "gradio_image_annotation" in str(e):
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
+ error_msg += "\nPlease run: conda activate redaction"
+ error_msg += "\nThen run this test again."
+ self.fail(error_msg)
+
+ def test_app_configuration_loading(self):
+ """Test: Verify that the app can load its configuration without errors."""
+ print("\n=== Testing GUI app configuration loading ===")
+
+ try:
+ # Importing the app module is not required here; configuration is read directly from tools.config.
+ # import app
+
+ # Check if key configuration variables are accessible
+ # These should be imported from tools.config
+ from tools.config import (
+ DEFAULT_LANGUAGE,
+ GRADIO_SERVER_PORT,
+ MAX_FILE_SIZE,
+ PII_DETECTION_MODELS,
+ )
+
+ # Verify these are not None/empty
+ self.assertIsNotNone(
+ GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured"
+ )
+ self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
+ self.assertIsNotNone(
+ DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured"
+ )
+ self.assertIsNotNone(
+ PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured"
+ )
+
+ print("✅ GUI app configuration loading passed")
+
+ except ImportError as e:
+ error_msg = f"Failed to import configuration: {e}"
+ if "gradio_image_annotation" in str(e):
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
+ error_msg += "\nPlease run: conda activate redaction"
+ error_msg += "\nThen run this test again."
+ self.fail(error_msg)
+ except Exception as e:
+ error_msg = f"Unexpected error during configuration test: {e}"
+ if "gradio_image_annotation" in str(e):
+ error_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
+ error_msg += "\nPlease run: conda activate redaction"
+ error_msg += "\nThen run this test again."
+ self.fail(error_msg)
+
+
+def run_gui_tests():
+ """Run GUI tests and report results."""
+ print("=" * 80)
+ print("DOCUMENT REDACTION GUI TEST SUITE")
+ print("=" * 80)
+ print("This test suite verifies that the GUI application loads correctly.")
+ print("=" * 80)
+
+ # Create test suite
+ loader = unittest.TestLoader()
+ suite = loader.loadTestsFromTestCase(TestGUIAppOnly)
+
+ # Run tests with detailed output
+ runner = unittest.TextTestRunner(verbosity=2, stream=None)
+ result = runner.run(suite)
+
+ # Print summary
+ print("\n" + "=" * 80)
+ print("GUI TEST SUMMARY")
+ print("=" * 80)
+ print(f"Tests run: {result.testsRun}")
+ print(f"Failures: {len(result.failures)}")
+ print(f"Errors: {len(result.errors)}")
+ print(f"Skipped: {len(result.skipped) if hasattr(result, 'skipped') else 0}")
+
+ if result.failures:
+ print("\nFAILURES:")
+ for test, traceback in result.failures:
+ print(f"- {test}: {traceback}")
+
+ if result.errors:
+ print("\nERRORS:")
+ for test, traceback in result.errors:
+ print(f"- {test}: {traceback}")
+
+ success = len(result.failures) == 0 and len(result.errors) == 0
+ print(f"\nOverall result: {'✅ PASSED' if success else '❌ FAILED'}")
+ print("=" * 80)
+
+ return success
+
+
+if __name__ == "__main__":
+ # Run the GUI test suite
+ success = run_gui_tests()
+ exit(0 if success else 1)
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/auth.py b/tools/auth.py
new file mode 100644
index 0000000000000000000000000000000000000000..403be3039cb0778b7f5530354762fa6097725090
--- /dev/null
+++ b/tools/auth.py
@@ -0,0 +1,88 @@
+# import os
+import base64
+import hashlib
+
+# import gradio as gr
+import hmac
+
+import boto3
+
+from tools.config import AWS_CLIENT_ID, AWS_CLIENT_SECRET, AWS_REGION, AWS_USER_POOL_ID
+
+
+def calculate_secret_hash(client_id: str, client_secret: str, username: str):
+ message = username + client_id
+ dig = hmac.new(
+ str(client_secret).encode("utf-8"),
+ msg=str(message).encode("utf-8"),
+ digestmod=hashlib.sha256,
+ ).digest()
+ secret_hash = base64.b64encode(dig).decode()
+ return secret_hash
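+
+# Illustrative sketch (dummy values): the returned string is the SECRET_HASH value
+# that Cognito expects alongside USERNAME and PASSWORD in initiate_auth.
+#
+#   sh = calculate_secret_hash("example-client-id", "example-client-secret", "alice")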
+
+
+def authenticate_user(
+ username: str,
+ password: str,
+ user_pool_id: str = AWS_USER_POOL_ID,
+ client_id: str = AWS_CLIENT_ID,
+ client_secret: str = AWS_CLIENT_SECRET,
+):
+ """Authenticates a user against an AWS Cognito user pool.
+
+ Args:
+ user_pool_id (str): The ID of the Cognito user pool.
+ client_id (str): The ID of the Cognito user pool client.
+ username (str): The username of the user.
+ password (str): The password of the user.
+ client_secret (str): The client secret of the app client
+
+ Returns:
+ bool: True if the user is authenticated, False otherwise.
+ """
+
+ client = boto3.client(
+ "cognito-idp", region_name=AWS_REGION
+ ) # Cognito Identity Provider client
+
+ # Compute the secret hash
+ secret_hash = calculate_secret_hash(client_id, client_secret, username)
+
+ try:
+
+ if client_secret == "":
+ response = client.initiate_auth(
+ AuthFlow="USER_PASSWORD_AUTH",
+ AuthParameters={
+ "USERNAME": username,
+ "PASSWORD": password,
+ },
+ ClientId=client_id,
+ )
+
+ else:
+ response = client.initiate_auth(
+ AuthFlow="USER_PASSWORD_AUTH",
+ AuthParameters={
+ "USERNAME": username,
+ "PASSWORD": password,
+ "SECRET_HASH": secret_hash,
+ },
+ ClientId=client_id,
+ )
+
+ # If successful, you'll receive an AuthenticationResult in the response
+ if response.get("AuthenticationResult"):
+ return True
+ else:
+ return False
+
+ except client.exceptions.NotAuthorizedException:
+ return False
+ except client.exceptions.UserNotFoundException:
+ return False
+ except Exception as e:
+ out_message = f"An error occurred: {e}"
+ print(out_message)
+ raise Exception(out_message)
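+
+# Illustrative sketch (dummy credentials): returns True only when Cognito accepts the
+# username and password for the configured user pool app client.
+#
+#   is_valid = authenticate_user("alice@example.com", "example-password")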
diff --git a/tools/aws_functions.py b/tools/aws_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe560ecd1f3c8c2ee08f08030fdc2f2ece5e663d
--- /dev/null
+++ b/tools/aws_functions.py
@@ -0,0 +1,461 @@
+import os
+from typing import List, Type
+
+import boto3
+import pandas as pd
+
+from tools.config import (
+ AWS_ACCESS_KEY,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ DOCUMENT_REDACTION_BUCKET,
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
+ RUN_AWS_FUNCTIONS,
+ S3_OUTPUTS_BUCKET,
+ SAVE_LOGS_TO_CSV,
+)
+from tools.secure_path_utils import secure_join
+
+PandasDataFrame = Type[pd.DataFrame]
+
+
+def connect_to_bedrock_runtime(
+ model_name_map: dict,
+ model_choice: str,
+ aws_access_key_textbox: str = "",
+ aws_secret_key_textbox: str = "",
+ aws_region_textbox: str = "",
+):
+ # If the chosen model is hosted on AWS (Bedrock), create a Bedrock runtime client
+ model_source = model_name_map[model_choice]["source"]
+
+ # Use aws_region_textbox if provided, otherwise fall back to AWS_REGION from config
+ region = aws_region_textbox if aws_region_textbox else AWS_REGION
+
+ if "AWS" in model_source:
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Bedrock using AWS access key and secret keys from user input."
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=region,
+ )
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print("Getting Bedrock credentials from environment variables")
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=region,
+ )
+ elif RUN_AWS_FUNCTIONS == "1":
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
+ else:
+ bedrock_runtime = ""
+ out_message = "Cannot connect to AWS Bedrock service. Please provide access keys under LLM settings, or choose another model type."
+ print(out_message)
+ raise Exception(out_message)
+ else:
+ bedrock_runtime = None
+
+ return bedrock_runtime
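+
+# Illustrative sketch (hypothetical model map): a Bedrock runtime client is created
+# only when the chosen model's "source" contains "AWS"; otherwise None is returned.
+#
+#   example_map = {"Example Claude": {"source": "AWS Bedrock"}}
+#   runtime = connect_to_bedrock_runtime(example_map, "Example Claude")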
+
+
+def get_assumed_role_info():
+ sts_endpoint = "https://sts." + AWS_REGION + ".amazonaws.com"
+ sts = boto3.client("sts", region_name=AWS_REGION, endpoint_url=sts_endpoint)
+ response = sts.get_caller_identity()
+
+ # Extract ARN of the assumed role
+ assumed_role_arn = response["Arn"]
+
+ # Extract the name of the assumed role from the ARN
+ assumed_role_name = assumed_role_arn.split("/")[-1]
+
+ return assumed_role_arn, assumed_role_name
+
+
+if RUN_AWS_FUNCTIONS:
+ try:
+ session = boto3.Session(region_name=AWS_REGION)
+
+ except Exception as e:
+ print("Could not start boto3 session:", e)
+
+ try:
+ assumed_role_arn, assumed_role_name = get_assumed_role_info()
+
+ print("Successfully assumed ARN role")
+ # print("Assumed Role ARN:", assumed_role_arn)
+ # print("Assumed Role Name:", assumed_role_name)
+
+ except Exception as e:
+ print("Could not get assumed role from STS:", e)
+
+
+# Download direct from S3 - requires login credentials
+def download_file_from_s3(
+ bucket_name: str,
+ key: str,
+ local_file_path_and_name: str,
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
+):
+
+ if RUN_AWS_FUNCTIONS:
+
+ try:
+ # Ensure the local directory exists
+ os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
+
+ s3 = boto3.client("s3", region_name=AWS_REGION)
+ s3.download_file(bucket_name, key, local_file_path_and_name)
+ print(
+ f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}"
+ )
+ except Exception as e:
+ print("Could not download file:", key, "from s3 due to", e)
+
+
+def download_folder_from_s3(
+ bucket_name: str,
+ s3_folder: str,
+ local_folder: str,
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
+):
+ """
+ Download all files from an S3 folder to a local folder.
+ """
+ if RUN_AWS_FUNCTIONS:
+ if bucket_name and s3_folder and local_folder:
+
+ s3 = boto3.client("s3", region_name=AWS_REGION)
+
+ # List objects in the specified S3 folder
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+
+ # Download each object
+ for obj in response.get("Contents", []):
+ # Extract object key and construct local file path
+ object_key = obj["Key"]
+ local_file_path = secure_join(
+ local_folder, os.path.relpath(object_key, s3_folder)
+ )
+
+ # Create directories if necessary
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+ # Download the object
+ try:
+ s3.download_file(bucket_name, object_key, local_file_path)
+ print(
+ f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'"
+ )
+ except Exception as e:
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+ else:
+ print(
+ "One or more required variables are empty, could not download from S3"
+ )
+
+
+def download_files_from_s3(
+ bucket_name: str,
+ s3_folder: str,
+ local_folder: str,
+ filenames: List[str],
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
+):
+ """
+ Download specific files from an S3 folder to a local folder.
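+ Pass the string "*" as filenames to download every object under the s3_folder prefix.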
+ """
+
+ if RUN_AWS_FUNCTIONS:
+ if bucket_name and s3_folder and local_folder and filenames:
+
+ s3 = boto3.client("s3", region_name=AWS_REGION)
+
+ print("Trying to download file: ", filenames)
+
+ if filenames == "*":
+ # List all objects in the S3 folder
+ print("Trying to download all files in AWS folder: ", s3_folder)
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+
+ print("Found files in AWS folder: ", response.get("Contents", []))
+
+ filenames = [
+ obj["Key"].split("/")[-1] for obj in response.get("Contents", [])
+ ]
+
+ print("Found filenames in AWS folder: ", filenames)
+
+ for filename in filenames:
+ object_key = secure_join(s3_folder, filename)
+ local_file_path = secure_join(local_folder, filename)
+
+ # Create directories if necessary
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+ # Download the object
+ try:
+ s3.download_file(bucket_name, object_key, local_file_path)
+ print(
+ f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'"
+ )
+ except Exception as e:
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+ else:
+ print(
+ "One or more required variables are empty, could not download from S3"
+ )
+
+
+def upload_file_to_s3(
+ local_file_paths: List[str],
+ s3_key: str,
+ s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
+):
+ """
+ Uploads one or more files from the local machine to Amazon S3.
+
+ Args:
+ - local_file_paths: Local file path(s) of the file(s) to upload.
+ - s3_key: Key (path) prefix for the file(s) in the S3 bucket.
+ - s3_bucket: Name of the S3 bucket.
+
+ Returns:
+ - A status message string (also printed to the console).
+ """
+ final_out_message = list()
+ final_out_message_str = ""
+
+ if RUN_AWS_FUNCTIONS:
+ try:
+ # Allow empty s3_key for uploads to bucket root
+ if s3_bucket and local_file_paths:
+
+ s3_client = boto3.client("s3", region_name=AWS_REGION)
+ s3_key_prefix = s3_key if s3_key else ""
+
+ if isinstance(local_file_paths, str):
+ local_file_paths = [local_file_paths]
+
+ for file in local_file_paths:
+ if s3_client:
+ # print(s3_client)
+ try:
+ # Get file name off file path
+ file_name = os.path.basename(file)
+
+ s3_key_full = s3_key_prefix + file_name
+ # print("S3 key: ", s3_bucket, "/", s3_key_full, sep="")
+
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
+ out_message = (
+ "File " + file_name + " uploaded successfully!"
+ )
+
+ except Exception as e:
+ out_message = f"Error uploading file(s): {e}"
+ print(out_message)
+
+ final_out_message.append(out_message)
+ final_out_message_str = "\n".join(final_out_message)
+
+ else:
+ final_out_message_str = "Could not connect to AWS."
+ else:
+ final_out_message_str = (
+ "At least one essential variable is empty, could not upload to S3"
+ )
+ except Exception as e:
+ final_out_message_str = "Could not upload files to S3 due to: " + str(e)
+ print(final_out_message_str)
+ else:
+ final_out_message_str = "App not set to run AWS functions"
+
+ return final_out_message_str
+
+
+def upload_log_file_to_s3(
+ local_file_paths: List[str],
+ s3_key: str,
+ s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
+ SAVE_LOGS_TO_CSV: bool = SAVE_LOGS_TO_CSV,
+):
+ """
+ Uploads one or more log files from the local machine to Amazon S3. Only runs when
+ both AWS functions and CSV logging are enabled.
+
+ Args:
+ - local_file_paths: Local file path(s) of the log file(s) to upload.
+ - s3_key: Key (path) prefix for the file(s) in the S3 bucket.
+ - s3_bucket: Name of the S3 bucket.
+
+ Returns:
+ - A status message string (also printed to the console).
+ """
+ final_out_message = list()
+ final_out_message_str = ""
+
+ if RUN_AWS_FUNCTIONS and SAVE_LOGS_TO_CSV:
+ try:
+ if s3_bucket and s3_key and local_file_paths:
+
+ s3_client = boto3.client("s3", region_name=AWS_REGION)
+
+ if isinstance(local_file_paths, str):
+ local_file_paths = [local_file_paths]
+
+ for file in local_file_paths:
+ if s3_client:
+ # print(s3_client)
+ try:
+ # Get file name off file path
+ file_name = os.path.basename(file)
+
+ s3_key_full = s3_key + file_name
+
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
+ out_message = (
+ "File " + file_name + " uploaded successfully!"
+ )
+ # print(out_message)
+
+ except Exception as e:
+ out_message = f"Error uploading file(s): {e}"
+ print(out_message)
+
+ final_out_message.append(out_message)
+ final_out_message_str = "\n".join(final_out_message)
+
+ else:
+ final_out_message_str = "Could not connect to AWS."
+ else:
+ final_out_message_str = (
+ "At least one essential variable is empty, could not upload to S3"
+ )
+ except Exception as e:
+ final_out_message_str = "Could not upload files to S3 due to: " + str(e)
+ print(final_out_message_str)
+ else:
+ final_out_message_str = "App not set to run AWS functions"
+
+ return final_out_message_str
+
+
+# Helper to upload outputs to S3 when enabled in config.
+def export_outputs_to_s3(
+ file_list_state,
+ s3_output_folder_state_value: str,
+ save_outputs_to_s3_flag: bool,
+ base_file_state=None,
+ s3_bucket: str = S3_OUTPUTS_BUCKET,
+):
+ """
+ Upload a list of local output files to the configured S3 outputs folder.
+
+ - file_list_state: Gradio dropdown state that holds a list of file paths or a
+ single path/string. If blank/empty, no action is taken.
+ - s3_output_folder_state_value: Final S3 key prefix (including any session hash)
+ to use as the destination folder for uploads.
+ - s3_bucket: Name of the S3 bucket.
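+ - save_outputs_to_s3_flag: Runtime toggle; when False, nothing is uploaded.
+ - base_file_state: Optional original input file (path, Gradio file object, or list of either);
+ its file name stem is used as the per-file subfolder under the S3 prefix.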
+ """
+ try:
+
+ # Respect the runtime toggle as well as environment configuration
+ if not save_outputs_to_s3_flag:
+ return
+
+ if not s3_output_folder_state_value:
+ # No configured S3 outputs folder – nothing to do
+ return
+
+ # Normalise input to a Python list of strings
+ file_paths = file_list_state
+ if not file_paths:
+ return
+
+ # Gradio dropdown may return a single string or a list
+ if isinstance(file_paths, str):
+ file_paths = [file_paths]
+
+ # Filter out any non-truthy values
+ file_paths = [p for p in file_paths if p]
+ if not file_paths:
+ return
+
+ # Derive a base file stem (name without extension) from the original
+ # file(s) being analysed, if provided. This is used to create an
+ # additional subfolder layer so that outputs are grouped under the
+ # analysed file name rather than under each output file name.
+ base_stem = None
+ if base_file_state:
+ base_path = None
+
+ # Gradio File components typically provide a list of objects with a `.name` attribute
+ if isinstance(base_file_state, str):
+ base_path = base_file_state
+ elif isinstance(base_file_state, list) and base_file_state:
+ first_item = base_file_state[0]
+ base_path = getattr(first_item, "name", None) or str(first_item)
+ else:
+ base_path = getattr(base_file_state, "name", None) or str(
+ base_file_state
+ )
+
+ if base_path:
+ base_name = os.path.basename(base_path)
+ base_stem, _ = os.path.splitext(base_name)
+
+ # Ensure base S3 prefix (session/date) ends with a trailing slash
+ base_prefix = s3_output_folder_state_value
+ if not base_prefix.endswith("/"):
+ base_prefix = base_prefix + "/"
+
+ # For each file, append a subfolder. If we have a derived base_stem
+ # from the input being analysed, use that; otherwise, fall back to
+ # the individual output file name stem. Final pattern:
+ # <s3 output folder (incl. session hash)>/<base file stem>/<output file name>
+ # or, if base_file_stem is not available:
+ # <s3 output folder (incl. session hash)>/<output file stem>/<output file name>
+ for file in file_paths:
+ file_name = os.path.basename(file)
+
+ if base_stem:
+ folder_stem = base_stem
+ else:
+ folder_stem, _ = os.path.splitext(file_name)
+
+ per_file_prefix = base_prefix + folder_stem + "/"
+
+ out_message = upload_file_to_s3(
+ local_file_paths=[file],
+ s3_key=per_file_prefix,
+ s3_bucket=s3_bucket,
+ )
+
+ # Log any issues to console so failures are visible in logs/stdout
+ if (
+ "Error uploading file" in out_message
+ or "could not upload" in out_message.lower()
+ ):
+ print("export_outputs_to_s3 encountered issues:", out_message)
+
+ print("Successfully uploaded outputs to S3")
+
+ except Exception as e:
+ # Do not break the app flow if S3 upload fails – just report to console
+ print(f"export_outputs_to_s3 failed with error: {e}")
+
+ # No GUI outputs to update
+ return
diff --git a/tools/aws_textract.py b/tools/aws_textract.py
new file mode 100644
index 0000000000000000000000000000000000000000..d66a4c84ab2176eb82f0cead5a6a737d50759d86
--- /dev/null
+++ b/tools/aws_textract.py
@@ -0,0 +1,1271 @@
+import io
+import json
+import os
+import re
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+import boto3
+import pandas as pd
+import pikepdf
+
+from tools.config import (
+ AWS_ACCESS_KEY,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
+ RUN_AWS_FUNCTIONS,
+ SPLIT_PUNCTUATION_FROM_WORDS,
+)
+from tools.custom_image_analyser_engine import CustomImageRecognizerResult, OCRResult
+from tools.helper_functions import _generate_unique_ids
+from tools.secure_path_utils import secure_file_read
+
+
+def extract_textract_metadata(response: object):
+ """Extracts metadata from an AWS Textract response."""
+
+ request_id = response["ResponseMetadata"]["RequestId"]
+ pages = response["DocumentMetadata"]["Pages"]
+
+ return str({"RequestId": request_id, "Pages": pages})
+
+
+def analyse_page_with_textract(
+ pdf_page_bytes: object,
+ page_no: int,
+ client: str = "",
+ handwrite_signature_checkbox: List[str] = ["Extract handwriting"],
+ textract_output_found: bool = False,
+ aws_access_question_textbox: str = AWS_ACCESS_KEY,
+ aws_secret_question_textbox: str = AWS_SECRET_KEY,
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS: bool = PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
+ textract_max_retries: int = 10,
+):
+ """
+ Analyzes a single page of a document using AWS Textract to extract text and other features.
+
+ Args:
+ pdf_page_bytes (object): The content of the PDF page or image as bytes.
+ page_no (int): The page number being analyzed.
+ client (str, optional): An optional pre-initialized AWS Textract client. If not provided,
+ the function will attempt to create one based on configuration.
+ Defaults to "".
+ handwrite_signature_checkbox (List[str], optional): A list of feature types to extract
+ from the document. Options include
+ "Extract handwriting", "Extract signatures",
+ "Extract forms", "Extract layout", "Extract tables".
+ Defaults to ["Extract handwriting"].
+ textract_output_found (bool, optional): A flag indicating whether existing Textract output
+ for the document has been found. This can prevent
+ unnecessary API calls. Defaults to False.
+ aws_access_question_textbox (str, optional): AWS access key ID provided by the user, if not using
+ SSO or environment variables. Defaults to AWS_ACCESS_KEY.
+ aws_secret_question_textbox (str, optional): AWS secret access key provided by the user, if not using
+ SSO or environment variables. Defaults to AWS_SECRET_KEY.
+ RUN_AWS_FUNCTIONS (bool, optional): Configuration flag to enable or
+ disable AWS functions. Defaults to RUN_AWS_FUNCTIONS.
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS (bool, optional): Configuration flag (e.g., True or False)
+ to prioritize AWS SSO credentials
+ over environment variables.
+ Defaults to the PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS config value.
+ textract_max_retries (int, optional): Maximum number of attempts for each Textract API call.
+ Defaults to 10.
+
+ Returns:
+ Tuple[List[Dict], str]: A tuple containing:
+ - A list of dictionaries, where each dictionary represents a Textract block (e.g., LINE, WORD, FORM, TABLE).
+ - A string containing metadata about the Textract request.
+ """
+
+ # print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
+ if client == "":
+ try:
+ # Try to connect to AWS Textract Client if using that text extraction method
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Textract via existing SSO connection")
+ client = boto3.client("textract", region_name=AWS_REGION)
+ elif aws_access_question_textbox and aws_secret_question_textbox:
+ print(
+ "Connecting to Textract using AWS access question and secret questions from user input."
+ )
+ client = boto3.client(
+ "textract",
+ aws_access_key_id=aws_access_question_textbox,
+ aws_secret_access_key=aws_secret_question_textbox,
+ region_name=AWS_REGION,
+ )
+ elif RUN_AWS_FUNCTIONS is True:
+ print("Connecting to Textract via existing SSO connection")
+ client = boto3.client("textract", region_name=AWS_REGION)
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print("Getting Textract credentials from environment variables.")
+ client = boto3.client(
+ "textract",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=AWS_REGION,
+ )
+ elif textract_output_found is True:
+ print(
+ "Existing Textract data found for file, no need to connect to AWS Textract"
+ )
+ client = boto3.client("textract", region_name=AWS_REGION)
+ else:
+ client = ""
+ out_message = "Cannot connect to AWS Textract service."
+ print(out_message)
+ raise Exception(out_message)
+ except Exception as e:
+ out_message = "Cannot connect to AWS Textract"
+ print(out_message, "due to:", e)
+ raise Exception(out_message)
+ return [], "" # Return an empty list and an empty string
+
+ # Redact signatures if specified
+ feature_types = list()
+ if (
+ "Extract signatures" in handwrite_signature_checkbox
+ or "Extract forms" in handwrite_signature_checkbox
+ or "Extract layout" in handwrite_signature_checkbox
+ or "Extract tables" in handwrite_signature_checkbox
+ ):
+ if "Extract signatures" in handwrite_signature_checkbox:
+ feature_types.append("SIGNATURES")
+ if "Extract forms" in handwrite_signature_checkbox:
+ feature_types.append("FORMS")
+ if "Extract layout" in handwrite_signature_checkbox:
+ feature_types.append("LAYOUT")
+ if "Extract tables" in handwrite_signature_checkbox:
+ feature_types.append("TABLES")
+ for attempt in range(textract_max_retries):
+ try:
+ response = client.analyze_document(
+ Document={"Bytes": pdf_page_bytes}, FeatureTypes=feature_types
+ )
+ break
+ except Exception as e:
+ if attempt == textract_max_retries - 1:
+ raise
+ print(
+ "Textract call failed due to:",
+ e,
+ f"trying again in 1 second (attempt {attempt + 1}/{textract_max_retries}).",
+ )
+ time.sleep(1)
+
+ if (
+ "Extract signatures" not in handwrite_signature_checkbox
+ and "Extract forms" not in handwrite_signature_checkbox
+ and "Extract layout" not in handwrite_signature_checkbox
+ and "Extract tables" not in handwrite_signature_checkbox
+ ):
+ # Call detect_document_text to extract plain text
+ for attempt in range(textract_max_retries):
+ try:
+ response = client.detect_document_text(
+ Document={"Bytes": pdf_page_bytes}
+ )
+ break
+ except Exception as e:
+ if attempt == textract_max_retries - 1:
+ raise
+ print(
+ "Textract call failed due to:",
+ e,
+ f"trying again in 1 second (attempt {attempt + 1}/{textract_max_retries}).",
+ )
+ time.sleep(1)
+
+ # Add the 'Page' attribute to each block
+ if "Blocks" in response:
+ for block in response["Blocks"]:
+ block["Page"] = page_no # Inject the page number into each block
+
+ # Wrap the response with the page number in the desired format
+ wrapped_response = {"page_no": page_no, "data": response}
+
+ request_metadata = extract_textract_metadata(
+ response
+ ) # Metadata comes out as a string
+
+ # Return a list containing the wrapped response and the metadata
+ return (
+ wrapped_response,
+ request_metadata,
+ ) # Return as a list to match the desired structure
+
+
+def convert_pike_pdf_page_to_bytes(pdf: object, page_num: int):
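+ """Copy a single page (0-based index) from a pikepdf.Pdf into a new one-page PDF and return its bytes."""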
+ # Create a new empty PDF
+ new_pdf = pikepdf.Pdf.new()
+
+ # page_num is the 0-based index of the page to extract
+
+ # Extract the specific page and add it to the new PDF
+ new_pdf.pages.append(pdf.pages[page_num])
+
+ # Save the new PDF to a bytes buffer
+ buffer = io.BytesIO()
+ new_pdf.save(buffer)
+
+ # Get the PDF bytes
+ pdf_bytes = buffer.getvalue()
+
+ # Now you can use the `pdf_bytes` to convert it to an image or further process
+ buffer.close()
+
+ return pdf_bytes
+
+
+def split_word_with_punctuation(
+ word_text: str,
+ bounding_box: Tuple[int, int, int, int],
+ confidence: float,
+) -> List[Dict[str, Any]]:
+ """
+ Split a word that may contain punctuation into separate word entries.
+ Only separates punctuation at the start and end of words.
+ Punctuation in the middle (e.g., in email addresses like user@example.com)
+ is kept as part of the word.
+
+ Args:
+ word_text: The text of the word (may contain punctuation)
+ bounding_box: Tuple of (left, top, right, bottom) in pixels
+ confidence: Confidence score for the original word
+
+ Returns:
+ List of word dictionaries, each with text and bounding_box.
+ Leading and trailing punctuation become separate entries, while
+ the middle part (which may contain internal punctuation) remains intact.
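+
+ Example (illustrative values): splitting "(user@example.com)" with a box of
+ (0, 0, 180, 20) yields three entries - "(", "user@example.com" and ")" -
+ each keeping the original confidence and receiving a proportional slice of the box.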
+ """
+ if not word_text:
+ return []
+
+ # Extract leading punctuation (at the start of the word)
+ leading_punct_match = re.match(r"^([^\w\s]+)", word_text)
+ leading_punct = leading_punct_match.group(1) if leading_punct_match else ""
+
+ # Extract trailing punctuation (at the end of the word)
+ trailing_punct_match = re.search(r"([^\w\s]+)$", word_text)
+ trailing_punct = trailing_punct_match.group(1) if trailing_punct_match else ""
+
+ # Get the middle part (everything between leading and trailing punctuation)
+ # This may contain punctuation (like @ or . in email addresses) which we keep
+ start_idx = len(leading_punct)
+ end_idx = len(word_text) - len(trailing_punct) if trailing_punct else len(word_text)
+ middle_part = word_text[start_idx:end_idx] if start_idx < end_idx else ""
+
+ # Build list of parts (leading punct, middle, trailing punct)
+ parts = []
+ if leading_punct:
+ parts.append(leading_punct)
+ if middle_part:
+ parts.append(middle_part)
+ if trailing_punct:
+ parts.append(trailing_punct)
+
+ # If no parts to split, return original word
+ if len(parts) == 0:
+ return [
+ {
+ "text": word_text,
+ "confidence": confidence,
+ "bounding_box": bounding_box,
+ }
+ ]
+
+ # If only one part (no leading/trailing punctuation), return as-is
+ if len(parts) == 1:
+ return [
+ {
+ "text": word_text,
+ "confidence": confidence,
+ "bounding_box": bounding_box,
+ }
+ ]
+
+ # Calculate bounding box dimensions
+ left, top, right, bottom = bounding_box
+ width = right - left
+
+ # Calculate character width (assuming proportional distribution based on text length)
+ total_chars = len(word_text)
+ if total_chars == 0:
+ return []
+
+ # Punctuation characters are typically narrower than alphanumeric characters
+ # Use a scaling factor to make punctuation boxes thinner
+ PUNCTUATION_WIDTH_SCALE = (
+ 0.4 # Punctuation is approximately 40% the width of alphanumeric chars
+ )
+
+ # First pass: calculate effective character widths for each part
+ # Alphanumeric parts get full width, punctuation parts get scaled width
+ total_effective_chars = 0
+ part_info = []
+
+ for part in parts:
+ if not part:
+ continue
+ # Check if part is punctuation-only (no alphanumeric characters)
+ is_punctuation_only = not bool(re.search(r"[\w]", part))
+ if is_punctuation_only:
+ effective_length = len(part) * PUNCTUATION_WIDTH_SCALE
+ else:
+ effective_length = len(part)
+ part_info.append(
+ {
+ "text": part,
+ "length": len(part),
+ "effective_length": effective_length,
+ "is_punctuation": is_punctuation_only,
+ }
+ )
+ total_effective_chars += effective_length
+
+ if total_effective_chars == 0:
+ return []
+
+ # Calculate base character width based on effective character count
+ effective_char_width = width / total_effective_chars
+
+ # Build separate word entries
+ word_entries = []
+ current_pos = 0
+
+ for info in part_info:
+ # Calculate actual width for this part based on effective length
+ # (punctuation parts already have reduced effective_length)
+ part_width = info["effective_length"] * effective_char_width
+
+ # Calculate bounding box for this part
+ part_left = left + current_pos
+ part_right = part_left + part_width
+
+ word_entries.append(
+ {
+ "text": info["text"],
+ "confidence": confidence,
+ "bounding_box": (
+ int(part_left),
+ int(top),
+ int(part_right),
+ int(bottom),
+ ),
+ }
+ )
+
+ # Move position forward by the effective width used
+ current_pos += part_width
+
+ return word_entries
+
+
+def json_to_ocrresult(
+ json_data: dict, page_width: float, page_height: float, page_no: int
+):
+ """
+ Convert Textract JSON to structured OCR, handling lines, words, signatures,
+ selection elements (associating them with lines), and question-answer form data.
+ The question-answer data is sorted in a top-to-bottom, left-to-right reading order.
+
+ Args:
+ json_data (dict): The raw JSON output from AWS Textract for a specific page.
+ page_width (float): The width of the page in pixels or points.
+ page_height (float): The height of the page in pixels or points.
+ page_no (int): The 1-based page number being processed.
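+
+ Returns:
+ tuple: (all_ocr_results_with_page, signature_or_handwriting_recogniser_results,
+ signature_recogniser_results, handwriting_recogniser_results,
+ ocr_results_with_words_with_page, selection_element_results, question_answer_results)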
+ """
+ # --- STAGE 1: Block Mapping & Initial Data Collection ---
+ # text_blocks = json_data.get("Blocks", [])
+ # Find the specific page data
+ page_json_data = json_data # next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
+
+ if "Blocks" in page_json_data:
+ # Access the data for the specific page
+ text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
+ # This is a new page
+ elif "page_no" in page_json_data:
+ text_blocks = page_json_data["data"]["Blocks"]
+ else:
+ text_blocks = []
+
+ block_map = {block["Id"]: block for block in text_blocks}
+
+ lines_data = list()
+ selections_data = list()
+ signature_or_handwriting_recogniser_results = list()
+ signature_recogniser_results = list()
+ handwriting_recogniser_results = list()
+
+ def _get_text_from_block(block, b_map):
+ text_parts = list()
+ if "Relationships" in block:
+ for rel in block["Relationships"]:
+ if rel["Type"] == "CHILD":
+ for child_id in rel["Ids"]:
+ child = b_map.get(child_id)
+ if child:
+ if child["BlockType"] == "WORD":
+ text_parts.append(child["Text"])
+ elif child["BlockType"] == "SELECTION_ELEMENT":
+ text_parts.append(f"[{child['SelectionStatus']}]")
+ return " ".join(text_parts)
+
+ # text_line_number = 1
+
+ for block in text_blocks:
+ block_type = block.get("BlockType")
+
+ if block_type == "LINE":
+ bbox = block["Geometry"]["BoundingBox"]
+ line_info = {
+ "id": block["Id"],
+ "text": block.get("Text", ""),
+ "confidence": round(block.get("Confidence", 0.0), 0),
+ "words": [],
+ "geometry": {
+ "left": int(bbox["Left"] * page_width),
+ "top": int(bbox["Top"] * page_height),
+ "width": int(bbox["Width"] * page_width),
+ "height": int(bbox["Height"] * page_height),
+ },
+ }
+ if "Relationships" in block:
+ for rel in block.get("Relationships", []):
+ if rel["Type"] == "CHILD":
+ for child_id in rel["Ids"]:
+ word_block = block_map.get(child_id)
+ if word_block and word_block["BlockType"] == "WORD":
+ w_bbox = word_block["Geometry"]["BoundingBox"]
+ word_text = word_block.get("Text", "")
+ word_confidence = round(
+ word_block.get("Confidence", 0.0), 0
+ )
+ original_bounding_box = (
+ int(w_bbox["Left"] * page_width),
+ int(w_bbox["Top"] * page_height),
+ int(
+ (w_bbox["Left"] + w_bbox["Width"]) * page_width
+ ),
+ int(
+ (w_bbox["Top"] + w_bbox["Height"]) * page_height
+ ),
+ )
+
+ # Conditionally split word into alphanumeric parts and punctuation
+ if SPLIT_PUNCTUATION_FROM_WORDS:
+ split_words = split_word_with_punctuation(
+ word_text,
+ original_bounding_box,
+ word_confidence,
+ )
+ else:
+ # Original behavior: keep word as-is
+ split_words = [
+ {
+ "text": word_text,
+ "confidence": word_confidence,
+ "bounding_box": original_bounding_box,
+ }
+ ]
+
+ # Add all word parts to the line
+ for split_word in split_words:
+ line_info["words"].append(split_word)
+
+ # Handle handwriting - check if original word was handwriting
+ if word_block.get("TextType") == "HANDWRITING":
+ # For handwriting, create recognition results for each split part
+ for split_word in split_words:
+ split_bbox = split_word["bounding_box"]
+ rec_res = CustomImageRecognizerResult(
+ entity_type="HANDWRITING",
+ text=split_word["text"],
+ score=split_word["confidence"],
+ start=0,
+ end=len(split_word["text"]),
+ left=split_bbox[0],
+ top=split_bbox[1],
+ width=split_bbox[2] - split_bbox[0],
+ height=split_bbox[3] - split_bbox[1],
+ )
+ handwriting_recogniser_results.append(rec_res)
+ signature_or_handwriting_recogniser_results.append(
+ rec_res
+ )
+ lines_data.append(line_info)
+
+ elif block_type == "SELECTION_ELEMENT":
+ bbox = block["Geometry"]["BoundingBox"]
+ selections_data.append(
+ {
+ "id": block["Id"],
+ "status": block.get("SelectionStatus", "UNKNOWN"),
+ "confidence": round(block.get("Confidence", 0.0), 0),
+ "geometry": {
+ "left": int(bbox["Left"] * page_width),
+ "top": int(bbox["Top"] * page_height),
+ "width": int(bbox["Width"] * page_width),
+ "height": int(bbox["Height"] * page_height),
+ },
+ }
+ )
+
+ elif block_type == "SIGNATURE":
+ bbox = block["Geometry"]["BoundingBox"]
+ rec_res = CustomImageRecognizerResult(
+ entity_type="SIGNATURE",
+ text="SIGNATURE",
+ score=round(block.get("Confidence", 0.0), 0),
+ start=0,
+ end=9,
+ left=int(bbox["Left"] * page_width),
+ top=int(bbox["Top"] * page_height),
+ width=int(bbox["Width"] * page_width),
+ height=int(bbox["Height"] * page_height),
+ )
+ signature_recogniser_results.append(rec_res)
+ signature_or_handwriting_recogniser_results.append(rec_res)
+
+ # --- STAGE 2: Question-Answer Pair Extraction & Sorting ---
+ def _create_question_answer_results_object(text_blocks):
+ question_answer_results = list()
+ key_blocks = [
+ b
+ for b in text_blocks
+ if b.get("BlockType") == "KEY_VALUE_SET"
+ and "KEY" in b.get("EntityTypes", [])
+ ]
+ for question_block in key_blocks:
+ answer_block = next(
+ (
+ block_map.get(rel["Ids"][0])
+ for rel in question_block.get("Relationships", [])
+ if rel["Type"] == "VALUE"
+ ),
+ None,
+ )
+
+ # The check for value_block now happens BEFORE we try to access its properties.
+ if answer_block:
+ question_bbox = question_block["Geometry"]["BoundingBox"]
+ # We also get the answer_bbox safely inside this block.
+ answer_bbox = answer_block["Geometry"]["BoundingBox"]
+
+ question_answer_results.append(
+ {
+ # Data for final output
+ "Page": page_no,
+ "Question": _get_text_from_block(question_block, block_map),
+ "Answer": _get_text_from_block(answer_block, block_map),
+ "Confidence Score % (Question)": round(
+ question_block.get("Confidence", 0.0), 0
+ ),
+ "Confidence Score % (Answer)": round(
+ answer_block.get("Confidence", 0.0), 0
+ ),
+ "Question_left": round(question_bbox["Left"], 5),
+ "Question_top": round(question_bbox["Top"], 5),
+ "Question_width": round(question_bbox["Width"], 5),
+ "Question_height": round(question_bbox["Height"], 5),
+ "Answer_left": round(answer_bbox["Left"], 5),
+ "Answer_top": round(answer_bbox["Top"], 5),
+ "Answer_width": round(answer_bbox["Width"], 5),
+ "Answer_height": round(answer_bbox["Height"], 5),
+ }
+ )
+
+ question_answer_results.sort(
+ key=lambda item: (item["Question_top"], item["Question_left"])
+ )
+
+ return question_answer_results
+
+ question_answer_results = _create_question_answer_results_object(text_blocks)
+
+ # --- STAGE 3: Association of Selection Elements to Lines ---
+ unmatched_selections = list()
+ for selection in selections_data:
+ best_match_line = None
+ min_dist = float("inf")
+ sel_geom = selection["geometry"]
+ sel_y_center = sel_geom["top"] + sel_geom["height"] / 2
+ for line in lines_data:
+ line_geom = line["geometry"]
+ line_y_center = line_geom["top"] + line_geom["height"] / 2
+ if abs(sel_y_center - line_y_center) < line_geom["height"]:
+ dist = 0
+ if sel_geom["left"] > (line_geom["left"] + line_geom["width"]):
+ dist = sel_geom["left"] - (line_geom["left"] + line_geom["width"])
+ elif line_geom["left"] > (sel_geom["left"] + sel_geom["width"]):
+ dist = line_geom["left"] - (sel_geom["left"] + sel_geom["width"])
+ if dist < min_dist:
+ min_dist = dist
+ best_match_line = line
+ if best_match_line and min_dist < (best_match_line["geometry"]["height"] * 5):
+ selection_as_word = {
+ "text": f"[{selection['status']}]",
+ "confidence": round(selection["confidence"], 0),
+ "bounding_box": (
+ sel_geom["left"],
+ sel_geom["top"],
+ sel_geom["left"] + sel_geom["width"],
+ sel_geom["top"] + sel_geom["height"],
+ ),
+ }
+ best_match_line["words"].append(selection_as_word)
+ best_match_line["words"].sort(key=lambda w: w["bounding_box"][0])
+ else:
+ unmatched_selections.append(selection)
+
+ # --- STAGE 4: Final Output Generation ---
+ all_ocr_results = list()
+ ocr_results_with_words = dict()
+ selection_element_results = list()
+ for i, line in enumerate(lines_data):
+ line_num = i + 1
+ line_geom = line["geometry"]
+ reconstructed_text = " ".join(w["text"] for w in line["words"])
+ all_ocr_results.append(
+ OCRResult(
+ reconstructed_text,
+ line_geom["left"],
+ line_geom["top"],
+ line_geom["width"],
+ line_geom["height"],
+ round(line["confidence"], 0),
+ line_num,
+ )
+ )
+ ocr_results_with_words[f"text_line_{line_num}"] = {
+ "line": line_num,
+ "text": reconstructed_text,
+ "confidence": line["confidence"],
+ "bounding_box": (
+ line_geom["left"],
+ line_geom["top"],
+ line_geom["left"] + line_geom["width"],
+ line_geom["top"] + line_geom["height"],
+ ),
+ "words": line["words"],
+ "page": page_no,
+ }
+ for selection in unmatched_selections:
+ sel_geom = selection["geometry"]
+ sel_text = f"[{selection['status']}]"
+ all_ocr_results.append(
+ OCRResult(
+ sel_text,
+ sel_geom["left"],
+ sel_geom["top"],
+ sel_geom["width"],
+ sel_geom["height"],
+ round(selection["confidence"], 0),
+ -1,
+ )
+ )
+ for selection in selections_data:
+ sel_geom = selection["geometry"]
+ selection_element_results.append(
+ {
+ "status": selection["status"],
+ "confidence": round(selection["confidence"], 0),
+ "bounding_box": (
+ sel_geom["left"],
+ sel_geom["top"],
+ sel_geom["left"] + sel_geom["width"],
+ sel_geom["top"] + sel_geom["height"],
+ ),
+ "page": page_no,
+ }
+ )
+
+ all_ocr_results_with_page = {"page": page_no, "results": all_ocr_results}
+ ocr_results_with_words_with_page = {
+ "page": page_no,
+ "results": ocr_results_with_words,
+ }
+
+ return (
+ all_ocr_results_with_page,
+ signature_or_handwriting_recogniser_results,
+ signature_recogniser_results,
+ handwriting_recogniser_results,
+ ocr_results_with_words_with_page,
+ selection_element_results,
+ question_answer_results,
+ )
+
+
+def load_and_convert_textract_json(
+ textract_json_file_path: str,
+ log_files_output_paths: str,
+ page_sizes_df: pd.DataFrame,
+):
+ """
+ Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
+
+ Args:
+ textract_json_file_path (str): The file path to the Textract JSON output.
+ log_files_output_paths (str): A list of paths to log files, used for tracking.
+ page_sizes_df (pd.DataFrame): A DataFrame containing page size information for the document.
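+
+ Returns:
+ tuple: (textract_data, data_missing_flag, log_files_output_paths) - the flag is True when no
+ usable Textract data could be loaded or converted.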
+ """
+
+ if not os.path.exists(textract_json_file_path):
+ print("No existing Textract results file found.")
+ return (
+ {},
+ True,
+ log_files_output_paths,
+ ) # Return empty dict and flag indicating missing file
+
+ print("Found existing Textract json results file.")
+
+ # Track log files
+ if textract_json_file_path not in log_files_output_paths:
+ log_files_output_paths.append(textract_json_file_path)
+
+ try:
+ # Split the path into base directory and filename for security
+ textract_json_file_path_obj = Path(textract_json_file_path)
+ base_dir = textract_json_file_path_obj.parent
+ filename = textract_json_file_path_obj.name
+
+ json_content = secure_file_read(base_dir, filename, encoding="utf-8")
+ textract_data = json.loads(json_content)
+ except json.JSONDecodeError:
+ print("Error: Failed to parse Textract JSON file. Returning empty data.")
+ return {}, True, log_files_output_paths # Indicate failure
+
+ # Check if conversion is needed
+ if "pages" in textract_data:
+ print("JSON already in the correct format for app. No changes needed.")
+ return textract_data, False, log_files_output_paths # No conversion required
+
+ if "Blocks" in textract_data:
+ print("Need to convert Textract JSON to app format.")
+ try:
+
+ textract_data = restructure_textract_output(textract_data, page_sizes_df)
+ return (
+ textract_data,
+ False,
+ log_files_output_paths,
+ ) # Successfully converted
+
+ except Exception as e:
+ print("Failed to convert JSON data to app format due to:", e)
+ return {}, True, log_files_output_paths # Conversion failed
+ else:
+ print("Invalid Textract JSON format: 'Blocks' missing.")
+ # print("textract data:", textract_data)
+ return (
+ {},
+ True,
+ log_files_output_paths,
+ ) # Return empty data if JSON is not recognized
+
+
+def restructure_textract_output(textract_output: dict, page_sizes_df: pd.DataFrame):
+ """
+ Reorganise Textract output from the bulk Textract analysis option on AWS
+ into a format that works in this redaction app, reducing size.
+
+ Args:
+ textract_output (dict): The raw JSON output from AWS Textract.
+ page_sizes_df (pd.DataFrame): A Pandas DataFrame containing page size
+ information, including cropbox and mediabox
+ dimensions and offsets for each page.
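+
+ Returns:
+ dict: {"DocumentMetadata": ..., "pages": [{"page_no": "<n>", "data": {"Blocks": [...]}}]},
+ with block geometry re-normalised from CropBox- to MediaBox-relative coordinates where needed.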
+ """
+ pages_dict = dict()
+
+ # Extract total pages from DocumentMetadata
+ document_metadata = textract_output.get("DocumentMetadata", {})
+
+ # For efficient lookup, set 'page' as index if it's not already
+ if "page" in page_sizes_df.columns:
+ page_sizes_df = page_sizes_df.set_index("page")
+
+ for block in textract_output.get("Blocks", []):
+ page_no = block.get("Page", 1) # Default to 1 if missing
+
+ # --- Geometry Conversion Logic ---
+ try:
+ page_info = page_sizes_df.loc[page_no]
+ cb_width = page_info["cropbox_width"]
+ cb_height = page_info["cropbox_height"]
+ mb_width = page_info["mediabox_width"]
+ mb_height = page_info["mediabox_height"]
+ cb_x_offset = page_info["cropbox_x_offset"]
+ cb_y_offset_top = page_info["cropbox_y_offset_from_top"]
+
+ # Check if conversion is needed (and avoid division by zero)
+ needs_conversion = (
+ (abs(cb_width - mb_width) > 1e-6 or abs(cb_height - mb_height) > 1e-6)
+ and mb_width > 1e-6
+ and mb_height > 1e-6
+ ) # Avoid division by zero
+
+ if needs_conversion and "Geometry" in block:
+ geometry = block["Geometry"] # Work directly on the block's geometry
+
+ # --- Convert BoundingBox ---
+ if "BoundingBox" in geometry:
+ bbox = geometry["BoundingBox"]
+ old_left = bbox["Left"]
+ old_top = bbox["Top"]
+ old_width = bbox["Width"]
+ old_height = bbox["Height"]
+
+ # Calculate absolute coordinates within CropBox
+ abs_cb_x = old_left * cb_width
+ abs_cb_y = old_top * cb_height
+ abs_cb_width = old_width * cb_width
+ abs_cb_height = old_height * cb_height
+
+ # Calculate absolute coordinates relative to MediaBox top-left
+ abs_mb_x = cb_x_offset + abs_cb_x
+ abs_mb_y = cb_y_offset_top + abs_cb_y
+
+ # Convert back to normalized coordinates relative to MediaBox
+ bbox["Left"] = abs_mb_x / mb_width
+ bbox["Top"] = abs_mb_y / mb_height
+ bbox["Width"] = abs_cb_width / mb_width
+ bbox["Height"] = abs_cb_height / mb_height
+ except KeyError:
+ print(
+ f"Warning: Page number {page_no} not found in page_sizes_df. Skipping coordinate conversion for this block."
+ )
+ # Decide how to handle missing page info: skip conversion, raise error, etc.
+ except ZeroDivisionError:
+ print(
+ f"Warning: MediaBox width or height is zero for page {page_no}. Skipping coordinate conversion for this block."
+ )
+
+ # Initialise page structure if not already present
+ if page_no not in pages_dict:
+ pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}
+
+ # Keep only essential fields to reduce size
+ filtered_block = {
+ field: block[field]
+ for field in [
+ "BlockType",
+ "Confidence",
+ "Text",
+ "Geometry",
+ "Page",
+ "Id",
+ "Relationships",
+ ]
+ if field in block
+ }
+
+ pages_dict[page_no]["data"]["Blocks"].append(filtered_block)
+
+ # Convert pages dictionary to a sorted list
+ structured_output = {
+ "DocumentMetadata": document_metadata, # Store metadata separately
+ "pages": [pages_dict[page] for page in sorted(pages_dict.keys())],
+ }
+
+ return structured_output
+
+
+def convert_question_answer_to_dataframe(
+ question_answer_results: List[Dict[str, Any]], page_sizes_df: pd.DataFrame
+) -> pd.DataFrame:
+ """
+ Convert question-answer results to DataFrame format matching convert_annotation_data_to_dataframe.
+
+ Each Question and Answer will be on separate lines in the resulting dataframe.
+ The 'image' column is taken from the page's 'image_path' in page_sizes_df, falling back to
+ f'placeholder_image_{page}.png' when no image path is available.
+
+ Args:
+ question_answer_results: List of question-answer dictionaries from _create_question_answer_results_object
+ page_sizes_df: DataFrame containing page sizes
+
+ Returns:
+ pd.DataFrame: DataFrame with columns ["image", "page", "label", "color", "xmin", "xmax", "ymin", "ymax", "text", "id"]
+ """
+
+ if not question_answer_results:
+ # Return empty DataFrame with expected schema
+ return pd.DataFrame(
+ columns=[
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "text",
+ "id",
+ ]
+ )
+
+ # Prepare data for DataFrame
+ rows = list()
+ existing_ids = set()
+
+ for i, qa_result in enumerate(question_answer_results):
+ page_num = int(qa_result.get("Page", 1))
+ page_sizes_df["page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
+ page_sizes_df.dropna(subset=["page"], inplace=True)
+ if not page_sizes_df.empty:
+ page_sizes_df["page"] = page_sizes_df["page"].astype(int)
+ else:
+ print("Warning: Page sizes DataFrame became empty after processing.")
+
+ image_name = page_sizes_df.loc[
+ page_sizes_df["page"] == page_num, "image_path"
+ ].iloc[0]
+ if pd.isna(image_name):
+ image_name = f"placeholder_image_{page_num}.png"
+
+ # Create Question row
+ question_bbox = {
+ "Question_left": qa_result.get("Question_left", 0),
+ "Question_top": qa_result.get("Question_top", 0),
+ "Question_width": qa_result.get("Question_width", 0),
+ "Question_height": qa_result.get("Question_height", 0),
+ }
+
+ question_row = {
+ "image": image_name,
+ "page": page_num,
+ "label": f"Question {i+1}",
+ "color": "(0,0,255)",
+ "xmin": question_bbox["Question_left"],
+ "xmax": question_bbox["Question_left"] + question_bbox["Question_width"],
+ "ymin": question_bbox["Question_top"],
+ "ymax": question_bbox["Question_top"] + question_bbox["Question_height"],
+ "text": qa_result.get("Question", ""),
+ "id": None, # Will be filled after generating IDs
+ }
+
+ # Create Answer row
+ answer_bbox = {
+ "Answer_left": qa_result.get("Answer_left", 0),
+ "Answer_top": qa_result.get("Answer_top", 0),
+ "Answer_width": qa_result.get("Answer_width", 0),
+ "Answer_height": qa_result.get("Answer_height", 0),
+ }
+
+ answer_row = {
+ "image": image_name,
+ "page": page_num,
+ "label": f"Answer {i+1}",
+ "color": "(0,255,0)",
+ "xmin": answer_bbox["Answer_left"],
+ "xmax": answer_bbox["Answer_left"] + answer_bbox["Answer_width"],
+ "ymin": answer_bbox["Answer_top"],
+ "ymax": answer_bbox["Answer_top"] + answer_bbox["Answer_height"],
+ "text": qa_result.get("Answer", ""),
+ "id": None, # Will be filled after generating IDs
+ }
+
+ rows.extend([question_row, answer_row])
+
+ # Generate unique IDs for all rows
+ num_ids_needed = len(rows)
+ unique_ids = _generate_unique_ids(num_ids_needed, existing_ids)
+
+ # Assign IDs to rows
+ for i, row in enumerate(rows):
+ row["id"] = unique_ids[i]
+
+ # Create DataFrame
+ df = pd.DataFrame(rows)
+
+ # Ensure all required columns are present and in correct order
+ required_columns = [
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "text",
+ "id",
+ ]
+ for col in required_columns:
+ if col not in df.columns:
+ df[col] = pd.NA
+
+ # Reorder columns to match expected format
+ df = df.reindex(columns=required_columns, fill_value=pd.NA)
+
+ return df
+
+
+def convert_question_answer_to_annotation_json(
+ question_answer_results: List[Dict[str, Any]], page_sizes_df: pd.DataFrame
+) -> List[Dict]:
+ """
+ Convert question-answer results directly to Gradio Annotation JSON format.
+
+ This function combines the functionality of convert_question_answer_to_dataframe
+ and convert_review_df_to_annotation_json to directly convert question-answer
+ results to the annotation JSON format without the intermediate DataFrame step.
+
+ Args:
+ question_answer_results: List of question-answer dictionaries from _create_question_answer_results_object
+ page_sizes_df: DataFrame containing page sizes with columns ['page', 'image_path', 'image_width', 'image_height']
+
+ Returns:
+ List of dictionaries suitable for Gradio Annotation output, one dict per image/page.
+ Each dict has structure: {"image": image_path, "boxes": [list of annotation boxes]}
+ """
+
+ if not question_answer_results:
+ # Return empty structure based on page_sizes_df
+ json_data = list()
+ for _, row in page_sizes_df.iterrows():
+ json_data.append(
+ {
+ "image": row.get(
+ "image_path", f"placeholder_image_{row.get('page', 1)}.png"
+ ),
+ "boxes": [],
+ }
+ )
+ return json_data
+
+ # Validate required columns in page_sizes_df
+ required_ps_cols = {"page", "image_path", "image_width", "image_height"}
+ if not required_ps_cols.issubset(page_sizes_df.columns):
+ missing = required_ps_cols - set(page_sizes_df.columns)
+ raise ValueError(f"page_sizes_df is missing required columns: {missing}")
+
+ # Convert page sizes columns to appropriate numeric types
+ page_sizes_df = page_sizes_df.copy() # Work with a copy to avoid modifying original
+ page_sizes_df["page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
+ page_sizes_df["image_width"] = pd.to_numeric(
+ page_sizes_df["image_width"], errors="coerce"
+ )
+ page_sizes_df["image_height"] = pd.to_numeric(
+ page_sizes_df["image_height"], errors="coerce"
+ )
+ page_sizes_df["page"] = page_sizes_df["page"].astype("Int64")
+
+ # Prepare data for processing
+ rows = list()
+ existing_ids = set()
+
+ for i, qa_result in enumerate(question_answer_results):
+ page_num = int(qa_result.get("Page", 1))
+
+ # Create Question box.
+ question_bbox = {
+ "Question_left": qa_result.get("Question_left", 0),
+ "Question_top": qa_result.get("Question_top", 0),
+ "Question_width": qa_result.get("Question_width", 0),
+ "Question_height": qa_result.get("Question_height", 0),
+ }
+
+ question_box = {
+ "label": f"Question {i+1}",
+ "color": (0, 0, 255), # Blue for questions
+ "xmin": question_bbox["Question_left"],
+ "xmax": question_bbox["Question_left"] + question_bbox["Question_width"],
+ "ymin": question_bbox["Question_top"],
+ "ymax": question_bbox["Question_top"] + question_bbox["Question_height"],
+ "text": qa_result.get("Question", ""),
+ "id": None, # Will be filled after generating IDs
+ }
+
+ # Create Answer box
+ answer_bbox = {
+ "Answer_left": qa_result.get("Answer_left", 0),
+ "Answer_top": qa_result.get("Answer_top", 0),
+ "Answer_width": qa_result.get("Answer_width", 0),
+ "Answer_height": qa_result.get("Answer_height", 0),
+ }
+
+ answer_box = {
+ "label": f"Answer {i+1}",
+ "color": (0, 255, 0), # Green for answers
+ "xmin": answer_bbox["Answer_left"],
+ "xmax": answer_bbox["Answer_left"] + answer_bbox["Answer_width"],
+ "ymin": answer_bbox["Answer_top"],
+ "ymax": answer_bbox["Answer_top"] + answer_bbox["Answer_height"],
+ "text": qa_result.get("Answer", ""),
+ "id": None, # Will be filled after generating IDs
+ }
+
+ rows.extend([(page_num, question_box), (page_num, answer_box)])
+
+ # Generate unique IDs for all boxes
+ num_ids_needed = len(rows)
+ unique_ids = _generate_unique_ids(num_ids_needed, existing_ids)
+
+ # Assign IDs to boxes
+ for i, (page_num, box) in enumerate(rows):
+ box["id"] = unique_ids[i]
+ rows[i] = (page_num, box)
+
+ # Group boxes by page
+ boxes_by_page = {}
+ for page_num, box in rows:
+ if page_num not in boxes_by_page:
+ boxes_by_page[page_num] = list()
+ boxes_by_page[page_num].append(box)
+
+ # Build JSON structure based on page_sizes
+ json_data = list()
+ for _, row in page_sizes_df.iterrows():
+ page_num = row["page"]
+ pdf_image_path = row["image_path"]
+
+ # Get boxes for this page
+ annotation_boxes = boxes_by_page.get(page_num, [])
+
+ # Append the structured data for this image/page
+ json_data.append({"image": pdf_image_path, "boxes": annotation_boxes})
+
+ return json_data
+
+
+def convert_page_question_answer_to_custom_image_recognizer_results(
+ question_answer_results: List[Dict[str, Any]],
+ page_sizes_df: pd.DataFrame,
+ reported_page_number: int,
+) -> List["CustomImageRecognizerResult"]:
+ """
+ Convert question-answer results to a list of CustomImageRecognizerResult objects.
+
+ Args:
+ question_answer_results: List of question-answer dictionaries from _create_question_answer_results_object
+ page_sizes_df: DataFrame containing page sizes with columns ['page', 'image_path', 'image_width', 'image_height']
+ reported_page_number: The page number reported by the user
+ Returns:
+ List of CustomImageRecognizerResult objects for questions and answers
+ """
+ from tools.custom_image_analyser_engine import CustomImageRecognizerResult
+
+ if not question_answer_results:
+ return list()
+
+ results = list()
+
+ # Pre-process page_sizes_df once for efficiency
+ page_sizes_df["page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
+ page_sizes_df.dropna(subset=["page"], inplace=True)
+ if not page_sizes_df.empty:
+ page_sizes_df["page"] = page_sizes_df["page"].astype(int)
+ else:
+ print("Warning: Page sizes DataFrame became empty after processing.")
+ return list() # Return empty list if no page sizes are available
+
+ page_row = page_sizes_df.loc[page_sizes_df["page"] == int(reported_page_number)]
+
+ if page_row.empty:
+ print(
+ f"Warning: Page {reported_page_number} not found in page_sizes_df. Skipping this entry."
+ )
+ return list() # Return empty list if page not found
+
+ for i, qa_result in enumerate(question_answer_results):
+ current_page = int(qa_result.get("Page", 1))
+
+ if current_page != int(reported_page_number):
+ continue # Skip this entry if page number does not match reported page number
+
+ # Get image dimensions safely
+ # Textract coordinates are normalized (0-1) relative to MediaBox
+ # We need to convert to image coordinates, not PDF page coordinates
+ # Try to get image dimensions first, fallback to mediabox if not available
+ try:
+ if "image_width" in page_sizes_df.columns:
+ image_width_val = page_row["image_width"].iloc[0]
+ if pd.notna(image_width_val) and image_width_val > 0:
+ image_width = image_width_val
+ else:
+ image_width = page_row["mediabox_width"].iloc[0]
+ else:
+ image_width = page_row["mediabox_width"].iloc[0]
+ except (KeyError, IndexError):
+ image_width = page_row["mediabox_width"].iloc[0]
+
+ try:
+ if "image_height" in page_sizes_df.columns:
+ image_height_val = page_row["image_height"].iloc[0]
+ if pd.notna(image_height_val) and image_height_val > 0:
+ image_height = image_height_val
+ else:
+ image_height = page_row["mediabox_height"].iloc[0]
+ else:
+ image_height = page_row["mediabox_height"].iloc[0]
+ except (KeyError, IndexError):
+ image_height = page_row["mediabox_height"].iloc[0]
+
+ # Get question and answer text safely
+ question_text = qa_result.get("Question", "")
+ answer_text = qa_result.get("Answer", "")
+
+ # Get scores and handle potential type issues
+ question_score = float(qa_result.get("Confidence Score % (Question)", 0.0))
+ answer_score = float(qa_result.get("Confidence Score % (Answer)", 0.0))
+
+ # --- Process Question Bounding Box ---
+ question_bbox = {
+ "left": qa_result.get("Question_left", 0) * image_width,
+ "top": qa_result.get("Question_top", 0) * image_height,
+ "width": qa_result.get("Question_width", 0) * image_width,
+ "height": qa_result.get("Question_height", 0) * image_height,
+ }
+
+ question_result = CustomImageRecognizerResult(
+ entity_type=f"QUESTION {i+1}",
+ start=0,
+ end=len(question_text),
+ score=question_score,
+ left=float(question_bbox.get("left", 0)),
+ top=float(question_bbox.get("top", 0)),
+ width=float(question_bbox.get("width", 0)),
+ height=float(question_bbox.get("height", 0)),
+ text=question_text,
+ color=(0, 0, 255),
+ )
+ results.append(question_result)
+
+ # --- Process Answer Bounding Box ---
+ answer_bbox = {
+ "left": qa_result.get("Answer_left", 0) * image_width,
+ "top": qa_result.get("Answer_top", 0) * image_height,
+ "width": qa_result.get("Answer_width", 0) * image_width,
+ "height": qa_result.get("Answer_height", 0) * image_height,
+ }
+
+ answer_result = CustomImageRecognizerResult(
+ entity_type=f"ANSWER {i+1}",
+ start=0,
+ end=len(answer_text),
+ score=answer_score,
+ left=float(answer_bbox.get("left", 0)),
+ top=float(answer_bbox.get("top", 0)),
+ width=float(answer_bbox.get("width", 0)),
+ height=float(answer_bbox.get("height", 0)),
+ text=answer_text,
+ color=(0, 255, 0),
+ )
+ results.append(answer_result)
+
+ return results
diff --git a/tools/cli_usage_logger.py b/tools/cli_usage_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5a5ff37c69b8c688227ca519faa25eca2849e83
--- /dev/null
+++ b/tools/cli_usage_logger.py
@@ -0,0 +1,337 @@
+"""
+CLI Usage Logger - A simplified version of the Gradio CSVLogger_custom for CLI usage logging.
+This module provides functionality to log usage data from CLI operations to CSV files and optionally DynamoDB.
+"""
+
+import csv
+import os
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import Any, List
+
+import boto3
+
+from tools.aws_functions import upload_log_file_to_s3
+from tools.config import (
+ AWS_ACCESS_KEY,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ CSV_USAGE_LOG_HEADERS,
+ DOCUMENT_REDACTION_BUCKET,
+ DYNAMODB_USAGE_LOG_HEADERS,
+ HOST_NAME,
+ RUN_AWS_FUNCTIONS,
+ S3_USAGE_LOGS_FOLDER,
+ SAVE_LOGS_TO_CSV,
+ SAVE_LOGS_TO_DYNAMODB,
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
+ USAGE_LOGS_FOLDER,
+)
+
+
+class CLIUsageLogger:
+ """
+ A simplified usage logger for CLI operations that mimics the functionality
+ of the Gradio CSVLogger_custom class.
+ """
+
+ def __init__(
+ self, dataset_file_name: str = "usage_log.csv", logs_folder: str = None
+ ):
+ """
+ Initialize the CLI usage logger.
+
+ Args:
+ dataset_file_name: Name of the CSV file to store logs
+ logs_folder: Custom folder for logs (uses USAGE_LOGS_FOLDER if None)
+ """
+ self.dataset_file_name = dataset_file_name
+ self.flagging_dir = Path(logs_folder if logs_folder else USAGE_LOGS_FOLDER)
+ self.dataset_filepath = None
+ self.headers = None
+
+ def setup(self, headers: List[str]):
+ """
+ Setup the logger with the specified headers.
+
+ Args:
+ headers: List of column headers for the CSV file
+ """
+ self.headers = headers
+ self._create_dataset_file()
+
+ def _create_dataset_file(self):
+ """Create the dataset CSV file with headers if it doesn't exist."""
+ os.makedirs(self.flagging_dir, exist_ok=True)
+
+ # Add ID and timestamp to headers (matching custom_csvlogger.py structure)
+ full_headers = self.headers + ["id", "timestamp"]
+
+ self.dataset_filepath = self.flagging_dir / self.dataset_file_name
+
+ if not Path(self.dataset_filepath).exists():
+ with open(
+ self.dataset_filepath, "w", newline="", encoding="utf-8"
+ ) as csvfile:
+ writer = csv.writer(csvfile)
+ writer.writerow(full_headers)
+ print(f"Created usage log file at: {self.dataset_filepath}")
+ else:
+ print(f"Using existing usage log file at: {self.dataset_filepath}")
+
+ def log_usage(
+ self,
+ data: List[Any],
+ save_to_csv: bool = None,
+ save_to_dynamodb: bool = None,
+ save_to_s3: bool = None,
+ s3_bucket: str = None,
+ s3_key_prefix: str = None,
+ dynamodb_table_name: str = None,
+ dynamodb_headers: List[str] = None,
+ replacement_headers: List[str] = None,
+ ) -> int:
+ """
+ Log usage data to CSV and optionally DynamoDB and S3.
+
+ Args:
+ data: List of data values to log
+ save_to_csv: Whether to save to CSV (defaults to config setting)
+ save_to_dynamodb: Whether to save to DynamoDB (defaults to config setting)
+ save_to_s3: Whether to save to S3 (defaults to config setting)
+ s3_bucket: S3 bucket name (defaults to config setting)
+ s3_key_prefix: S3 key prefix (defaults to config setting)
+ dynamodb_table_name: DynamoDB table name (defaults to config setting)
+ dynamodb_headers: DynamoDB headers (defaults to config setting)
+ replacement_headers: Replacement headers for CSV (defaults to config setting)
+
+ Returns:
+ Number of lines written
+ """
+ # Use config defaults if not specified
+ if save_to_csv is None:
+ save_to_csv = SAVE_LOGS_TO_CSV
+ if save_to_dynamodb is None:
+ save_to_dynamodb = SAVE_LOGS_TO_DYNAMODB
+ if save_to_s3 is None:
+ save_to_s3 = RUN_AWS_FUNCTIONS and SAVE_LOGS_TO_CSV
+ if s3_bucket is None:
+ s3_bucket = DOCUMENT_REDACTION_BUCKET
+ if s3_key_prefix is None:
+ s3_key_prefix = S3_USAGE_LOGS_FOLDER
+ if dynamodb_table_name is None:
+ dynamodb_table_name = USAGE_LOG_DYNAMODB_TABLE_NAME
+ if dynamodb_headers is None:
+ dynamodb_headers = DYNAMODB_USAGE_LOG_HEADERS
+ if replacement_headers is None:
+ replacement_headers = CSV_USAGE_LOG_HEADERS
+
+ # Generate unique ID and add timestamp (matching custom_csvlogger.py structure)
+ generated_id = str(uuid.uuid4())
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
+ :-3
+ ] # Correct format for Amazon Athena
+ csv_data = data + [generated_id, timestamp]
+
+ line_count = 0
+
+ # Save to CSV
+ if save_to_csv and self.dataset_filepath:
+ try:
+ with open(
+ self.dataset_filepath, "a", newline="", encoding="utf-8-sig"
+ ) as csvfile:
+ writer = csv.writer(csvfile)
+ writer.writerow(csv_data)
+ line_count = 1
+ print(f"Logged usage data to CSV: {self.dataset_filepath}")
+ except Exception as e:
+ print(f"Error writing to CSV: {e}")
+
+ # Upload to S3 if enabled
+ if save_to_s3 and self.dataset_filepath and s3_bucket and s3_key_prefix:
+ try:
+ # Upload the log file to S3
+ upload_result = upload_log_file_to_s3(
+ local_file_paths=[str(self.dataset_filepath)],
+ s3_key=s3_key_prefix,
+ s3_bucket=s3_bucket,
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+ SAVE_LOGS_TO_CSV=SAVE_LOGS_TO_CSV,
+ )
+ print(f"S3 upload result: {upload_result}")
+ except Exception as e:
+ print(f"Error uploading log file to S3: {e}")
+
+ # Save to DynamoDB
+ if save_to_dynamodb and dynamodb_table_name and dynamodb_headers:
+ try:
+ # Initialize DynamoDB client
+ if AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ dynamodb = boto3.resource(
+ "dynamodb",
+ region_name=AWS_REGION,
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ )
+ else:
+ dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
+
+ table = dynamodb.Table(dynamodb_table_name)
+
+ # Reuse the ID and timestamp generated above so the DynamoDB item matches the CSV row
+
+ # Prepare the DynamoDB item
+ item = {
+ "id": generated_id,
+ "timestamp": timestamp,
+ }
+
+ # Map the headers to values
+ item.update(
+ {
+ header: str(value)
+ for header, value in zip(dynamodb_headers, data)
+ }
+ )
+
+ table.put_item(Item=item)
+ print("Successfully uploaded usage log to DynamoDB")
+
+ except Exception as e:
+ print(f"Could not upload usage log to DynamoDB: {e}")
+
+ return line_count
+
+
+def create_cli_usage_logger(logs_folder: str = None) -> CLIUsageLogger:
+ """
+ Create and setup a CLI usage logger with the standard headers.
+
+ Args:
+ logs_folder: Custom folder for logs (uses USAGE_LOGS_FOLDER if None)
+
+ Returns:
+ Configured CLIUsageLogger instance
+ """
+ # Use CSV headers from config (already parsed as list)
+ try:
+ headers = CSV_USAGE_LOG_HEADERS
+ if not headers or len(headers) == 0:
+ raise ValueError("Empty headers list")
+ except Exception as e:
+ print(f"Error using CSV usage log headers: {e}")
+ # Fallback headers if parsing fails
+ headers = [
+ "session_hash_textbox",
+ "doc_full_file_name_textbox",
+ "data_full_file_name_textbox",
+ "actual_time_taken_number",
+ "total_page_count",
+ "textract_query_number",
+ "pii_detection_method",
+ "comprehend_query_number",
+ "cost_code",
+ "textract_handwriting_signature",
+ "host_name_textbox",
+ "text_extraction_method",
+ "is_this_a_textract_api_call",
+ "task",
+            "vlm_model_name",
+            "vlm_total_input_tokens",
+            "vlm_total_output_tokens",
+            "llm_model_name",
+            "llm_total_input_tokens",
+            "llm_total_output_tokens",
+        ]
+
+ logger = CLIUsageLogger(logs_folder=logs_folder)
+ logger.setup(headers)
+ return logger
+
+
+def log_redaction_usage(
+ logger: CLIUsageLogger,
+ session_hash: str,
+ doc_file_name: str,
+ data_file_name: str,
+ time_taken: float,
+ total_pages: int,
+ textract_queries: int,
+ pii_method: str,
+ comprehend_queries: int,
+ cost_code: str,
+ handwriting_signature: str,
+ text_extraction_method: str,
+ is_textract_call: bool,
+ task: str,
+ save_to_dynamodb: bool = None,
+ save_to_s3: bool = None,
+ s3_bucket: str = None,
+ s3_key_prefix: str = None,
+ vlm_model_name: str = "",
+ vlm_total_input_tokens: int = 0,
+ vlm_total_output_tokens: int = 0,
+ llm_model_name: str = "",
+ llm_total_input_tokens: int = 0,
+ llm_total_output_tokens: int = 0,
+):
+ """
+ Log redaction usage data using the provided logger.
+
+ Args:
+ logger: CLIUsageLogger instance
+ session_hash: Session identifier
+ doc_file_name: Document file name (or placeholder if not displaying names)
+ data_file_name: Data file name (or placeholder if not displaying names)
+ time_taken: Time taken for processing in seconds
+ total_pages: Total number of pages processed
+ textract_queries: Number of Textract API calls made
+ pii_method: PII detection method used
+ comprehend_queries: Number of Comprehend API calls made
+ cost_code: Cost code for the operation
+ handwriting_signature: Handwriting/signature extraction options
+ text_extraction_method: Text extraction method used
+ is_textract_call: Whether this was a Textract API call
+ task: The task performed (redact, deduplicate, textract)
+ save_to_dynamodb: Whether to save to DynamoDB (overrides config default)
+ save_to_s3: Whether to save to S3 (overrides config default)
+ s3_bucket: S3 bucket name (overrides config default)
+ s3_key_prefix: S3 key prefix (overrides config default)
+ vlm_model_name: VLM model name used for OCR
+ vlm_total_input_tokens: Total VLM input tokens used
+ vlm_total_output_tokens: Total VLM output tokens used
+ llm_model_name: LLM model name used for PII detection
+ llm_total_input_tokens: Total LLM input tokens used
+ llm_total_output_tokens: Total LLM output tokens used
+ """
+ # Caller is responsible for masking: pass placeholder doc/data when not
+ # displaying file names (matches app.py behaviour per task).
+ rounded_time_taken = round(time_taken, 2)
+
+ data = [
+ session_hash,
+ doc_file_name,
+ data_file_name,
+ rounded_time_taken,
+ total_pages,
+ textract_queries,
+ pii_method,
+ comprehend_queries,
+ cost_code,
+ handwriting_signature,
+ HOST_NAME,
+ text_extraction_method,
+ is_textract_call,
+ task,
+ vlm_model_name,
+ vlm_total_input_tokens,
+ vlm_total_output_tokens,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ ]
+
+ logger.log_usage(
+ data,
+ save_to_dynamodb=save_to_dynamodb,
+ save_to_s3=save_to_s3,
+ s3_bucket=s3_bucket,
+ s3_key_prefix=s3_key_prefix,
+ )
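+
+
+# Illustrative usage (a sketch; all values below are placeholders, not real data):
+#
+#   logger = create_cli_usage_logger()
+#   log_redaction_usage(
+#       logger,
+#       session_hash="abc123",
+#       doc_file_name="document.pdf",
+#       data_file_name="",
+#       time_taken=12.3,
+#       total_pages=4,
+#       textract_queries=0,
+#       pii_method="Local",
+#       comprehend_queries=0,
+#       cost_code="",
+#       handwriting_signature="",
+#       text_extraction_method="Local model - selectable text",
+#       is_textract_call=False,
+#       task="redact",
+#   )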
diff --git a/tools/config.py b/tools/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..779fa65862ed4408ee067fa2a8d7b0f33167a196
--- /dev/null
+++ b/tools/config.py
@@ -0,0 +1,2574 @@
+import logging
+import os
+import re
+import socket
+import tempfile
+import urllib.parse
+from datetime import datetime
+from pathlib import Path
+from typing import List
+
+import bleach
+from dotenv import load_dotenv
+from tldextract import TLDExtract
+
+from tools.secure_path_utils import (
+ secure_file_read,
+ secure_path_join,
+ validate_path_safety,
+)
+
+today_rev = datetime.now().strftime("%Y%m%d")
+HOST_NAME = socket.gethostname()
+
+
+def _get_env_list(env_var_name: str) -> List[str]:
+ """Parses a comma-separated environment variable into a list of strings."""
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
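+    # e.g. the raw value '["NAME", "EMAIL", "PHONE"]' becomes ["NAME", "EMAIL", "PHONE"]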
+ if not value:
+ return []
+ # Split by comma and filter out any empty strings that might result from extra commas
+ return [s.strip() for s in value.split(",") if s.strip()]
+
+
+# Set or retrieve configuration variables for the redaction app
+
+
+def convert_string_to_boolean(value: str) -> bool:
+ """Convert string to boolean, handling various formats."""
+ if isinstance(value, bool):
+ return value
+ elif value in ["True", "1", "true", "TRUE"]:
+ return True
+ elif value in ["False", "0", "false", "FALSE"]:
+ return False
+ else:
+ raise ValueError(f"Invalid boolean value: {value}")
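+
+# e.g. convert_string_to_boolean("TRUE") -> True; convert_string_to_boolean("maybe") raises ValueError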
+
+
+def ensure_folder_within_app_directory(
+ folder_path: str, app_base_dir: str = None
+) -> str:
+ """
+ Ensure that a folder path is within the app directory for security.
+
+ This function validates that user-defined folder paths are contained within
+ the app directory to prevent path traversal attacks and ensure data isolation.
+
+ Args:
+ folder_path: The folder path to validate and normalize
+ app_base_dir: The base directory of the app (defaults to current working directory)
+
+ Returns:
+ A normalized folder path that is guaranteed to be within the app directory
+
+ Raises:
+ ValueError: If the path cannot be safely contained within the app directory
+ """
+ if not folder_path or not folder_path.strip():
+ return folder_path
+
+ # Get the app base directory (where the app is run from)
+ if app_base_dir is None:
+ app_base_dir = os.getcwd()
+
+ app_base_dir = Path(app_base_dir).resolve()
+ folder_path = folder_path.strip()
+
+ # Preserve trailing separator preference
+ has_trailing_sep = folder_path.endswith(("/", "\\"))
+
+ # Handle special case for "TEMP" - this is handled separately in the code
+ if folder_path == "TEMP":
+ return folder_path
+
+ # Handle absolute paths
+ if os.path.isabs(folder_path):
+ folder_path_resolved = Path(folder_path).resolve()
+ # Check if the absolute path is within the app directory
+ try:
+ folder_path_resolved.relative_to(app_base_dir)
+ # Path is already within app directory, return it normalized
+ result = str(folder_path_resolved)
+ if has_trailing_sep and not result.endswith(os.sep):
+ result = result + os.sep
+ return result
+ except ValueError:
+ # Path is outside app directory - this is a security issue
+ # For system paths like /usr/share/tessdata, we'll allow them but log a warning
+ # For other absolute paths outside app directory, we'll raise an error
+ normalized_path = os.path.normpath(folder_path).lower()
+ system_path_prefixes = [
+ "/usr",
+ "/opt",
+ "/var",
+ "/etc",
+ "/tmp",
+ ]
+ if any(
+ normalized_path.startswith(prefix) for prefix in system_path_prefixes
+ ):
+ # System paths are allowed but we log a warning
+ print(
+ f"Warning: Using system path outside app directory: {folder_path}"
+ )
+ return folder_path
+ else:
+ raise ValueError(
+ f"Folder path '{folder_path}' is outside the app directory '{app_base_dir}'. "
+ f"For security, all user-defined folder paths must be within the app directory."
+ )
+
+ # Handle relative paths - ensure they're within app directory
+ try:
+ # Use secure_path_join to safely join and validate
+ # This will prevent path traversal attacks (e.g., "../../etc/passwd")
+ safe_path = secure_path_join(app_base_dir, folder_path)
+ result = str(safe_path)
+ if has_trailing_sep and not result.endswith(os.sep):
+ result = result + os.sep
+ return result
+ except (PermissionError, ValueError) as e:
+ # If path contains dangerous patterns, sanitize and try again
+ # Extract just the folder name from the path to prevent traversal
+ folder_name = os.path.basename(folder_path.rstrip("/\\"))
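+        # e.g. "../../etc/secrets" is reduced to "secrets" and re-joined under the app directory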
+ if folder_name:
+ safe_path = secure_path_join(app_base_dir, folder_name)
+ result = str(safe_path)
+ if has_trailing_sep and not result.endswith(os.sep):
+ result = result + os.sep
+ print(
+ f"Warning: Sanitized folder path '{folder_path}' to '{result}' for security"
+ )
+ return result
+ else:
+ raise ValueError(
+ f"Cannot safely normalize folder path: {folder_path}"
+ ) from e
+
+
+def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
+ """
+ Get an environmental variable, and set it to a default value if it doesn't exist
+ """
+ # Get the environment variable if it exists
+ value = os.environ.get(var_name)
+
+ # If it doesn't exist, set the environment variable to the default value
+ if value is None:
+ os.environ[var_name] = default_value
+ value = default_value
+
+ if print_val is True:
+ print(f"The value of {var_name} is {value}")
+
+ return value
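+
+# e.g. get_or_create_env_var("MY_FLAG", "False") returns "False" and also sets os.environ["MY_FLAG"] if it was unset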
+
+
+def add_folder_to_path(folder_path: str):
+ """
+    Check if a folder exists on the system. If so, resolve it to an absolute path and add it to the PATH environment variable if it is not already present. Only relevant for locally built executables of this app: PyInstaller creates an _internal folder containing Tesseract and Poppler, which must be on PATH for the app to run.
+ """
+
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
+ # print(folder_path, "folder exists.")
+
+ # Resolve relative path to absolute path
+ absolute_path = os.path.abspath(folder_path)
+
+ current_path = os.environ["PATH"]
+ if absolute_path not in current_path.split(os.pathsep):
+ full_path_extension = absolute_path + os.pathsep + current_path
+ os.environ["PATH"] = full_path_extension
+ # print(f"Updated PATH with: ", full_path_extension)
+ else:
+ pass
+ # print(f"Directory {folder_path} already exists in PATH.")
+ else:
+ print(f"Folder not found at {folder_path} - not added to PATH")
+
+
+def validate_safe_url(url_candidate: str, allowed_domains: list = None) -> str:
+ """
+ Validate and return a safe URL with enhanced security checks.
+ """
+ if allowed_domains is None:
+ allowed_domains = [
+ "seanpedrick-case.github.io",
+ "github.io",
+ "github.com",
+ "sharepoint.com",
+ ]
+
+ try:
+ parsed = urllib.parse.urlparse(url_candidate)
+
+ # Basic structure validation
+ if not parsed.scheme or not parsed.netloc:
+ raise ValueError("Invalid URL structure")
+
+ # Security checks
+ if parsed.scheme not in ["https"]: # Only allow HTTPS
+ raise ValueError("Only HTTPS URLs are allowed for security")
+
+ # Domain validation
+ domain = parsed.netloc.lower()
+ if not any(domain.endswith(allowed) for allowed in allowed_domains):
+ raise ValueError(f"Domain not in allowed list: {domain}")
+
+ # Additional security checks
+ if any(
+ suspicious in domain for suspicious in ["..", "//", "javascript:", "data:"]
+ ):
+ raise ValueError("Suspicious URL patterns detected")
+
+ # Path validation (prevent path traversal)
+ if ".." in parsed.path or "//" in parsed.path:
+ raise ValueError("Path traversal attempts detected")
+
+ return url_candidate
+
+ except Exception as e:
+ print(f"URL validation failed: {e}")
+ return "https://seanpedrick-case.github.io/doc_redaction" # Safe fallback
+
+
+def sanitize_markdown_text(text: str) -> str:
+ """
+ Sanitize markdown text by removing dangerous HTML/scripts while preserving
+ safe markdown syntax.
+ """
+ if not text or not isinstance(text, str):
+ return ""
+
+ # Remove dangerous HTML tags and scripts using bleach
+ # Define allowed tags for markdown (customize as needed)
+ allowed_tags = [
+ "a",
+ "b",
+ "strong",
+ "em",
+ "i",
+ "u",
+ "code",
+ "pre",
+ "blockquote",
+ "ul",
+ "ol",
+ "li",
+ "p",
+ "br",
+ "hr",
+ ]
+ allowed_attributes = {"a": ["href", "title", "rel"]}
+ # Clean the text to strip (remove) any tags not in allowed_tags, and remove all script/iframe/etc.
+ text = bleach.clean(
+ text, tags=allowed_tags, attributes=allowed_attributes, strip=True
+ )
+
+ # Remove iframe, object, embed tags (should already be stripped, but keep for redundancy)
+ text = re.sub(
+ r"<(iframe|object|embed)[^>]*>.*?\1>",
+ "",
+ text,
+ flags=re.IGNORECASE | re.DOTALL,
+ )
+
+ # Remove event handlers (onclick, onerror, etc.)
+ text = re.sub(r'\s*on\w+\s*=\s*["\'][^"\']*["\']', "", text, flags=re.IGNORECASE)
+
+ # Remove javascript: and data: URLs from markdown links
+ text = re.sub(
+ r"\[([^\]]+)\]\(javascript:[^\)]+\)", r"[\1]", text, flags=re.IGNORECASE
+ )
+ text = re.sub(r"\[([^\]]+)\]\(data:[^\)]+\)", r"[\1]", text, flags=re.IGNORECASE)
+
+ # Remove dangerous HTML attributes
+ text = re.sub(
+ r'\s*(style|onerror|onload|onclick)\s*=\s*["\'][^"\']*["\']',
+ "",
+ text,
+ flags=re.IGNORECASE,
+ )
+
+ return text.strip()
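+
+# e.g. a markdown link such as "[click](javascript:alert(1))" is reduced to "[click]" by the rules above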
+
+
+###
+# LOAD CONFIG FROM ENV FILE
+###
+
+CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
+CONFIG_FOLDER = ensure_folder_within_app_directory(CONFIG_FOLDER)
+
+# If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
+APP_CONFIG_PATH = get_or_create_env_var(
+ "APP_CONFIG_PATH", CONFIG_FOLDER + "app_config.env"
+) # e.g. config/app_config.env
+
+if APP_CONFIG_PATH:
+ if os.path.exists(APP_CONFIG_PATH):
+ print(f"Loading app variables from config file {APP_CONFIG_PATH}")
+ load_dotenv(APP_CONFIG_PATH)
+ else:
+ print("App config file not found at location:", APP_CONFIG_PATH)
+
+###
+# AWS OPTIONS
+###
+
+# If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
+AWS_CONFIG_PATH = get_or_create_env_var(
+ "AWS_CONFIG_PATH", ""
+) # e.g. config/aws_config.env
+
+if AWS_CONFIG_PATH:
+ if os.path.exists(AWS_CONFIG_PATH):
+ print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
+ load_dotenv(AWS_CONFIG_PATH)
+ else:
+ print("AWS config file not found at location:", AWS_CONFIG_PATH)
+
+RUN_AWS_FUNCTIONS = convert_string_to_boolean(
+ get_or_create_env_var("RUN_AWS_FUNCTIONS", "False")
+)
+
+AWS_REGION = get_or_create_env_var("AWS_REGION", "")
+
+AWS_CLIENT_ID = get_or_create_env_var("AWS_CLIENT_ID", "")
+
+AWS_CLIENT_SECRET = get_or_create_env_var("AWS_CLIENT_SECRET", "")
+
+AWS_USER_POOL_ID = get_or_create_env_var("AWS_USER_POOL_ID", "")
+
+AWS_ACCESS_KEY = get_or_create_env_var("AWS_ACCESS_KEY", "")
+# if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
+
+AWS_SECRET_KEY = get_or_create_env_var("AWS_SECRET_KEY", "")
+# if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
+
+DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")
+
+# Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
+PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = convert_string_to_boolean(
+ get_or_create_env_var("PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", "True")
+)
+
+# Custom headers e.g. if routing traffic through Cloudfront
+# Retrieving or setting CUSTOM_HEADER
+CUSTOM_HEADER = get_or_create_env_var("CUSTOM_HEADER", "")
+
+# Retrieving or setting CUSTOM_HEADER_VALUE
+CUSTOM_HEADER_VALUE = get_or_create_env_var("CUSTOM_HEADER_VALUE", "")
+
+###
+# Image options
+###
+IMAGES_DPI = float(get_or_create_env_var("IMAGES_DPI", "300.0"))
+LOAD_TRUNCATED_IMAGES = convert_string_to_boolean(
+ get_or_create_env_var("LOAD_TRUNCATED_IMAGES", "True")
+)
+MAX_IMAGE_PIXELS = get_or_create_env_var(
+ "MAX_IMAGE_PIXELS", ""
+) # Changed to None if blank in file_conversion.py
+
+# Whether to merge nearby bounding boxes (reconstruction + grouping + horizontal merge)
+MERGE_BOUNDING_BOXES = convert_string_to_boolean(
+ get_or_create_env_var("MERGE_BOUNDING_BOXES", "True")
+)
+
+MAX_SPACES_GPU_RUN_TIME = int(
+ get_or_create_env_var("MAX_SPACES_GPU_RUN_TIME", "60")
+) # Maximum number of seconds to run the GPU on Spaces
+
+###
+# File I/O options
+###
+
+SESSION_OUTPUT_FOLDER = convert_string_to_boolean(
+ get_or_create_env_var("SESSION_OUTPUT_FOLDER", "False")
+) # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
+
+OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/") # 'output/'
+INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/") # 'input/'
+
+# Whether to automatically upload redaction outputs to S3
+SAVE_OUTPUTS_TO_S3 = convert_string_to_boolean(
+ get_or_create_env_var("SAVE_OUTPUTS_TO_S3", "False")
+)
+
+# Base S3 folder (key prefix) for saving redaction outputs within the DOCUMENT_REDACTION_BUCKET.
+# If left blank, S3 uploads for outputs will be skipped even if SAVE_OUTPUTS_TO_S3 is True.
+S3_OUTPUTS_FOLDER = get_or_create_env_var("S3_OUTPUTS_FOLDER", "")
+
+S3_OUTPUTS_BUCKET = get_or_create_env_var(
+ "S3_OUTPUTS_BUCKET", DOCUMENT_REDACTION_BUCKET
+)
+
+# Allow for files to be saved in a temporary folder for increased security in some instances
+if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
+ # Use mkdtemp so the directory persists for the lifetime of the process.
+ # TemporaryDirectory() as a context manager deletes the directory immediately on exit.
+ import atexit
+ import shutil
+
+ temp_dir = tempfile.mkdtemp()
+ print(f"Temporary directory created at: {temp_dir}")
+ atexit.register(shutil.rmtree, temp_dir, ignore_errors=True)
+
+ if OUTPUT_FOLDER == "TEMP":
+ OUTPUT_FOLDER = temp_dir + "/"
+ if INPUT_FOLDER == "TEMP":
+ INPUT_FOLDER = temp_dir + "/"
+else:
+ # Ensure folders are within app directory (skip validation for TEMP as it's handled above)
+ OUTPUT_FOLDER = ensure_folder_within_app_directory(OUTPUT_FOLDER)
+ INPUT_FOLDER = ensure_folder_within_app_directory(INPUT_FOLDER)
+
+GRADIO_TEMP_DIR = get_or_create_env_var(
+ "GRADIO_TEMP_DIR", ""
+) # Default Gradio temp folder
+if GRADIO_TEMP_DIR:
+ GRADIO_TEMP_DIR = ensure_folder_within_app_directory(GRADIO_TEMP_DIR)
+MPLCONFIGDIR = get_or_create_env_var("MPLCONFIGDIR", "") # Matplotlib cache folder
+if MPLCONFIGDIR:
+ MPLCONFIGDIR = ensure_folder_within_app_directory(MPLCONFIGDIR)
+
+###
+# LOGGING OPTIONS
+###
+
+# By default, logs are put into a subfolder named after today's date and the host name of the instance running the app. This minimises the chance of log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or only locally, it is not necessary to make the log folders so specific.
+# Another way to address this would be to write logs to a different type of storage, e.g. a database such as DynamoDB. I may look into this in future.
+
+SAVE_LOGS_TO_CSV = convert_string_to_boolean(
+ get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
+)
+
+USE_LOG_SUBFOLDERS = convert_string_to_boolean(
+ get_or_create_env_var("USE_LOG_SUBFOLDERS", "True")
+)
+
+FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", "feedback/")
+ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", "logs/")
+USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", "usage/")
+
+# Ensure log folders are within app directory before adding subfolders
+FEEDBACK_LOGS_FOLDER = ensure_folder_within_app_directory(FEEDBACK_LOGS_FOLDER)
+ACCESS_LOGS_FOLDER = ensure_folder_within_app_directory(ACCESS_LOGS_FOLDER)
+USAGE_LOGS_FOLDER = ensure_folder_within_app_directory(USAGE_LOGS_FOLDER)
+
+# Default to no subfolder so the S3 folder variables below are always defined
+full_log_subfolder = ""
+
+if USE_LOG_SUBFOLDERS:
+ day_log_subfolder = today_rev + "/"
+ host_name_subfolder = HOST_NAME + "/"
+ full_log_subfolder = day_log_subfolder + host_name_subfolder
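+    # e.g. "20240501/my-hostname/" (date and host name shown are illustrative)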
+
+ FEEDBACK_LOGS_FOLDER = FEEDBACK_LOGS_FOLDER + full_log_subfolder
+ ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
+ USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
+
+ # Re-validate after adding subfolders to ensure still within app directory
+ FEEDBACK_LOGS_FOLDER = ensure_folder_within_app_directory(FEEDBACK_LOGS_FOLDER)
+ ACCESS_LOGS_FOLDER = ensure_folder_within_app_directory(ACCESS_LOGS_FOLDER)
+ USAGE_LOGS_FOLDER = ensure_folder_within_app_directory(USAGE_LOGS_FOLDER)
+
+S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var(
+ "S3_FEEDBACK_LOGS_FOLDER", "feedback/" + full_log_subfolder
+)
+S3_ACCESS_LOGS_FOLDER = get_or_create_env_var(
+ "S3_ACCESS_LOGS_FOLDER", "logs/" + full_log_subfolder
+)
+S3_USAGE_LOGS_FOLDER = get_or_create_env_var(
+ "S3_USAGE_LOGS_FOLDER", "usage/" + full_log_subfolder
+)
+
+# Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
+DISPLAY_FILE_NAMES_IN_LOGS = convert_string_to_boolean(
+ get_or_create_env_var("DISPLAY_FILE_NAMES_IN_LOGS", "False")
+)
+
+# Further customisation options for CSV logs
+CSV_ACCESS_LOG_HEADERS = get_or_create_env_var(
+ "CSV_ACCESS_LOG_HEADERS", ""
+) # If blank, uses component labels
+CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var(
+ "CSV_FEEDBACK_LOG_HEADERS", ""
+) # If blank, uses component labels
+CSV_USAGE_LOG_HEADERS = get_or_create_env_var(
+ "CSV_USAGE_LOG_HEADERS",
+ '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call", "task", "vlm_model_name", "vlm_total_input_tokens", "vlm_total_output_tokens", "llm_model_name", "llm_total_input_tokens", "llm_total_output_tokens"]',
+) # If blank, uses component labels
+
+### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
+SAVE_LOGS_TO_DYNAMODB = convert_string_to_boolean(
+ get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "False")
+)
+
+ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME", "redaction_access_log"
+)
+DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var("DYNAMODB_ACCESS_LOG_HEADERS", "")
+
+FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", "redaction_feedback"
+)
+DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var(
+ "DYNAMODB_FEEDBACK_LOG_HEADERS", ""
+)
+
+USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
+ "USAGE_LOG_DYNAMODB_TABLE_NAME", "redaction_usage"
+)
+DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")
+
+# Report logging to console?
+LOGGING = convert_string_to_boolean(get_or_create_env_var("LOGGING", "False"))
+
+if LOGGING:
+ # Configure logging
+ logging.basicConfig(
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+ )
+
+LOG_FILE_NAME = get_or_create_env_var("LOG_FILE_NAME", "log.csv")
+USAGE_LOG_FILE_NAME = get_or_create_env_var("USAGE_LOG_FILE_NAME", LOG_FILE_NAME)
+FEEDBACK_LOG_FILE_NAME = get_or_create_env_var("FEEDBACK_LOG_FILE_NAME", LOG_FILE_NAME)
+
+
+###
+# Gradio general app options
+###
+
+FAVICON_PATH = get_or_create_env_var("FAVICON_PATH", "favicon.png")
+
+RUN_FASTAPI = convert_string_to_boolean(get_or_create_env_var("RUN_FASTAPI", "False"))
+
+RUN_MCP_SERVER = convert_string_to_boolean(
+ get_or_create_env_var("RUN_MCP_SERVER", "False")
+)
+
+MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "20"))
+
+MAX_FILE_SIZE = get_or_create_env_var("MAX_FILE_SIZE", "250mb").lower()
+
+GRADIO_SERVER_NAME = get_or_create_env_var(
+ "GRADIO_SERVER_NAME", "127.0.0.1"
+) # Use "0.0.0.0" for external access
+
+GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
+
+ALLOWED_ORIGINS = get_or_create_env_var(
+ "ALLOWED_ORIGINS", ""
+) # should be a list of allowed origins e.g. ['https://example.com', 'https://www.example.com']
+
+ALLOWED_HOSTS = get_or_create_env_var("ALLOWED_HOSTS", "")
+
+ROOT_PATH = get_or_create_env_var("ROOT_PATH", "")
+FASTAPI_ROOT_PATH = get_or_create_env_var("FASTAPI_ROOT_PATH", "/")
+
+DEFAULT_CONCURRENCY_LIMIT = int(get_or_create_env_var("DEFAULT_CONCURRENCY_LIMIT", "3"))
+
+# Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
+PAGE_BREAK_VALUE = int(get_or_create_env_var("PAGE_BREAK_VALUE", "99999"))
+
+MAX_TIME_VALUE = int(get_or_create_env_var("MAX_TIME_VALUE", "999999"))
+MAX_SIMULTANEOUS_FILES = int(get_or_create_env_var("MAX_SIMULTANEOUS_FILES", "10"))
+MAX_DOC_PAGES = int(get_or_create_env_var("MAX_DOC_PAGES", "3000"))
+MAX_TABLE_ROWS = int(get_or_create_env_var("MAX_TABLE_ROWS", "250000"))
+MAX_TABLE_COLUMNS = int(get_or_create_env_var("MAX_TABLE_COLUMNS", "100"))
+MAX_OPEN_TEXT_CHARACTERS = int(
+ get_or_create_env_var("MAX_OPEN_TEXT_CHARACTERS", "50000")
+)
+
+# When loading for review, should PDFs have existing redaction annotations loaded in?
+LOAD_REDACTION_ANNOTATIONS_FROM_PDF = convert_string_to_boolean(
+ get_or_create_env_var("LOAD_REDACTION_ANNOTATIONS_FROM_PDF", "True")
+)
+
+
+# Create Tesseract and Poppler folders if you have installed them locally
+TESSERACT_FOLDER = get_or_create_env_var(
+ "TESSERACT_FOLDER", ""
+) # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
+if TESSERACT_FOLDER:
+ TESSERACT_FOLDER = ensure_folder_within_app_directory(TESSERACT_FOLDER)
+ add_folder_to_path(TESSERACT_FOLDER)
+
+TESSERACT_DATA_FOLDER = get_or_create_env_var(
+ "TESSERACT_DATA_FOLDER", "/usr/share/tessdata"
+)
+# Only validate if it's a relative path (system paths like /usr/share/tessdata are allowed)
+if TESSERACT_DATA_FOLDER and not os.path.isabs(TESSERACT_DATA_FOLDER):
+ TESSERACT_DATA_FOLDER = ensure_folder_within_app_directory(TESSERACT_DATA_FOLDER)
+
+POPPLER_FOLDER = get_or_create_env_var(
+ "POPPLER_FOLDER", ""
+) # If installing on Windows, install Poppler from here: https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the Poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/
+if POPPLER_FOLDER:
+ POPPLER_FOLDER = ensure_folder_within_app_directory(POPPLER_FOLDER)
+ add_folder_to_path(POPPLER_FOLDER)
+
+SHOW_QUICKSTART = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_QUICKSTART", "False")
+)
+
+SHOW_SUMMARISATION = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_SUMMARISATION", "False")
+)
+
+# Extraction and PII options open by default:
+EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT = convert_string_to_boolean(
+ get_or_create_env_var("EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT", "True")
+)
+
+### VLM model options and display
+
+# List of models to use for text extraction
+SELECTABLE_TEXT_EXTRACT_OPTION = get_or_create_env_var(
+ "SELECTABLE_TEXT_EXTRACT_OPTION", "Local model - selectable text"
+)
+LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION = get_or_create_env_var(
+ "LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION",
+ "Local OCR model - PDFs without selectable text",
+)
+TEXTRACT_TEXT_EXTRACT_OPTION = get_or_create_env_var(
+ "TEXTRACT_TEXT_EXTRACT_OPTION", "AWS Textract service - all PDF types"
+)
+BEDROCK_VLM_TEXT_EXTRACT_OPTION = get_or_create_env_var(
+ "BEDROCK_VLM_TEXT_EXTRACT_OPTION", "AWS Bedrock VLM OCR - all PDF types"
+)
+GEMINI_VLM_TEXT_EXTRACT_OPTION = get_or_create_env_var(
+ "GEMINI_VLM_TEXT_EXTRACT_OPTION", "Google Gemini VLM OCR - all PDF types"
+)
+AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION = get_or_create_env_var(
+ "AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION", "Azure/OpenAI VLM OCR - all PDF types"
+)
+
+# When True, use a two-step OCR process for PDFs: try selectable text extraction per page first;
+# only run OCR (Tesseract/Textract/VLM) on pages where no text could be extracted. Saves cost/time.
+EFFICIENT_OCR = convert_string_to_boolean(
+ get_or_create_env_var("EFFICIENT_OCR", "False")
+)
+# Minimum number of extractable words on a page to use text-only route; below this use OCR.
+EFFICIENT_OCR_MIN_WORDS = int(get_or_create_env_var("EFFICIENT_OCR_MIN_WORDS", "20"))
+# Minimum fraction of page area (MediaBox) that a single placement of an embedded image must
+# cover to route the page through OCR in addition to the word-count rule. Reduces false
+# positives from tiny icons/watermarks. Set to 0 to disable image-based routing (word count only).
+EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION = float(
+ get_or_create_env_var("EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION", "0.005")
+)
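+# e.g. the default 0.005 means a placed image must cover at least 0.5% of the page area to trigger OCR routing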
+# Default max workers for parallel processing app-wide. Overridable by specific env vars below.
+MAX_WORKERS = max(
+ 1,
+ int(get_or_create_env_var("MAX_WORKERS", "8")),
+)
+# Dedicated worker cap for line->word segmentation. This path is CPU/memory heavy
+# and can slow down when matching MAX_WORKERS on busy documents.
+LINE_TO_WORD_SEGMENT_MAX_WORKERS = max(
+ 1,
+ int(
+ get_or_create_env_var(
+ "LINE_TO_WORD_SEGMENT_MAX_WORKERS",
+ str(min(MAX_WORKERS, 4)),
+ )
+ ),
+)
+# Max threads for OCR first pass in redact_image_pdf (1 = sequential). Enables parallel Textract/Tesseract/VLM.
+OCR_FIRST_PASS_MAX_WORKERS = max(
+ 1,
+ int(get_or_create_env_var("OCR_FIRST_PASS_MAX_WORKERS", str(MAX_WORKERS))),
+)
+
+# Maximum parallel workers for local Tesseract OCR. Keep this lower than MAX_WORKERS
+# to avoid saturating CPU/RAM when running many pages concurrently.
+TESSERACT_MAX_WORKERS = max(
+ 1,
+ int(get_or_create_env_var("TESSERACT_MAX_WORKERS", "4")),
+)
+
+# Maximum parallel workers for local PaddleOCR page OCR. Often GPU-bound; keep low to
+# avoid saturating VRAM or contending on a single PaddleOCR model instance.
+PADDLE_MAX_WORKERS = max(
+ 1,
+ int(get_or_create_env_var("PADDLE_MAX_WORKERS", "2")),
+)
+# Max threads for page-group summarisation in summarise_document (1 = sequential). Use 1 for local models.
+SUMMARY_PAGE_GROUP_MAX_WORKERS = max(
+ 1,
+ int(get_or_create_env_var("SUMMARY_PAGE_GROUP_MAX_WORKERS", str(MAX_WORKERS))),
+)
+
+SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS", "True")
+)
+SHOW_AWS_TEXT_EXTRACTION_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_AWS_TEXT_EXTRACTION_OPTIONS", "True")
+)
+SHOW_HYBRID_TEXTRACT_BEDROCK_CHECKBOX = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_HYBRID_TEXTRACT_BEDROCK_CHECKBOX", "False")
+)
+SHOW_BEDROCK_VLM_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_BEDROCK_VLM_MODELS", "False")
+)
+SHOW_GEMINI_VLM_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_GEMINI_VLM_MODELS", "False")
+)
+SHOW_AZURE_OPENAI_VLM_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_AZURE_OPENAI_VLM_MODELS", "False")
+)
+
+# Show at least local options if everything mistakenly removed
+if (
+ not SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS
+ and not SHOW_AWS_TEXT_EXTRACTION_OPTIONS
+ and not SHOW_BEDROCK_VLM_MODELS
+ and not SHOW_GEMINI_VLM_MODELS
+ and not SHOW_AZURE_OPENAI_VLM_MODELS
+):
+ SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = True
+
+local_text_extraction_model_options = list()
+aws_text_extraction_model_options = list()
+cloud_vlm_model_options = list()
+
+if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS:
+ local_text_extraction_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
+ local_text_extraction_model_options.append(LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION)
+
+if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
+ aws_text_extraction_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
+
+if SHOW_BEDROCK_VLM_MODELS:
+ cloud_vlm_model_options.append(BEDROCK_VLM_TEXT_EXTRACT_OPTION)
+
+if SHOW_GEMINI_VLM_MODELS:
+ cloud_vlm_model_options.append(GEMINI_VLM_TEXT_EXTRACT_OPTION)
+
+if SHOW_AZURE_OPENAI_VLM_MODELS:
+ cloud_vlm_model_options.append(AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION)
+
+TEXT_EXTRACTION_MODELS = (
+ local_text_extraction_model_options
+ + aws_text_extraction_model_options
+ + cloud_vlm_model_options
+)
+DO_INITIAL_TABULAR_DATA_CLEAN = convert_string_to_boolean(
+ get_or_create_env_var("DO_INITIAL_TABULAR_DATA_CLEAN", "True")
+)
+
+### PII model options and display
+
+# PII detection models
+NO_REDACTION_PII_OPTION = get_or_create_env_var(
+ "NO_REDACTION_PII_OPTION", "Only extract text (no redaction)"
+)
+LOCAL_PII_OPTION = get_or_create_env_var("LOCAL_PII_OPTION", "Local")
+AWS_PII_OPTION = get_or_create_env_var("AWS_PII_OPTION", "AWS Comprehend")
+AWS_LLM_PII_OPTION = get_or_create_env_var("AWS_LLM_PII_OPTION", "LLM (AWS Bedrock)")
+INFERENCE_SERVER_PII_OPTION = get_or_create_env_var(
+ "INFERENCE_SERVER_PII_OPTION", "Local inference server"
+)
+LOCAL_TRANSFORMERS_LLM_PII_OPTION = get_or_create_env_var(
+ "LOCAL_TRANSFORMERS_LLM_PII_OPTION", "Local transformers LLM"
+)
+
+SHOW_LOCAL_PII_DETECTION_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_LOCAL_PII_DETECTION_OPTIONS", "True")
+)
+SHOW_AWS_PII_DETECTION_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_AWS_PII_DETECTION_OPTIONS", "True")
+)
+SHOW_INFERENCE_SERVER_PII_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_INFERENCE_SERVER_PII_OPTIONS", "False")
+)
+SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS", "False")
+)
+SHOW_AWS_BEDROCK_LLM_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_AWS_BEDROCK_LLM_MODELS", "False")
+)
+
+
+if (
+    not SHOW_LOCAL_PII_DETECTION_OPTIONS
+    and not SHOW_AWS_PII_DETECTION_OPTIONS
+    and not SHOW_AWS_BEDROCK_LLM_MODELS
+    and not SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS
+    and not SHOW_INFERENCE_SERVER_PII_OPTIONS
+):
+ SHOW_LOCAL_PII_DETECTION_OPTIONS = True
+
+local_pii_model_options = [NO_REDACTION_PII_OPTION]
+aws_pii_model_options = list()
+
+if SHOW_LOCAL_PII_DETECTION_OPTIONS:
+ local_pii_model_options.append(LOCAL_PII_OPTION)
+
+if SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS:
+ local_pii_model_options.append(LOCAL_TRANSFORMERS_LLM_PII_OPTION)
+
+if SHOW_INFERENCE_SERVER_PII_OPTIONS:
+ local_pii_model_options.append(INFERENCE_SERVER_PII_OPTION)
+
+if SHOW_AWS_PII_DETECTION_OPTIONS:
+ aws_pii_model_options.append(AWS_PII_OPTION)
+
+if SHOW_AWS_BEDROCK_LLM_MODELS:
+ aws_pii_model_options.append(AWS_LLM_PII_OPTION)
+
+PII_DETECTION_MODELS = local_pii_model_options + aws_pii_model_options
+
+if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
+ DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
+ "DEFAULT_TEXT_EXTRACTION_MODEL", TEXTRACT_TEXT_EXTRACT_OPTION
+ )
+else:
+ DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
+ "DEFAULT_TEXT_EXTRACTION_MODEL", SELECTABLE_TEXT_EXTRACT_OPTION
+ )
+
+# Validate that DEFAULT_TEXT_EXTRACTION_MODEL is in the available choices
+# If not, fall back to the first available option
+if DEFAULT_TEXT_EXTRACTION_MODEL not in TEXT_EXTRACTION_MODELS:
+ if TEXT_EXTRACTION_MODELS:
+ DEFAULT_TEXT_EXTRACTION_MODEL = TEXT_EXTRACTION_MODELS[0]
+ print(
+ f"Warning: DEFAULT_TEXT_EXTRACTION_MODEL was not in available choices. "
+ f"Using '{DEFAULT_TEXT_EXTRACTION_MODEL}' instead."
+ )
+ else:
+ # This should never happen, but provide a fallback
+ DEFAULT_TEXT_EXTRACTION_MODEL = SELECTABLE_TEXT_EXTRACT_OPTION
+ print("Warning: No text extraction models available. Using default option.")
+
+if SHOW_AWS_PII_DETECTION_OPTIONS:
+ DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
+ "DEFAULT_PII_DETECTION_MODEL", AWS_PII_OPTION
+ )
+else:
+ DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
+ "DEFAULT_PII_DETECTION_MODEL", LOCAL_PII_OPTION
+ )
+
+# Validate that DEFAULT_PII_DETECTION_MODEL is in the available choices
+# If not, fall back to the first available option
+if DEFAULT_PII_DETECTION_MODEL not in PII_DETECTION_MODELS:
+ if PII_DETECTION_MODELS:
+ DEFAULT_PII_DETECTION_MODEL = PII_DETECTION_MODELS[0]
+ print(
+ f"Warning: DEFAULT_PII_DETECTION_MODEL was not in available choices. "
+ f"Using '{DEFAULT_PII_DETECTION_MODEL}' instead."
+ )
+ else:
+ # This should never happen, but provide a fallback
+ DEFAULT_PII_DETECTION_MODEL = LOCAL_PII_OPTION
+ print("Warning: No PII detection models available. Using default option.")
+
+SHOW_PII_IDENTIFICATION_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_PII_IDENTIFICATION_OPTIONS", "True")
+)
+
+# LLM inference method for PII detection (similar to VLM options)
+# Options: "aws-bedrock", "local", "inference-server", "azure-openai", "gemini"
+CHOSEN_LLM_PII_INFERENCE_METHOD = get_or_create_env_var(
+ "CHOSEN_LLM_PII_INFERENCE_METHOD", "aws-bedrock"
+) # Default to AWS Bedrock for backward compatibility
+
+SHOW_LOCAL_LLM_PII_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_LOCAL_LLM_PII_OPTIONS", "False")
+) # Whether to show local LLM options for PII detection
+
+SHOW_INFERENCE_SERVER_LLM_PII_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_INFERENCE_SERVER_LLM_PII_OPTIONS", "False")
+) # Whether to show inference-server options for PII detection
+
+SHOW_AZURE_LLM_PII_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_AZURE_LLM_PII_OPTIONS", "False")
+) # Whether to show Azure/OpenAI options for PII detection
+
+SHOW_GEMINI_LLM_PII_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_GEMINI_LLM_PII_OPTIONS", "False")
+) # Whether to show Gemini options for PII detection
+
+# Build list of available LLM inference methods for PII detection
+LLM_PII_INFERENCE_METHODS = [] # Populated below based on the SHOW_* flags
+
+if SHOW_LOCAL_LLM_PII_OPTIONS:
+ LLM_PII_INFERENCE_METHODS.append("local")
+
+if SHOW_INFERENCE_SERVER_LLM_PII_OPTIONS:
+ LLM_PII_INFERENCE_METHODS.append("inference-server")
+
+if SHOW_AZURE_LLM_PII_OPTIONS:
+ LLM_PII_INFERENCE_METHODS.append("azure-openai")
+
+if SHOW_GEMINI_LLM_PII_OPTIONS:
+ LLM_PII_INFERENCE_METHODS.append("gemini")
+
+if SHOW_AWS_PII_DETECTION_OPTIONS:
+ LLM_PII_INFERENCE_METHODS.append("aws-bedrock")
+
+# Create list of PII detection models for tabular redaction
+TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
+if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
+ TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
+
+DEFAULT_TEXT_COLUMNS = get_or_create_env_var("DEFAULT_TEXT_COLUMNS", "[]")
+DEFAULT_EXCEL_SHEETS = get_or_create_env_var("DEFAULT_EXCEL_SHEETS", "[]")
+
+DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var(
+ "DEFAULT_TABULAR_ANONYMISATION_STRATEGY", "redact completely"
+)
+
+###
+# LOCAL OCR MODEL OPTIONS
+###
+
+
+### VLM OPTIONS
+
+SHOW_VLM_MODEL_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_VLM_MODEL_OPTIONS", "False")
+) # Whether to show the VLM model options in the UI
+
+SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL = get_or_create_env_var(
+ "SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL", "Qwen3-VL-8B-Instruct"
+) # Selected vision model. Choose from: "Nanonets-OCR2-3B", "Dots.OCR", "Qwen3-VL-2B-Instruct", "Qwen3-VL-4B-Instruct", "Qwen3-VL-8B-Instruct", "Qwen3-VL-30B-A3B-Instruct", "Qwen3-VL-235B-A22B-Instruct", "PaddleOCR-VL"
+
+# When True, use the same local transformers VLM model (e.g. Qwen3-VL-4B-Instruct) for LLM tasks (e.g. PII entity detection) as for VLM/OCR. Overrides LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE for local LLM.
+USE_TRANSFORMERS_VLM_MODEL_AS_LLM = convert_string_to_boolean(
+ get_or_create_env_var("USE_TRANSFORMERS_VLM_MODEL_AS_LLM", "False")
+)
+
+if SHOW_VLM_MODEL_OPTIONS:
+    VLM_MODEL_OPTIONS = [
+        SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL,
+    ]
+else:
+    # Keep the name defined so later references do not fail when VLM options are hidden
+    VLM_MODEL_OPTIONS = []
+
+
+MAX_NEW_TOKENS = int(
+ get_or_create_env_var("MAX_NEW_TOKENS", "8192")
+) # Maximum number of tokens to generate
+
+DEFAULT_MAX_NEW_TOKENS = int(
+ get_or_create_env_var("DEFAULT_MAX_NEW_TOKENS", "8192")
+) # Default maximum number of tokens to generate
+
+HYBRID_OCR_MAX_NEW_TOKENS = int(
+ get_or_create_env_var("HYBRID_OCR_MAX_NEW_TOKENS", "1024")
+) # Maximum number of tokens to generate per line crop for hybrid OCR. Raised from 50: reasoning
+# models (e.g. Qwen3.5) emit thinking tokens that consume the generation budget before
+# producing any answer content, causing every line to return empty results at the old 50-token cap.
+
+HYBRID_OCR_MAX_WORDS = int(
+ get_or_create_env_var("HYBRID_OCR_MAX_WORDS", "50")
+) # Sanity-check: discard any hybrid OCR result whose word count exceeds this value. Kept at 50
+# because a single text line should never legitimately contain more than ~50 words.
+
+MAX_INPUT_TOKEN_LENGTH = int(
+ get_or_create_env_var("MAX_INPUT_TOKEN_LENGTH", "32768")
+) # VLM only: maximum input/context tokens for vision-language models. Controls tokenizer cap, model max_position_embeddings (KV cache), and effective max_pixels (image size) so image+text fit. Set lower (e.g. 8192 or 16384) to reduce VRAM. Separate from LLM_CONTEXT_LENGTH.
+
+ADD_VLM_BOUNDING_BOX_RULES = convert_string_to_boolean(
+ get_or_create_env_var("ADD_VLM_BOUNDING_BOX_RULES", "True")
+) # Whether to add bounding box rules to the VLM prompt. Does not apply to Qwen models as they have already been trained in this coordinate system.
+
+# Bedrock VLM OCR cost estimation (used when text extraction method is "AWS Bedrock VLM OCR - all PDF types")
+BEDROCK_VLM_INPUT_COST = float(
+ get_or_create_env_var(
+ "BEDROCK_VLM_INPUT_COST", "1.13"
+ ) # Based on Amazon Nova Pro input cost of $1.13 per million tokens
+) # USD per million input tokens for Bedrock VLM OCR cost estimate.
+BEDROCK_VLM_OUTPUT_COST = float(
+ get_or_create_env_var(
+ "BEDROCK_VLM_OUTPUT_COST", "4.52"
+ ) # Based on Amazon Nova Pro output cost of $4.52 per million tokens
+) # USD per million output tokens for Bedrock VLM OCR cost estimate.
+BEDROCK_VLM_PIXELS_PER_INPUT_TOKEN = int(
+ get_or_create_env_var("BEDROCK_VLM_PIXELS_PER_INPUT_TOKEN", "2500")
+) # Pixels (width*height) per input token; used with VLM_MAX_IMAGE_SIZE to estimate input tokens per page for cost calculation.
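+# Rough illustration of the estimate (not a quoted price): at VLM_MAX_IMAGE_SIZE = 1152000 pixels and
+# 2500 pixels per token, a page is ~461 input tokens, i.e. roughly $0.0005 of input cost at $1.13/million.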
+
+# Bedrock LLM cost estimation (used when PII identification method is "LLM (AWS Bedrock)")
+BEDROCK_LLM_INPUT_COST = float(
+ get_or_create_env_var("BEDROCK_LLM_INPUT_COST", "1.13")
+) # USD per million input tokens for Bedrock LLM (e.g. PII detection) cost estimate. Based on Amazon Nova Pro input cost of $1.13 per million tokens
+BEDROCK_LLM_OUTPUT_COST = float(
+ get_or_create_env_var("BEDROCK_LLM_OUTPUT_COST", "4.52")
+) # USD per million output tokens for Bedrock LLM cost estimate. Based on Amazon Nova Pro output cost of $4.52 per million tokens
+BEDROCK_LLM_INPUT_TOKENS_PER_PAGE = int(
+ get_or_create_env_var("BEDROCK_LLM_INPUT_TOKENS_PER_PAGE", "2000")
+) # Estimated input tokens per page for Bedrock LLM cost calculation.
+BEDROCK_LLM_OUTPUT_TOKENS_PER_PAGE = int(
+ get_or_create_env_var("BEDROCK_LLM_OUTPUT_TOKENS_PER_PAGE", "250")
+) # Estimated output tokens per page for Bedrock LLM cost calculation.
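+# Rough illustration of the estimate: 2000 input tokens * $1.13/million + 250 output tokens * $4.52/million is roughly $0.0034 per page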
+
+VLM_HYBRID_MIN_IMAGE_SIZE = int(
+ get_or_create_env_var(
+ "VLM_HYBRID_MIN_IMAGE_SIZE", "307200"
+    ) # 307,200 total pixels, i.e. roughly a 640x480 crop.
+) # Min pixels (width*height) for hybrid VLM line/crop OCR via _prepare_image_for_vlm(hybrid_vlm=True). Upscaled if below. Default 307200.
+
+# When True, for hybrid PaddleOCR->VLM routes, resize/pad the full page to satisfy
+# VLM hybrid min/max pixel and DPI constraints *before* running PaddleOCR.
+# This makes Paddle's output bboxes align with the prepared page and reduces
+# per-line VLM crop resizing work.
+PREPARE_PAGE_FOR_HYBRID_VLM_BEFORE_PADDLE = convert_string_to_boolean(
+ get_or_create_env_var("PREPARE_PAGE_FOR_HYBRID_VLM_BEFORE_PADDLE", "True")
+)
+
+
+VLM_MIN_IMAGE_SIZE = int(
+ get_or_create_env_var("VLM_MIN_IMAGE_SIZE", "819200")
+) # Min pixels for full-page VLM via _prepare_image_for_vlm(hybrid_vlm=False). Upscaled if below. Default 819200. Hybrid crops use VLM_HYBRID_MIN_IMAGE_SIZE.
+
+VLM_MAX_IMAGE_SIZE = int(
+ get_or_create_env_var("VLM_MAX_IMAGE_SIZE", "1152000")
+) # Maximum total pixels (width * height) for images passed to VLM, as a multiple of 16*16 for Qwen3.5. Images with more pixels will be resized while maintaining aspect ratio. Default is 1152000 (4500*16*16).
+
+VLM_MIN_DPI = float(
+ get_or_create_env_var("VLM_MIN_DPI", "150.0")
+) # _prepare_image_for_vlm: reported DPI below this implies upscale (effective DPI = reported_dpi * scale).
+
+VLM_MAX_DPI = float(
+ get_or_create_env_var("VLM_MAX_DPI", "200.0")
+) # _prepare_image_for_vlm: reported DPI above this implies downscale. Bounds apply together with min/max pixels.
+
+# Max image aspect ratio max(width/height, height/width) after white-padding for VLM inputs.
+# Very long/thin hybrid line crops can exceed provider or model limits; padding keeps aspect within this ratio.
+VLM_MAX_ASPECT_RATIO = float(get_or_create_env_var("VLM_MAX_ASPECT_RATIO", "10.0"))
+if VLM_MAX_ASPECT_RATIO < 1.0:
+ VLM_MAX_ASPECT_RATIO = 1.0
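+# e.g. with the default cap of 10.0, a 2000x150 line crop (aspect ~13.3) would be white-padded to
+# roughly 2000x200 so that width/height stays within the cap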
+
+USE_FLASH_ATTENTION = convert_string_to_boolean(
+ get_or_create_env_var("USE_FLASH_ATTENTION", "False")
+) # Whether to use flash attention for the VLM
+
+QUANTISE_VLM_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("QUANTISE_VLM_MODELS", "False")
+) # Whether to use 4-bit quantisation (bitsandbytes) for VLM models. Only applies when SHOW_VLM_MODEL_OPTIONS is True.
+
+OVERRIDE_VLM_REPO_ID = get_or_create_env_var(
+ "OVERRIDE_VLM_REPO_ID", ""
+) # If set, overrides the Hugging Face repo ID or model path used for the selected VLM (e.g. local path or different repo). Leave empty for no override.
+
+REPORT_VLM_OUTPUTS_TO_GUI = convert_string_to_boolean(
+ get_or_create_env_var("REPORT_VLM_OUTPUTS_TO_GUI", "False")
+) # Whether to report VLM outputs to the GUI with info boxes as they are processed.
+
+REPORT_LLM_OUTPUTS_TO_GUI = convert_string_to_boolean(
+ get_or_create_env_var("REPORT_LLM_OUTPUTS_TO_GUI", "False")
+) # Whether to report streamed LLM outputs (local transformers or inference-server) to the GUI with gr.Info as they are processed.
+
+OVERWRITE_EXISTING_OCR_RESULTS = convert_string_to_boolean(
+ get_or_create_env_var("OVERWRITE_EXISTING_OCR_RESULTS", "False")
+) # If True, always create new OCR results instead of loading from existing JSON files
+
+# VLM generation parameter defaults
+# If empty, these will be None and model defaults will be used instead
+VLM_SEED = get_or_create_env_var(
+ "VLM_SEED", ""
+) # Random seed for VLM generation. If empty, no seed is set (non-deterministic). If set to an integer, generation will be deterministic.
+if VLM_SEED and VLM_SEED.strip():
+ VLM_SEED = int(VLM_SEED)
+else:
+ VLM_SEED = None
+
+VLM_DEFAULT_TEMPERATURE = get_or_create_env_var(
+ "VLM_DEFAULT_TEMPERATURE", ""
+) # Default temperature for VLM generation. If empty, model-specific defaults will be used.
+if VLM_DEFAULT_TEMPERATURE and VLM_DEFAULT_TEMPERATURE.strip():
+ VLM_DEFAULT_TEMPERATURE = float(VLM_DEFAULT_TEMPERATURE)
+else:
+ VLM_DEFAULT_TEMPERATURE = None
+
+VLM_DEFAULT_TOP_P = get_or_create_env_var(
+ "VLM_DEFAULT_TOP_P", ""
+) # Default top_p (nucleus sampling) for VLM generation. If empty, model-specific defaults will be used.
+if VLM_DEFAULT_TOP_P and VLM_DEFAULT_TOP_P.strip():
+ VLM_DEFAULT_TOP_P = float(VLM_DEFAULT_TOP_P)
+else:
+ VLM_DEFAULT_TOP_P = None
+
+VLM_DEFAULT_MIN_P = get_or_create_env_var(
+ "VLM_DEFAULT_MIN_P", ""
+) # Default min_p (minimum probability threshold) for VLM generation. If empty, model-specific defaults will be used.
+if VLM_DEFAULT_MIN_P and VLM_DEFAULT_MIN_P.strip():
+ VLM_DEFAULT_MIN_P = float(VLM_DEFAULT_MIN_P)
+else:
+ VLM_DEFAULT_MIN_P = None
+
+VLM_DEFAULT_TOP_K = get_or_create_env_var(
+ "VLM_DEFAULT_TOP_K", ""
+) # Default top_k for VLM generation. If empty, model-specific defaults will be used.
+if VLM_DEFAULT_TOP_K and VLM_DEFAULT_TOP_K.strip():
+ VLM_DEFAULT_TOP_K = int(VLM_DEFAULT_TOP_K)
+else:
+ VLM_DEFAULT_TOP_K = None
+
+VLM_DEFAULT_REPETITION_PENALTY = get_or_create_env_var(
+ "VLM_DEFAULT_REPETITION_PENALTY", ""
+) # Default repetition penalty for VLM generation. If empty, model-specific defaults will be used.
+if VLM_DEFAULT_REPETITION_PENALTY and VLM_DEFAULT_REPETITION_PENALTY.strip():
+ VLM_DEFAULT_REPETITION_PENALTY = float(VLM_DEFAULT_REPETITION_PENALTY)
+else:
+ VLM_DEFAULT_REPETITION_PENALTY = None
+
+VLM_DEFAULT_DO_SAMPLE = get_or_create_env_var(
+ "VLM_DEFAULT_DO_SAMPLE", "True"
+) # Default do_sample setting for VLM generation. If empty, model-specific defaults will be used. True means use sampling, False means use greedy decoding (do_sample=False).
+if VLM_DEFAULT_DO_SAMPLE and VLM_DEFAULT_DO_SAMPLE.strip():
+ VLM_DEFAULT_DO_SAMPLE = convert_string_to_boolean(VLM_DEFAULT_DO_SAMPLE)
+else:
+ VLM_DEFAULT_DO_SAMPLE = None
+
+VLM_DEFAULT_STREAM = get_or_create_env_var(
+ "VLM_DEFAULT_STREAM", "True"
+) # Default stream setting for VLM generation. If empty, model-specific defaults will be used. True means stream the response, False means return the response as a single string.
+if VLM_DEFAULT_STREAM and VLM_DEFAULT_STREAM.strip():
+ VLM_DEFAULT_STREAM = convert_string_to_boolean(VLM_DEFAULT_STREAM)
+else:
+ VLM_DEFAULT_STREAM = None
+
+VLM_DEFAULT_PRESENCE_PENALTY = get_or_create_env_var(
+ "VLM_DEFAULT_PRESENCE_PENALTY", ""
+) # Default presence penalty for VLM generation. If empty, model-specific defaults will be used.
+if VLM_DEFAULT_PRESENCE_PENALTY and VLM_DEFAULT_PRESENCE_PENALTY.strip():
+ VLM_DEFAULT_PRESENCE_PENALTY = float(VLM_DEFAULT_PRESENCE_PENALTY)
+else:
+ VLM_DEFAULT_PRESENCE_PENALTY = None
+
+VLM_DISABLE_QWEN3_5_THINKING = convert_string_to_boolean(
+ get_or_create_env_var("VLM_DISABLE_QWEN3_5_THINKING", "False")
+) # Whether to disable Qwen3.5 thinking for local transformers VLM calls.
+
+# Suffix appended to the generation prompt when Qwen3.5 thinking is disabled (used in run_vlm and llm_funcs).
+VLM_QWEN3_5_NOTHINK_SUFFIX = ""
+
+INFERENCE_SERVER_DISABLE_THINKING = convert_string_to_boolean(
+ get_or_create_env_var("INFERENCE_SERVER_DISABLE_THINKING", "False")
+) # When True, passes chat_template_kwargs={"enable_thinking": false} in every inference-server VLM
+# API request. This is the vLLM-native equivalent of VLM_DISABLE_QWEN3_5_THINKING for the
+# inference-server path: vLLM applies the Qwen3/Qwen3.5 chat template server-side and honours this
+# flag to skip thinking-block generation entirely. Eliminates thinking-token overhead and avoids
+# the need to raise HYBRID_OCR_MAX_NEW_TOKENS to accommodate reasoning budgets.
+
+
+### Local OCR model - Tesseract vs PaddleOCR
+DEFAULT_LOCAL_OCR_MODEL = get_or_create_env_var(
+ "DEFAULT_LOCAL_OCR_MODEL", "tesseract"
+) # Choose the engine for local OCR: "tesseract", "paddle", "hybrid-paddle", "hybrid-vlm", "hybrid-paddle-vlm", "hybrid-paddle-inference-server", "vlm", "inference-server"
+
+SHOW_OCR_GUI_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_OCR_GUI_OPTIONS", "True")
+)
+
+SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_LOCAL_OCR_MODEL_OPTIONS", "False")
+)
+
+SHOW_PADDLE_MODEL_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_PADDLE_MODEL_OPTIONS", "False")
+)
+
+SHOW_INFERENCE_SERVER_VLM_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_INFERENCE_SERVER_VLM_OPTIONS", "False")
+)
+
+SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS", "False")
+)
+
+SHOW_HYBRID_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_HYBRID_MODELS", "False")
+)
+
+LOCAL_OCR_MODEL_OPTIONS = ["tesseract"]
+
+CHOSEN_LOCAL_MODEL_INTRO_TEXT = get_or_create_env_var(
+ "CHOSEN_LOCAL_MODEL_INTRO_TEXT",
+ """Choose a local OCR model. "tesseract" is the default and will work for documents with clear typed text. """,
+)
+
+PADDLE_OCR_INTRO_TEXT = get_or_create_env_var(
+ "PADDLE_OCR_INTRO_TEXT",
+ """"paddle" is more accurate for text extraction where the text is not clear or well-formatted, but word-level extract is not natively supported, and so word bounding boxes will be inaccurate. """,
+)
+
+PADDLE_OCR_HYBRID_INTRO_TEXT = get_or_create_env_var(
+ "PADDLE_OCR_HYBRID_INTRO_TEXT",
+ """"hybrid-paddle" will do the first pass with Tesseract, and the second with PaddleOCR. """,
+)
+
+VLM_OCR_INTRO_TEXT = get_or_create_env_var(
+ "VLM_OCR_INTRO_TEXT",
+ """"vlm" will call the chosen vision model (VLM) to return a structured json output that is then parsed into word-level bounding boxes. """,
+)
+
+VLM_OCR_HYBRID_INTRO_TEXT = get_or_create_env_var(
+ "VLM_OCR_HYBRID_INTRO_TEXT",
+ """"hybrid-vlm" is a combination of Tesseract for OCR, and a second pass with the chosen vision model (VLM). """,
+)
+
+INFERENCE_SERVER_OCR_INTRO_TEXT = get_or_create_env_var(
+ "INFERENCE_SERVER_OCR_INTRO_TEXT",
+ """"inference-server" will call an external inference-server API to perform OCR using a vision model hosted remotely. """,
+)
+
+HYBRID_PADDLE_VLM_INTRO_TEXT = get_or_create_env_var(
+ "HYBRID_PADDLE_VLM_INTRO_TEXT",
+ """"hybrid-paddle-vlm" is a combination of PaddleOCR with the chosen VLM.""",
+)
+
+HYBRID_PADDLE_INFERENCE_SERVER_INTRO_TEXT = get_or_create_env_var(
+ "HYBRID_PADDLE_INFERENCE_SERVER_INTRO_TEXT",
+ """"hybrid-paddle-inference-server" is a combination of PaddleOCR with an external inference-server API.""",
+)
+
+paddle_options = ["paddle"]
+# if SHOW_HYBRID_MODELS:
+# paddle_options.append("hybrid-paddle")
+if SHOW_PADDLE_MODEL_OPTIONS:
+ LOCAL_OCR_MODEL_OPTIONS.extend(paddle_options)
+ CHOSEN_LOCAL_MODEL_INTRO_TEXT += PADDLE_OCR_INTRO_TEXT
+ # if SHOW_HYBRID_MODELS:
+ # CHOSEN_LOCAL_MODEL_INTRO_TEXT += PADDLE_OCR_HYBRID_INTRO_TEXT
+
+vlm_options = ["vlm"]
+# if SHOW_HYBRID_MODELS:
+# vlm_options.append("hybrid-vlm")
+if SHOW_VLM_MODEL_OPTIONS:
+ LOCAL_OCR_MODEL_OPTIONS.extend(vlm_options)
+ CHOSEN_LOCAL_MODEL_INTRO_TEXT += VLM_OCR_INTRO_TEXT
+ # if SHOW_HYBRID_MODELS:
+ # CHOSEN_LOCAL_MODEL_INTRO_TEXT += VLM_OCR_HYBRID_INTRO_TEXT
+
+if SHOW_PADDLE_MODEL_OPTIONS and SHOW_VLM_MODEL_OPTIONS and SHOW_HYBRID_MODELS:
+ LOCAL_OCR_MODEL_OPTIONS.append("hybrid-paddle-vlm")
+ CHOSEN_LOCAL_MODEL_INTRO_TEXT += HYBRID_PADDLE_VLM_INTRO_TEXT
+
+if (
+ SHOW_PADDLE_MODEL_OPTIONS
+ and SHOW_INFERENCE_SERVER_VLM_OPTIONS
+ and SHOW_HYBRID_MODELS
+):
+ LOCAL_OCR_MODEL_OPTIONS.append("hybrid-paddle-inference-server")
+ CHOSEN_LOCAL_MODEL_INTRO_TEXT += HYBRID_PADDLE_INFERENCE_SERVER_INTRO_TEXT
+
+inference_server_options = ["inference-server"]
+if SHOW_INFERENCE_SERVER_VLM_OPTIONS:
+ LOCAL_OCR_MODEL_OPTIONS.extend(inference_server_options)
+ CHOSEN_LOCAL_MODEL_INTRO_TEXT += INFERENCE_SERVER_OCR_INTRO_TEXT
+
+# Cloud VLM options
+if SHOW_BEDROCK_VLM_MODELS:
+ LOCAL_OCR_MODEL_OPTIONS.append("bedrock-vlm")
+
+if SHOW_GEMINI_VLM_MODELS:
+ LOCAL_OCR_MODEL_OPTIONS.append("gemini-vlm")
+
+if SHOW_AZURE_OPENAI_VLM_MODELS:
+ LOCAL_OCR_MODEL_OPTIONS.append("azure-openai-vlm")
+
+# Inference-server API configuration
+INFERENCE_SERVER_API_URL = get_or_create_env_var(
+ "INFERENCE_SERVER_API_URL", "http://localhost:8080"
+) # Base URL of the inference-server API
+
+INFERENCE_SERVER_MODEL_NAME = get_or_create_env_var(
+ "INFERENCE_SERVER_MODEL_NAME", ""
+) # Optional model name to use. If empty, uses the default model on the server
+
+INFERENCE_SERVER_TIMEOUT = int(
+ get_or_create_env_var("INFERENCE_SERVER_TIMEOUT", "300")
+) # Timeout in seconds for API requests
+
+DEFAULT_INFERENCE_SERVER_VLM_MODEL = get_or_create_env_var(
+ "DEFAULT_INFERENCE_SERVER_VLM_MODEL", "qwen_3_5_27b"
+) # Default model name for inference-server VLM API calls. If empty, uses INFERENCE_SERVER_MODEL_NAME or server default
+
+DEFAULT_INFERENCE_SERVER_PII_MODEL = get_or_create_env_var(
+ "DEFAULT_INFERENCE_SERVER_PII_MODEL", "gemma_3_27b"
+) # Default model name for inference-server PII detection API calls. If empty, uses INFERENCE_SERVER_MODEL_NAME, CHOSEN_INFERENCE_SERVER_PII_MODEL, or server default
+
+MODEL_CACHE_PATH = get_or_create_env_var("MODEL_CACHE_PATH", "./model_cache")
+MODEL_CACHE_PATH = ensure_folder_within_app_directory(MODEL_CACHE_PATH)
+
+
+HYBRID_OCR_CONFIDENCE_THRESHOLD = int(
+ get_or_create_env_var("HYBRID_OCR_CONFIDENCE_THRESHOLD", "95")
+) # The tesseract confidence threshold under which the text will be passed to PaddleOCR for re-extraction using the hybrid OCR method.
+
+HYBRID_OCR_PADDING = int(
+ get_or_create_env_var("HYBRID_OCR_PADDING", "5")
+) # The padding (in pixels) to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
+
+# Hybrid Textract + Bedrock VLM: when active and AWS Textract is selected, lines with low average confidence are re-analyzed with Bedrock VLM and replaced.
+HYBRID_TEXTRACT_BEDROCK_VLM = convert_string_to_boolean(
+ get_or_create_env_var("HYBRID_TEXTRACT_BEDROCK_VLM", "False")
+)
+HYBRID_TEXTRACT_BEDROCK_VLM_CONFIDENCE_THRESHOLD = int(
+ get_or_create_env_var(
+ "HYBRID_TEXTRACT_BEDROCK_VLM_CONFIDENCE_THRESHOLD",
+ str(HYBRID_OCR_CONFIDENCE_THRESHOLD),
+ )
+) # Line average confidence below this (0-100) triggers Bedrock VLM re-extraction.
+HYBRID_TEXTRACT_BEDROCK_VLM_PADDING = int(
+ get_or_create_env_var(
+ "HYBRID_TEXTRACT_BEDROCK_VLM_PADDING", str(HYBRID_OCR_PADDING)
+ )
+) # Padding (pixels) around line crop when calling Bedrock VLM.
+
+TESSERACT_WORD_LEVEL_OCR = convert_string_to_boolean(
+ get_or_create_env_var("TESSERACT_WORD_LEVEL_OCR", "True")
+) # Whether to use Tesseract word-level OCR.
+
+TESSERACT_SEGMENTATION_LEVEL = int(
+ get_or_create_env_var("TESSERACT_SEGMENTATION_LEVEL", "11")
+) # Tesseract page segmentation mode (PSM) to use for OCR
+
+CONVERT_LINE_TO_WORD_LEVEL = convert_string_to_boolean(
+ get_or_create_env_var("CONVERT_LINE_TO_WORD_LEVEL", "True")
+) # Whether to convert paddle line-level OCR results to word-level for better precision
+
+LOAD_PADDLE_AT_STARTUP = convert_string_to_boolean(
+ get_or_create_env_var("LOAD_PADDLE_AT_STARTUP", "False")
+) # Whether to load the PaddleOCR model at startup.
+
+PADDLE_USE_TEXTLINE_ORIENTATION = convert_string_to_boolean(
+ get_or_create_env_var("PADDLE_USE_TEXTLINE_ORIENTATION", "False")
+)
+
+PADDLE_DET_DB_UNCLIP_RATIO = float(
+ get_or_create_env_var("PADDLE_DET_DB_UNCLIP_RATIO", "1.2")
+)
+
+SAVE_EXAMPLE_HYBRID_IMAGES = convert_string_to_boolean(
+ get_or_create_env_var("SAVE_EXAMPLE_HYBRID_IMAGES", "False")
+) # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
+
+SAVE_TEXTRACT_BEDROCK_HYBRID_EXAMPLES = convert_string_to_boolean(
+ get_or_create_env_var("SAVE_TEXTRACT_BEDROCK_HYBRID_EXAMPLES", "False")
+) # When True, save example crop images and a log of prompt/response for each Textract+Bedrock VLM hybrid inference attempt.
+
+SAVE_PAGE_OCR_VISUALISATIONS = convert_string_to_boolean(
+ get_or_create_env_var("SAVE_PAGE_OCR_VISUALISATIONS", "False")
+) # Whether to save visualisations of Tesseract, PaddleOCR, and Textract bounding boxes.
+
+INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES = convert_string_to_boolean(
+ get_or_create_env_var("INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES", "True")
+) # Whether to include OCR visualisation outputs in the final output file list returned by choose_and_run_redactor.
+
+SAVE_WORD_SEGMENTER_OUTPUT_IMAGES = convert_string_to_boolean(
+ get_or_create_env_var("SAVE_WORD_SEGMENTER_OUTPUT_IMAGES", "False")
+) # Whether to save output images from the word segmenter.
+
+# Model storage paths for Lambda compatibility
+PADDLE_MODEL_PATH = get_or_create_env_var(
+ "PADDLE_MODEL_PATH", ""
+) # Directory for PaddleOCR model storage. Uses default location if not set.
+if PADDLE_MODEL_PATH:
+ PADDLE_MODEL_PATH = ensure_folder_within_app_directory(PADDLE_MODEL_PATH)
+
+PADDLE_FONT_PATH = get_or_create_env_var(
+ "PADDLE_FONT_PATH", ""
+) # Custom font path for PaddleOCR. If empty, will attempt to use system fonts to avoid downloading simfang.ttf/PingFang-SC-Regular.ttf.
+if PADDLE_FONT_PATH:
+ PADDLE_FONT_PATH = ensure_folder_within_app_directory(PADDLE_FONT_PATH)
+
+SPACY_MODEL_PATH = get_or_create_env_var(
+ "SPACY_MODEL_PATH", ""
+) # Directory for spaCy model storage. Uses default location if not set.
+if SPACY_MODEL_PATH:
+ SPACY_MODEL_PATH = ensure_folder_within_app_directory(SPACY_MODEL_PATH)
+
+PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var(
+ "PREPROCESS_LOCAL_OCR_IMAGES", "True"
+) # Whether to try to preprocess images before extracting text. NOTE: I have found in testing that this doesn't necessarily improve results, and it greatly slows down extraction.
+
+SAVE_PREPROCESS_IMAGES = convert_string_to_boolean(
+ get_or_create_env_var("SAVE_PREPROCESS_IMAGES", "False")
+) # Whether to save the pre-processed images.
+
+SAVE_VLM_INPUT_IMAGES = convert_string_to_boolean(
+ get_or_create_env_var("SAVE_VLM_INPUT_IMAGES", "False")
+) # Whether to save input images sent to VLM OCR for debugging.
+
+### LLM options
+SHOW_AWS_API_KEYS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_AWS_API_KEYS", "False")
+)
+# Gemini settings
+SHOW_GEMINI_LLM_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_GEMINI_LLM_MODELS", "False")
+)
+GEMINI_API_KEY = get_or_create_env_var("GEMINI_API_KEY", "")
+# Azure/OpenAI AI Inference settings
+SHOW_AZURE_LLM_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_AZURE_LLM_MODELS", "False")
+)
+AZURE_OPENAI_API_KEY = get_or_create_env_var("AZURE_OPENAI_API_KEY", "")
+AZURE_OPENAI_INFERENCE_ENDPOINT = get_or_create_env_var(
+ "AZURE_OPENAI_INFERENCE_ENDPOINT", ""
+)
+
+SHOW_INFERENCE_SERVER_LLM_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_INFERENCE_SERVER_LLM_MODELS", "False")
+)
+INFERENCE_SERVER_API_URL = get_or_create_env_var(
+ "INFERENCE_SERVER_API_URL", str(INFERENCE_SERVER_API_URL)
+)
+
+# Build up options for models
+model_full_names = list()
+model_short_names = list()
+model_source = list()
+
+# Local Transformers LLM PII Detection Model Configuration
+# See below for the list of accepted models
+LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = get_or_create_env_var(
+ "LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE", "Qwen 3.5 9B"
+)
+LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = get_or_create_env_var(
+ "LOCAL_TRANSFORMERS_LLM_PII_REPO_ID", "Qwen/Qwen3.5-9B"
+) # Hugging Face repository ID for PII detection model
+
+
+# When USE_TRANSFORMERS_VLM_MODEL_AS_LLM is True, register the VLM model as a Local option so LLM entity detection can use it
+if (
+ USE_TRANSFORMERS_VLM_MODEL_AS_LLM
+ and SHOW_VLM_MODEL_OPTIONS
+ and SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+):
+ if SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL not in model_full_names:
+ model_full_names.append(SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL)
+ model_short_names.append(SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL)
+ model_source.append("Local")
+
+# HF token may or may not be needed for downloading models from Hugging Face
+HF_TOKEN = get_or_create_env_var("HF_TOKEN", "")
+
+LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START = convert_string_to_boolean(
+ get_or_create_env_var("LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START", "False")
+)
+
+LOAD_TRANSFORMERS_VLM_MODEL_AT_START = convert_string_to_boolean(
+ get_or_create_env_var("LOAD_TRANSFORMERS_VLM_MODEL_AT_START", "True")
+)
+
+MULTIMODAL_PROMPT_FORMAT = convert_string_to_boolean(
+ get_or_create_env_var("MULTIMODAL_PROMPT_FORMAT", "False")
+)
+
+GEMMA3_12B_REPO_ID = get_or_create_env_var(
+ "GEMMA3_12B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-12b-it-bnb-4bit"
+)
+GEMMA3_27B_REPO_ID = get_or_create_env_var(
+ "GEMMA3_27B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-27b-it-bnb-4bit"
+)
+GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "openai/GPT-OSS 20B")
+
+MINISTRAL_3_14B_INST_REPO_ID = get_or_create_env_var(
+ "MINISTRAL_3_14B_INST_REPO_TRANSFORMERS_ID",
+ "mistralai/Ministral-3-14B-Instruct-2512",
+)
+
+# Qwen 3.0 model repo IDs (8B through 27B, from run_vlm.py)
+QWEN3_8B_INST_REPO_ID = get_or_create_env_var(
+ "QWEN3_8B_INST_REPO_TRANSFORMERS_ID", "Qwen/Qwen3-VL-8B-Instruct"
+)
+
+# Qwen 3.5 model repo IDs (9B through 122B, from run_vlm.py)
+QWEN35_08B_REPO_ID = get_or_create_env_var(
+ "QWEN35_08B_REPO_TRANSFORMERS_ID", "Qwen/Qwen3.5-0.8B"
+)
+QWEN35_2B_REPO_ID = get_or_create_env_var(
+ "QWEN35_2B_REPO_TRANSFORMERS_ID", "Qwen/Qwen3.5-2B"
+)
+QWEN35_4B_REPO_ID = get_or_create_env_var(
+ "QWEN35_4B_REPO_TRANSFORMERS_ID", "Qwen/Qwen3.5-4B"
+)
+QWEN35_9B_REPO_ID = get_or_create_env_var(
+ "QWEN35_9B_REPO_TRANSFORMERS_ID", "Qwen/Qwen3.5-9B"
+)
+
+QWEN35_27B_REPO_ID = get_or_create_env_var(
+ "QWEN35_27B_REPO_TRANSFORMERS_ID", "Qwen/Qwen3.5-27B"
+)
+QWEN35_27B_BNB_4BIT_REPO_ID = get_or_create_env_var(
+ "QWEN35_27B_BNB_4BIT_REPO_TRANSFORMERS_ID", "bertbobson/Qwen3.5-27B-bnb-4bit"
+)
+QWEN35_35B_A3B_REPO_ID = get_or_create_env_var(
+ "QWEN35_35B_A3B_REPO_TRANSFORMERS_ID", "Qwen/Qwen3.5-35B-A3B"
+)
+QWEN35_122B_A10B_REPO_ID = get_or_create_env_var(
+ "QWEN35_122B_A10B_REPO_TRANSFORMERS_ID", "Qwen/Qwen3.5-122B-A10B"
+)
+
+NVIDIA_NEMOTRON_30B_A3B_REPO_ID = get_or_create_env_var(
+ "NVIDIA_NEMOTRON_30B_A3B_REPO_TRANSFORMERS_ID",
+ "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
+)
+
+if LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE:
+ # Rename LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE for display on GUI
+ model_choice_lower = LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE.lower()
+
+ if "gemma-3-12b" in model_choice_lower or "gemma3-12b" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = GEMMA3_12B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Gemma 3 12B"
+ elif "gemma-3-27b" in model_choice_lower or "gemma3-27b" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = GEMMA3_27B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Gemma 3 27B"
+ elif "gpt-oss" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = GPT_OSS_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "GPT-OSS 20B"
+ elif (
+ "ministral-3-14b-instruct" in model_choice_lower
+ or "ministral3-14b-instruct" in model_choice_lower
+ ):
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = MINISTRAL_3_14B_INST_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Ministral 3 14B Instruct"
+ elif (
+ "qwen3-8b-instruct" in model_choice_lower
+ or "qwen-3-8b-instruct" in model_choice_lower
+ ):
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = QWEN3_8B_INST_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Qwen 3 8B Instruct"
+ elif "qwen3.5-0.8b" in model_choice_lower or "qwen-3.5-0.8b" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = QWEN35_08B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Qwen 3.5 0.8B"
+ elif "qwen3.5-2b" in model_choice_lower or "qwen-3.5-2b" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = QWEN35_2B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Qwen 3.5 2B"
+ elif "qwen3.5-4b" in model_choice_lower or "qwen-3.5-4b" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = QWEN35_4B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Qwen 3.5 4B"
+ elif "qwen3.5-9b" in model_choice_lower or "qwen-3.5-9b" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = QWEN35_9B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Qwen 3.5 9B"
+ elif (
+ "qwen3.5-27b-bnb" in model_choice_lower
+ or "qwen3.5-27b-4bit" in model_choice_lower
+ ):
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = QWEN35_27B_BNB_4BIT_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Qwen 3.5 27B (4-bit)"
+ elif "qwen3.5-27b" in model_choice_lower or "qwen-3.5-27b" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = QWEN35_27B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Qwen 3.5 27B"
+ elif "qwen3.5-35b" in model_choice_lower or "qwen-3.5-35b" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = QWEN35_35B_A3B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Qwen 3.5 35B-A3B"
+ elif "qwen3.5-122b" in model_choice_lower or "qwen-3.5-122b" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = QWEN35_122B_A10B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "Qwen 3.5 122B-A10B"
+    elif "nvidia-nemotron-3-nano-30b-a3b-nvfp4" in model_choice_lower:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = NVIDIA_NEMOTRON_30B_A3B_REPO_ID
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE = "NVIDIA Nemotron 3 Nano 30B A3B NVFP4"
+
+if (
+ SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS
+ and LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+):
+
+ display_name = LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ model_full_names.append(display_name)
+ model_short_names.append(display_name)
+ model_source.append("Local")
+
+# Set MULTIMODAL_PROMPT_FORMAT based on model choice
+if LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE in [
+ "Gemma 3 12B",
+ "Gemma 3 27B",
+ "Ministral 3 14B Instruct",
+ "Qwen 3 8B Instruct" "Qwen 3.5 0.8B",
+ "Qwen 3.5 2B",
+ "Qwen 3.5 4B",
+ "Qwen 3.5 9B",
+ "Qwen 3.5 27B (4-bit)",
+ "Qwen 3.5 27B",
+ "Qwen 3.5 35B-A3B",
+ "Qwen 3.5 122B-A10B",
+]:
+ MULTIMODAL_PROMPT_FORMAT = True
+
+OVERRIDE_LLM_TRANSFORMERS_REPO_ID = get_or_create_env_var(
+ "OVERRIDE_LLM_TRANSFORMERS_REPO_ID", ""
+)
+if OVERRIDE_LLM_TRANSFORMERS_REPO_ID:
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID = OVERRIDE_LLM_TRANSFORMERS_REPO_ID
+
+### AWS Bedrock LLM Model Options
+
+amazon_models = [
+ "anthropic.claude-3-haiku-20240307-v1:0",
+ "anthropic.claude-3-7-sonnet-20250219-v1:0",
+ "anthropic.claude-sonnet-4-5-20250929-v1:0",
+ "anthropic.claude-sonnet-4-6",
+ "amazon.nova-micro-v1:0",
+ "amazon.nova-lite-v1:0",
+ "amazon.nova-pro-v1:0",
+ "deepseek.v3-v1:0",
+ "openai.GPT-OSS 20B-1:0",
+ "openai.gpt-oss-120b-1:0",
+ "google.gemma-3-12b-it",
+ "mistral.ministral-3-14b-instruct",
+]
+
+if SHOW_AWS_BEDROCK_LLM_MODELS:
+ model_full_names.extend(amazon_models)
+ model_short_names.extend(
+ [
+ "haiku",
+ "sonnet_3_7",
+ "sonnet_4_5",
+ "sonnet_4_6",
+ "nova_micro",
+ "nova_lite",
+ "nova_pro",
+ "deepseek_v3",
+ "gpt_oss_20b_aws",
+ "gpt_oss_120b_aws",
+ "gemma_3_12b_it",
+ "ministral_3_14b_instruct",
+ ]
+ )
+ model_source.extend(["AWS"] * len(amazon_models))
+
+gemini_models = ["gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro"]
+
+if SHOW_GEMINI_LLM_MODELS:
+ model_full_names.extend(gemini_models)
+ model_short_names.extend(
+ ["gemini_flash_lite_2.5", "gemini_flash_2.5", "gemini_pro"]
+ )
+ model_source.extend(["Gemini"] * len(gemini_models))
+
+azure_models = ["gpt-5-mini", "gpt-4o-mini"]
+
+# Register Azure/OpenAI AI models (model names must match your Azure/OpenAI deployments)
+if SHOW_AZURE_LLM_MODELS:
+ # Example deployments; adjust to the deployments you actually create in Azure/OpenAI
+ model_full_names.extend(azure_models)
+ model_short_names.extend(["gpt-5-mini", "gpt-4o-mini"])
+ model_source.extend(["Azure/OpenAI"] * len(azure_models))
+
+
+# Register inference-server models
+CHOSEN_INFERENCE_SERVER_PII_MODEL = ""
+inference_server_models = [
+ "unnamed-inference-server-model",
+ "qwen_3_4b_it",
+ "qwen_3_4b_think",
+ "gpt_oss_20b",
+ "gemma_3_12b",
+ "ministral_3_14b_it",
+]
+
+if SHOW_INFERENCE_SERVER_LLM_MODELS:
+ # Example inference-server models; adjust to the models you have available on your server
+ model_full_names.extend(inference_server_models)
+ model_short_names.extend(inference_server_models)
+ model_source.extend(["inference-server"] * len(inference_server_models))
+
+ CHOSEN_INFERENCE_SERVER_PII_MODEL = get_or_create_env_var(
+ "CHOSEN_INFERENCE_SERVER_PII_MODEL", inference_server_models[0]
+ )
+
+ # If the chosen inference server model is not in the list of inference server models, add it to the list
+ if CHOSEN_INFERENCE_SERVER_PII_MODEL not in inference_server_models:
+ model_full_names.append(CHOSEN_INFERENCE_SERVER_PII_MODEL)
+ model_short_names.append(CHOSEN_INFERENCE_SERVER_PII_MODEL)
+ model_source.append("inference-server")
+
+# Inference Server LLM Model Choice for PII Detection
+# This is the primary config variable for choosing inference server models for PII detection
+# Note: This must be defined after CHOSEN_INFERENCE_SERVER_PII_MODEL
+INFERENCE_SERVER_LLM_PII_MODEL_CHOICE = get_or_create_env_var(
+ "INFERENCE_SERVER_LLM_PII_MODEL_CHOICE",
+ (
+ DEFAULT_INFERENCE_SERVER_PII_MODEL
+ if DEFAULT_INFERENCE_SERVER_PII_MODEL
+ else (
+ CHOSEN_INFERENCE_SERVER_PII_MODEL
+ if CHOSEN_INFERENCE_SERVER_PII_MODEL
+ else ""
+ )
+ ),
+) # Model choice for inference-server PII detection. Defaults to DEFAULT_INFERENCE_SERVER_PII_MODEL, then CHOSEN_INFERENCE_SERVER_PII_MODEL
+
+# Is Llama Swap used for model selection?
+USE_LLAMA_SWAP = convert_string_to_boolean(
+ get_or_create_env_var("USE_LLAMA_SWAP", "False")
+)
+
+###
+# Map all model names to their short names and sources
+###
+
+model_name_map = {
+ full: {"short_name": short, "source": source}
+ for full, short, source in zip(model_full_names, model_short_names, model_source)
+}
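+# Illustrative shape of model_name_map (actual contents depend on which SHOW_* flags are enabled):
+#   {"amazon.nova-pro-v1:0": {"short_name": "nova_pro", "source": "AWS"},
+#    "gemini-2.5-flash": {"short_name": "gemini_flash_2.5", "source": "Gemini"}, ...}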
+
+if SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS:
+ default_model_choice = LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+elif SHOW_INFERENCE_SERVER_LLM_MODELS:
+ default_model_choice = CHOSEN_INFERENCE_SERVER_PII_MODEL
+elif SHOW_AWS_BEDROCK_LLM_MODELS:
+ default_model_choice = amazon_models[0]
+elif SHOW_GEMINI_LLM_MODELS:
+ default_model_choice = gemini_models[0]
+elif SHOW_AZURE_LLM_MODELS:
+ default_model_choice = azure_models[0]
+else:
+ default_model_choice = ""
+
+if default_model_choice:
+ default_model_source = model_name_map[default_model_choice]["source"]
+ model_sources = list(
+ set([model_name_map[model]["source"] for model in model_full_names])
+ )
+else:
+ default_model_source = ""
+ model_sources = []
+
+
+def update_model_choice_config(default_model_source, model_name_map):
+ # Filter models by source and return the first matching model name
+ matching_models = [
+ model_name
+ for model_name, model_info in model_name_map.items()
+ if model_info["source"] == default_model_source
+ ]
+
+ output_model = matching_models[0] if matching_models else model_full_names[0]
+
+ return output_model, matching_models
+
+
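+# Example (illustrative): with AWS models registered, update_model_choice_config("AWS", model_name_map)
+# returns the first AWS model name plus the list of all AWS-source model names.
+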
+if default_model_source:
+ default_model_choice, default_source_models = update_model_choice_config(
+ default_model_source, model_name_map
+ )
+else:
+ default_model_choice = ""
+ default_source_models = []
+
+DIRECT_MODE_INFERENCE_SERVER_MODEL = get_or_create_env_var(
+ "DIRECT_MODE_INFERENCE_SERVER_MODEL",
+ CHOSEN_INFERENCE_SERVER_PII_MODEL if CHOSEN_INFERENCE_SERVER_PII_MODEL else "",
+)
+
+# Cloud LLM Model Choice for PII Detection (AWS Bedrock)
+# Note: This should be set after amazon_models is defined
+CLOUD_LLM_PII_MODEL_CHOICE = get_or_create_env_var(
+ "CLOUD_LLM_PII_MODEL_CHOICE",
+ "amazon.nova-pro-v1:0", # "anthropic.claude-3-7-sonnet-20250219-v1:0" # "amazon.nova-pro-v1:0", # "anthropic.claude-3-7-sonnet-20250219-v1:0", # Default AWS Bedrock model for PII detection
+)
+
+# Cloud LLM model used for PII detection when custom_instructions are provided.
+# If set and non-empty, overrides CLOUD_LLM_PII_MODEL_CHOICE whenever custom instructions are passed to the LLM (e.g. allow-list style rules). Leave empty to always use CLOUD_LLM_PII_MODEL_CHOICE.
+CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE = get_or_create_env_var(
+ "CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE",
+ "amazon.nova-pro-v1:0", # Empty = use CLOUD_LLM_PII_MODEL_CHOICE even with custom instructions
+)
+
+# Cloud LLM Model Choice for summarisation (AWS Bedrock / cloud)
+# Used by tools/summaries.py; separate from CLOUD_LLM_PII_MODEL_CHOICE so a different model can be used for summarisation vs entity detection.
+CLOUD_SUMMARISATION_MODEL_CHOICE = get_or_create_env_var(
+ "CLOUD_SUMMARISATION_MODEL_CHOICE",
+ "amazon.nova-lite-v1:0", # Default AWS Bedrock model for summarisation
+)
+
+# VLM Model Choice for cloud VLM OCR (defaults to first available cloud model)
+# Note: This should be set after model lists are defined
+CLOUD_VLM_MODEL_CHOICE = get_or_create_env_var(
+ "CLOUD_VLM_MODEL_CHOICE",
+ "amazon.nova-pro-v1:0", # In tests I have seen amazon nova pro outperform qwen3-vl-235b-a22b #"qwen.qwen3-vl-235b-a22b", # Will be set to default below if empty
+) # Default model choice for cloud VLM OCR (Bedrock, Gemini, or Azure/OpenAI)
+
+# Set default CLOUD_VLM_MODEL_CHOICE if not provided
+if not CLOUD_VLM_MODEL_CHOICE or not CLOUD_VLM_MODEL_CHOICE.strip():
+ # Set default based on available models (priority: AWS Bedrock > Gemini > Azure/OpenAI)
+ if SHOW_AWS_BEDROCK_LLM_MODELS and amazon_models:
+ CLOUD_VLM_MODEL_CHOICE = amazon_models[0] # Default to first AWS Bedrock model
+ elif SHOW_GEMINI_LLM_MODELS and gemini_models:
+ CLOUD_VLM_MODEL_CHOICE = gemini_models[0] # Default to first Gemini model
+ elif SHOW_AZURE_LLM_MODELS and azure_models:
+ CLOUD_VLM_MODEL_CHOICE = azure_models[0] # Default to first Azure/OpenAI model
+ else:
+ CLOUD_VLM_MODEL_CHOICE = "" # No default available
+else:
+ # Use the value from environment variable
+ CLOUD_VLM_MODEL_CHOICE = CLOUD_VLM_MODEL_CHOICE.strip()
+
+# Backend used for CUSTOM_VLM_FACES / CUSTOM_VLM_SIGNATURE (face/signature detection).
+# One of: 'transformers_vlm', 'inference_vlm', 'bedrock_vlm'.
+CUSTOM_VLM_BACKEND = (
+ get_or_create_env_var("CUSTOM_VLM_BACKEND", "bedrock_vlm").strip().lower()
+)
+if CUSTOM_VLM_BACKEND not in ("transformers_vlm", "inference_vlm", "bedrock_vlm"):
+ CUSTOM_VLM_BACKEND = "bedrock_vlm"
+
+# Minimum confidence for CUSTOM_VLM face/signature detections to be kept.
+# Values can be provided either as 0..1 or 0..100. Internally interpreted as 0..1.
+CUSTOM_VLM_MIN_CONFIDENCE = float(
+ get_or_create_env_var("CUSTOM_VLM_MIN_CONFIDENCE", "0.65")
+)
+if CUSTOM_VLM_MIN_CONFIDENCE > 1.0:
+ CUSTOM_VLM_MIN_CONFIDENCE = CUSTOM_VLM_MIN_CONFIDENCE / 100.0
+CUSTOM_VLM_MIN_CONFIDENCE = max(0.0, min(1.0, CUSTOM_VLM_MIN_CONFIDENCE))
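+# Examples: CUSTOM_VLM_MIN_CONFIDENCE="65" and CUSTOM_VLM_MIN_CONFIDENCE="0.65" both normalise
+# to 0.65; values outside the 0-1 range after normalisation are clamped to that range.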
+
+# Local transformers LLM generation parameters
+LLM_MODEL_DTYPE = get_or_create_env_var("LLM_MODEL_DTYPE", "bfloat16")
+QUANTISE_TRANSFORMERS_LLM_MODELS = convert_string_to_boolean(
+ get_or_create_env_var("QUANTISE_TRANSFORMERS_LLM_MODELS", "False")
+)
+INT8_WITH_OFFLOAD_TO_CPU = convert_string_to_boolean(
+ get_or_create_env_var("INT8_WITH_OFFLOAD_TO_CPU", "False")
+)
+COMPILE_MODE = get_or_create_env_var("COMPILE_MODE", "reduce-overhead")
+COMPILE_TRANSFORMERS = convert_string_to_boolean(
+ get_or_create_env_var("COMPILE_TRANSFORMERS", "False")
+)
+
+PRINT_TRANSFORMERS_USER_PROMPT = convert_string_to_boolean(
+ get_or_create_env_var("PRINT_TRANSFORMERS_USER_PROMPT", "False")
+)
+
+LLM_TEMPERATURE = float(get_or_create_env_var("LLM_TEMPERATURE", "0.1"))
+LLM_TOP_K = int(
+ get_or_create_env_var("LLM_TOP_K", "64")
+) # https://docs.unsloth.ai/basics/gemma-3-how-to-run-and-fine-tune
+LLM_MIN_P = float(get_or_create_env_var("LLM_MIN_P", "0"))
+LLM_TOP_P = float(get_or_create_env_var("LLM_TOP_P", "0.95"))
+LLM_REPETITION_PENALTY = float(get_or_create_env_var("LLM_REPETITION_PENALTY", "1.0"))
+LLM_MAX_NEW_TOKENS = int(get_or_create_env_var("LLM_MAX_NEW_TOKENS", "8192"))
+LLM_SEED = int(get_or_create_env_var("LLM_SEED", "42"))
+LLM_RESET = convert_string_to_boolean(get_or_create_env_var("LLM_RESET", "False"))
+LLM_STREAM = convert_string_to_boolean(get_or_create_env_var("LLM_STREAM", "True"))
+LLM_THREADS = int(get_or_create_env_var("LLM_THREADS", "-1"))
+LLM_CONTEXT_LENGTH = int(
+ get_or_create_env_var("LLM_CONTEXT_LENGTH", "32768")
+) # LLM only: maximum context length for text LLMs (e.g. llama.cpp). Separate from MAX_INPUT_TOKEN_LENGTH (VLM).
+LLM_STOP_STRINGS = _get_env_list(get_or_create_env_var("LLM_STOP_STRINGS", ""))
+LLM_RETRY_ATTEMPTS = int(get_or_create_env_var("NUMBER_OF_RETRY_ATTEMPTS", "10"))
+LLM_TIMEOUT_WAIT = int(get_or_create_env_var("TIMEOUT_WAIT", "5"))
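+
+# The sampling settings above are passed through to the local text-generation backend. As an
+# illustrative (assumed, transformers-style) mapping only, they correspond roughly to kwargs such as:
+#   dict(temperature=LLM_TEMPERATURE, top_k=LLM_TOP_K, top_p=LLM_TOP_P,
+#        repetition_penalty=LLM_REPETITION_PENALTY, max_new_tokens=LLM_MAX_NEW_TOKENS)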
+
+# Additional LLM configuration options (not currently implemented)
+SPECULATIVE_DECODING = convert_string_to_boolean(
+ get_or_create_env_var("SPECULATIVE_DECODING", "False")
+)
+ASSISTANT_MODEL = get_or_create_env_var(
+ "ASSISTANT_MODEL", ""
+) # Not currently implemented
+
+
+# If you are using e.g. gpt-oss, you can add a reasoning suffix to set the reasoning level, or
+# turn reasoning off for Qwen 3 models (e.g. "/nothink"; the default suffix applied here is "/think").
+# The suffix is chosen based on LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE.
+model_type_for_reasoning = LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+
+if model_type_for_reasoning == "GPT-OSS 20B":
+    REASONING_SUFFIX = get_or_create_env_var("REASONING_SUFFIX", "Reasoning: low")
+    # print("Using REASONING_SUFFIX: Reasoning: low")
+elif "Qwen 3 " in model_type_for_reasoning:
+    # print("Using REASONING_SUFFIX: /think")
+    REASONING_SUFFIX = get_or_create_env_var("REASONING_SUFFIX", "/think")
+else:
+ # print("No reasoning suffix applied")
+ REASONING_SUFFIX = get_or_create_env_var("REASONING_SUFFIX", "")
+
+###
+# Entities for redaction
+###
+CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var(
+ "CHOSEN_COMPREHEND_ENTITIES",
+ "['EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER', 'UK_NATIONAL_INSURANCE_NUMBER', 'UK_NATIONAL_HEALTH_SERVICE_NUMBER', 'CUSTOM']",
+)
+
+FULL_COMPREHEND_ENTITY_LIST = get_or_create_env_var(
+ "FULL_COMPREHEND_ENTITY_LIST",
+ "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER', 'ALL', 'CUSTOM', 'CUSTOM_FUZZY']",
+)
+
+FULL_LLM_ENTITY_LIST = get_or_create_env_var(
+ "FULL_LLM_ENTITY_LIST",
+ "['EMAIL_ADDRESS', 'STREET_ADDRESS','PERSON_NAME','PHONE_NUMBER', 'DATE_TIME', 'URL', 'IP_ADDRESS', 'AGE', 'BANK_ACCOUNT_NUMBER', 'PASSPORT_NUMBER', 'CUSTOM', 'CUSTOM_FUZZY']",
+)
+
+# Entities for LLM-based PII redaction option
+CHOSEN_LLM_ENTITIES = get_or_create_env_var(
+ "CHOSEN_LLM_ENTITIES",
+ "['EMAIL_ADDRESS','STREET_ADDRESS','PERSON_NAME','PHONE_NUMBER', 'CUSTOM']",
+)
+
+
+# Entities for local PII redaction option
+CHOSEN_REDACT_ENTITIES = get_or_create_env_var(
+ "CHOSEN_REDACT_ENTITIES",
+ "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CUSTOM']",
+)
+
+FULL_ENTITY_LIST = get_or_create_env_var(
+ "FULL_ENTITY_LIST",
+ "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']",
+)
+
+
+CUSTOM_ENTITIES = get_or_create_env_var(
+ "CUSTOM_ENTITIES",
+ "['TITLES', 'UKPOSTCODE', 'STREETNAME']",
+)
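+
+# These entity settings are read as Python-style list strings and converted to real lists by
+# _get_env_list further below. Example (illustrative):
+#   CHOSEN_REDACT_ENTITIES="['PERSON','EMAIL_ADDRESS','CUSTOM']"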
+
+
+DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var(
+ "DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX", "[]"
+)
+
+HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = get_or_create_env_var(
+ "HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS",
+ "['Extract handwriting', 'Extract signatures']",
+)
+
+if HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS:
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = _get_env_list(
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS
+ )
+
+INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var(
+ "INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION", "False"
+)
+INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var(
+ "INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION", "False"
+)
+INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var(
+ "INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION", "False"
+)
+INCLUDE_FACE_IDENTIFICATION_TEXTRACT_OPTION = get_or_create_env_var(
+ "INCLUDE_FACE_IDENTIFICATION_TEXTRACT_OPTION", "False"
+)
+
+if INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION == "True":
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Extract forms")
+if INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION == "True":
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Extract layout")
+if INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION == "True":
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Extract tables")
+if INCLUDE_FACE_IDENTIFICATION_TEXTRACT_OPTION == "True":
+ HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Face detection")
+
+# Whether to split punctuation from words in Textract output
+# If True, punctuation marks (full stops, commas, quotes, brackets, etc.) will be separated
+# from alphanumeric characters and returned as separate words with separate bounding boxes.
+# If False, words will be returned as-is from Textract (original behavior).
+SPLIT_PUNCTUATION_FROM_WORDS = convert_string_to_boolean(
+ get_or_create_env_var("SPLIT_PUNCTUATION_FROM_WORDS", "True")
+)
+
+DEFAULT_SEARCH_QUERY = get_or_create_env_var("DEFAULT_SEARCH_QUERY", "")
+DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(
+ get_or_create_env_var("DEFAULT_FUZZY_SPELLING_MISTAKES_NUM", "0")
+)
+
+DEFAULT_PAGE_MIN = int(get_or_create_env_var("DEFAULT_PAGE_MIN", "0"))
+
+DEFAULT_PAGE_MAX = int(get_or_create_env_var("DEFAULT_PAGE_MAX", "0"))
+
+
+### Language selection options
+
+SHOW_LANGUAGE_SELECTION = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
+)
+
+DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var(
+ "DEFAULT_LANGUAGE_FULL_NAME", "english"
+)
+DEFAULT_LANGUAGE = get_or_create_env_var(
+ "DEFAULT_LANGUAGE", "en"
+) # For tesseract, ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system. You can find the relevant language packs here: https://github.com/tesseract-ocr/tessdata.
+# For PaddleOCR, ensure the relevant language models are available. You can find information on supported languages here: https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html
+# For AWS Comprehend, only English and Spanish are supported https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html ['en', 'es']
+# AWS Textract automatically detects the language of the document and supports the following languages: https://aws.amazon.com/textract/faqs/#topic-0. 'English, Spanish, Italian, Portuguese, French, German. Handwriting, Invoices and Receipts, Identity documents and Queries processing are in English only'
+
+textract_language_choices = get_or_create_env_var(
+ "textract_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt']"
+)
+aws_comprehend_language_choices = get_or_create_env_var(
+ "aws_comprehend_language_choices", "['en', 'es']"
+)
+
+# The choices that the user sees
+MAPPED_LANGUAGE_CHOICES = get_or_create_env_var(
+ "MAPPED_LANGUAGE_CHOICES",
+ "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']",
+)
+LANGUAGE_CHOICES = get_or_create_env_var(
+ "LANGUAGE_CHOICES",
+ "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']",
+)
+
+###
+# Duplicate detection settings
+###
+DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(
+ get_or_create_env_var("DEFAULT_DUPLICATE_DETECTION_THRESHOLD", "0.95")
+)
+DEFAULT_MIN_CONSECUTIVE_PAGES = int(
+ get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1")
+)
+USE_GREEDY_DUPLICATE_DETECTION = convert_string_to_boolean(
+ get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True")
+)
+DEFAULT_COMBINE_PAGES = convert_string_to_boolean(
+ get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True")
+) # Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.
+DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
+REMOVE_DUPLICATE_ROWS = convert_string_to_boolean(
+ get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False")
+)
+
+
+###
+# File output options
+###
+# Should the output pdf redaction boxes be drawn using the custom box colour?
+USE_GUI_BOX_COLOURS_FOR_OUTPUTS = convert_string_to_boolean(
+ get_or_create_env_var("USE_GUI_BOX_COLOURS_FOR_OUTPUTS", "False")
+)
+
+# This is the colour of the output pdf redaction boxes. Should be a tuple of three integers between 0 and 255
+CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "(0, 0, 0)")
+
+if CUSTOM_BOX_COLOUR == "grey":
+ # only "grey" is currently supported as a custom box colour by name, or a tuple of three integers between 0 and 255
+ CUSTOM_BOX_COLOUR = (128, 128, 128)
+else:
+ try:
+ components_str = CUSTOM_BOX_COLOUR.strip("()").split(",")
+ CUSTOM_BOX_COLOUR = tuple(
+ int(c.strip()) for c in components_str
+ ) # Always gives a tuple of three integers between 0 and 255
+ except Exception as e:
+ print(f"Error initialising CUSTOM_BOX_COLOUR: {e}, returning default black")
+ CUSTOM_BOX_COLOUR = (
+ 0,
+ 0,
+ 0,
+ ) # Default to black if the custom box colour is not a valid tuple of three integers between 0 and 255
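+
+# Examples: CUSTOM_BOX_COLOUR="(255, 0, 0)" parses to the tuple (255, 0, 0), "grey" maps to
+# (128, 128, 128), and any value that fails to parse falls back to black (0, 0, 0).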
+
+# Apply redactions defaults to images, graphics, and text, from: https://pymupdf.readthedocs.io/en/latest/page.html#Page.apply_redactions
+# For images, the default is set to 0, to ignore. Text presented in images is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document (see the redact_single_box function in file_redaction.py).
+APPLY_REDACTIONS_IMAGES = int(
+ get_or_create_env_var("APPLY_REDACTIONS_IMAGES", "0")
+) # PyMuPDF's own default (2) blanks out overlapping pixels; here the default is 0. PDF_REDACT_IMAGE_NONE | 0 ignores images, PDF_REDACT_IMAGE_REMOVE | 1 completely removes images overlapping any redaction annotation, and PDF_REDACT_IMAGE_REMOVE_UNLESS_INVISIBLE | 3 only removes images that are actually visible.
+APPLY_REDACTIONS_GRAPHICS = int(
+ get_or_create_env_var("APPLY_REDACTIONS_GRAPHICS", "0")
+) # How to redact overlapping vector graphics (also called "line-art" or "drawings"). PyMuPDF's own default (2) removes any overlapping vector graphics; here the default is 0. PDF_REDACT_LINE_ART_NONE | 0 ignores them, and PDF_REDACT_LINE_ART_REMOVE_IF_COVERED | 1 removes graphics fully contained in a redaction annotation.
+APPLY_REDACTIONS_TEXT = int(
+ get_or_create_env_var("APPLY_REDACTIONS_TEXT", "0")
+) # The default, PDF_REDACT_TEXT_REMOVE | 0, removes all characters whose bounding box overlaps any redaction rectangle; this complies with the original legal / data protection intentions of redaction annotations. Other use cases may require keeping text while redacting vector graphics or images, which can be achieved by setting PDF_REDACT_TEXT_NONE | 1. This does not comply with the data protection intentions of redaction annotations; do so at your own risk.
+
+# If you don't want to redact the text, but instead just draw a box over it, set this to True
+RETURN_PDF_FOR_REVIEW = convert_string_to_boolean(
+ get_or_create_env_var("RETURN_PDF_FOR_REVIEW", "True")
+)
+
+# When True (and RETURN_PDF_FOR_REVIEW), write _redacted.pdf then _redactions_for_review.pdf
+# using two sequential full-document passes instead of two simultaneous PyMuPDF Document
+# objects. Cuts peak RAM (~halves PDF working set); roughly doubles apply time for those outputs.
+TWO_PASS_REVIEW_PDF_LOW_MEMORY = convert_string_to_boolean(
+ get_or_create_env_var("TWO_PASS_REVIEW_PDF_LOW_MEMORY", "False")
+)
+
+RETURN_REDACTED_PDF = convert_string_to_boolean(
+ get_or_create_env_var("RETURN_REDACTED_PDF", "True")
+) # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
+
+COMPRESS_REDACTED_PDF = convert_string_to_boolean(
+ get_or_create_env_var("COMPRESS_REDACTED_PDF", "False")
+) # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
+
+# When True, print lightweight timing for apply_redactions_to_review_df_and_files (page_sizes build, per-page redaction, review CSV). For debugging hotspots only.
+PROFILE_REDACTION_APPLY = convert_string_to_boolean(
+ get_or_create_env_var("PROFILE_REDACTION_APPLY", "False")
+)
+
+# When True, merge overlapping or very close redaction boxes into fewer larger rectangles before applying (reduces redact_single_box calls on dense pages). Off by default. Warning - may turn a page into one single large redaction box if there are many redactions on the page.
+MERGE_SMALL_REDACTIONS = convert_string_to_boolean(
+ get_or_create_env_var("MERGE_SMALL_REDACTIONS", "False")
+)
+
+# When True, use Polars for review DataFrame coordinate scaling and CSV write in apply_redactions_to_review_df_and_files (faster for large annotation sets).
+USE_POLARS_FOR_REVIEW = convert_string_to_boolean(
+ get_or_create_env_var("USE_POLARS_FOR_REVIEW", "True")
+)
+
+# When True and multiple file paths are given, process each file in parallel in apply_redactions_to_review_df_and_files. Off by default.
+ENABLE_PARALLEL_FILES_APPLY_REDACTIONS = convert_string_to_boolean(
+ get_or_create_env_var("ENABLE_PARALLEL_FILES_APPLY_REDACTIONS", "True")
+)
+
+# When True, build review DataFrame from all_image_annotations in chunked parallel workers (experimental). Off by default.
+ENABLE_REVIEW_CSV_PARALLELISM = convert_string_to_boolean(
+ get_or_create_env_var("ENABLE_REVIEW_CSV_PARALLELISM", "True")
+)
+
+# Batch limits for AWS Comprehend and LLM entity detection. Batches are cut at the nearest
+# phrase-ending punctuation (PHRASE_ENDING_PUNCTUATION), newline, or end of page so text
+# is never cut mid-sentence.
+DEFAULT_NEW_BATCH_CHAR_COUNT = int(
+ get_or_create_env_var("DEFAULT_NEW_BATCH_CHAR_COUNT", "2500")
+)
+DEFAULT_NEW_BATCH_WORD_COUNT = int(DEFAULT_NEW_BATCH_CHAR_COUNT / 5)
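+# Example: with the default DEFAULT_NEW_BATCH_CHAR_COUNT of 2500, the derived word limit is
+# 2500 / 5 = 500 words per batch.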
+
+###
+# APP RUN / GUI OPTIONS
+###
+# Link to user guide - ensure it is a valid URL
+USER_GUIDE_URL = validate_safe_url(
+ get_or_create_env_var(
+ "USER_GUIDE_URL", "https://seanpedrick-case.github.io/doc_redaction"
+ )
+)
+
+DEFAULT_INTRO_TEXT = f"""# Document redaction
+
+ Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features in the app.
+
+ To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
+
+    Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include or exclude from redaction, fuzzy matching, language settings, and whole-page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
+
+ NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs."""
+
+INTRO_TEXT = get_or_create_env_var("INTRO_TEXT", DEFAULT_INTRO_TEXT)
+
+# Read in intro text from a text file if it is a path to a text file
+if INTRO_TEXT.endswith(".txt"):
+ # Validate the path is safe (with base path for relative paths)
+ if validate_path_safety(INTRO_TEXT, base_path="."):
+ try:
+ # Use secure file read with explicit encoding
+ INTRO_TEXT = secure_file_read(".", INTRO_TEXT, encoding="utf-8")
+ # Format the text to replace {USER_GUIDE_URL} with the actual value
+ INTRO_TEXT = INTRO_TEXT.format(USER_GUIDE_URL=USER_GUIDE_URL)
+ except FileNotFoundError:
+ print(f"Warning: Intro text file not found: {INTRO_TEXT}")
+ INTRO_TEXT = DEFAULT_INTRO_TEXT
+ except Exception as e:
+ print(f"Error reading intro text file: {e}")
+ # Fallback to default
+ INTRO_TEXT = DEFAULT_INTRO_TEXT
+ else:
+ print(f"Warning: Unsafe file path detected for INTRO_TEXT: {INTRO_TEXT}")
+ INTRO_TEXT = DEFAULT_INTRO_TEXT
+
+# Sanitize the text
+INTRO_TEXT = sanitize_markdown_text(INTRO_TEXT.strip('"').strip("'"))
+
+# Ensure we have valid content after sanitization
+if not INTRO_TEXT or not INTRO_TEXT.strip():
+ print("Warning: Intro text is empty after sanitization, using default intro text")
+ INTRO_TEXT = sanitize_markdown_text(DEFAULT_INTRO_TEXT)
+
+TLDEXTRACT_CACHE = get_or_create_env_var("TLDEXTRACT_CACHE", "tmp/tld/")
+TLDEXTRACT_CACHE = ensure_folder_within_app_directory(TLDEXTRACT_CACHE)
+try:
+ extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
+except Exception as e:
+ print(f"Error initialising TLDExtract: {e}")
+ extract = TLDExtract(cache_dir=None)
+
+# Get some environment variables and Launch the Gradio app
+COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
+
+SHOW_FEEDBACK_BUTTONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_FEEDBACK_BUTTONS", "False")
+)
+
+SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER", "True")
+)
+
+APPLY_DUPLICATES_TO_FILE_AUTOMATICALLY = convert_string_to_boolean(
+ get_or_create_env_var("APPLY_DUPLICATES_TO_FILE_AUTOMATICALLY", "False")
+)
+
+SHOW_EXAMPLES = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_EXAMPLES", "True")
+)
+SHOW_AWS_EXAMPLES = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_AWS_EXAMPLES", "False")
+)
+SHOW_DIFFICULT_OCR_EXAMPLES = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_DIFFICULT_OCR_EXAMPLES", "False")
+)
+
+RUN_ALL_EXAMPLES_THROUGH_AWS = convert_string_to_boolean(
+ get_or_create_env_var("RUN_ALL_EXAMPLES_THROUGH_AWS", "False")
+)
+
+FILE_INPUT_HEIGHT = int(get_or_create_env_var("FILE_INPUT_HEIGHT", "200"))
+
+RUN_DIRECT_MODE = convert_string_to_boolean(
+ get_or_create_env_var("RUN_DIRECT_MODE", "False")
+)
+
+# Direct mode configuration options
+DIRECT_MODE_DEFAULT_USER = get_or_create_env_var(
+ "DIRECT_MODE_DEFAULT_USER", ""
+) # Default username for cli/direct mode requests
+DIRECT_MODE_TASK = get_or_create_env_var(
+ "DIRECT_MODE_TASK", "redact"
+) # 'redact', 'deduplicate', 'summarise', 'textract', or 'combine_review_pdfs'
+DIRECT_MODE_INPUT_FILE = get_or_create_env_var(
+ "DIRECT_MODE_INPUT_FILE", ""
+) # Path to input file; for combine_review_pdfs use comma-separated paths (at least 2)
+DIRECT_MODE_OUTPUT_DIR = get_or_create_env_var(
+ "DIRECT_MODE_OUTPUT_DIR", OUTPUT_FOLDER
+) # Output directory
+DIRECT_MODE_OUTPUT_DIR = ensure_folder_within_app_directory(DIRECT_MODE_OUTPUT_DIR)
+DIRECT_MODE_DUPLICATE_TYPE = get_or_create_env_var(
+ "DIRECT_MODE_DUPLICATE_TYPE", "pages"
+) # 'pages' or 'tabular'
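+
+# Example (illustrative): a minimal headless redaction run could set
+#   RUN_DIRECT_MODE=True
+#   DIRECT_MODE_TASK=redact
+#   DIRECT_MODE_INPUT_FILE=/path/to/document.pdf   (placeholder path)
+#   DIRECT_MODE_OUTPUT_DIR=output/
+# before launching the app; the DIRECT_MODE_* values below further customise the run.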
+
+# Additional direct mode configuration options for user customization
+DIRECT_MODE_LANGUAGE = get_or_create_env_var(
+ "DIRECT_MODE_LANGUAGE", DEFAULT_LANGUAGE
+) # Language for document processing
+DIRECT_MODE_PII_DETECTOR = get_or_create_env_var(
+ "DIRECT_MODE_PII_DETECTOR", LOCAL_PII_OPTION
+) # PII detection method
+DIRECT_MODE_OCR_METHOD = get_or_create_env_var(
+ "DIRECT_MODE_OCR_METHOD", "Local OCR"
+) # OCR method for PDF/image processing
+DIRECT_MODE_OCR_FIRST_PASS_MAX_WORKERS = max(
+ 1,
+ int(
+ get_or_create_env_var(
+ "DIRECT_MODE_OCR_FIRST_PASS_MAX_WORKERS",
+ str(OCR_FIRST_PASS_MAX_WORKERS),
+ )
+ ),
+) # Max threads for OCR first pass in redact_image_pdf (1 = sequential)
+DIRECT_MODE_SUMMARY_PAGE_GROUP_MAX_WORKERS = max(
+ 1,
+ int(
+ get_or_create_env_var(
+ "DIRECT_MODE_SUMMARY_PAGE_GROUP_MAX_WORKERS",
+ str(SUMMARY_PAGE_GROUP_MAX_WORKERS),
+ )
+ ),
+) # Max threads for page-group summarisation (1 = sequential)
+DIRECT_MODE_PAGE_MIN = int(
+ get_or_create_env_var("DIRECT_MODE_PAGE_MIN", str(DEFAULT_PAGE_MIN))
+) # First page to process
+DIRECT_MODE_PAGE_MAX = int(
+ get_or_create_env_var("DIRECT_MODE_PAGE_MAX", str(DEFAULT_PAGE_MAX))
+) # Last page to process
+DIRECT_MODE_IMAGES_DPI = float(
+ get_or_create_env_var("DIRECT_MODE_IMAGES_DPI", str(IMAGES_DPI))
+) # DPI for image processing
+DIRECT_MODE_CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
+ "DIRECT_MODE_CHOSEN_LOCAL_OCR_MODEL", DEFAULT_LOCAL_OCR_MODEL
+) # Local OCR model choice
+DIRECT_MODE_PREPROCESS_LOCAL_OCR_IMAGES = convert_string_to_boolean(
+ get_or_create_env_var(
+ "DIRECT_MODE_PREPROCESS_LOCAL_OCR_IMAGES", str(PREPROCESS_LOCAL_OCR_IMAGES)
+ )
+) # Preprocess images before OCR
+DIRECT_MODE_COMPRESS_REDACTED_PDF = convert_string_to_boolean(
+ get_or_create_env_var(
+ "DIRECT_MODE_COMPRESS_REDACTED_PDF", str(COMPRESS_REDACTED_PDF)
+ )
+) # Compress redacted PDF
+DIRECT_MODE_RETURN_PDF_END_OF_REDACTION = convert_string_to_boolean(
+ get_or_create_env_var(
+ "DIRECT_MODE_RETURN_PDF_END_OF_REDACTION", str(RETURN_REDACTED_PDF)
+ )
+) # Return PDF at end of redaction
+DIRECT_MODE_EXTRACT_FORMS = convert_string_to_boolean(
+ get_or_create_env_var("DIRECT_MODE_EXTRACT_FORMS", "False")
+) # Extract forms during Textract analysis
+DIRECT_MODE_EXTRACT_TABLES = convert_string_to_boolean(
+ get_or_create_env_var("DIRECT_MODE_EXTRACT_TABLES", "False")
+) # Extract tables during Textract analysis
+DIRECT_MODE_EXTRACT_LAYOUT = convert_string_to_boolean(
+ get_or_create_env_var("DIRECT_MODE_EXTRACT_LAYOUT", "False")
+) # Extract layout during Textract analysis
+DIRECT_MODE_EXTRACT_SIGNATURES = convert_string_to_boolean(
+ get_or_create_env_var("DIRECT_MODE_EXTRACT_SIGNATURES", "False")
+) # Extract signatures during Textract analysis
+DIRECT_MODE_MATCH_FUZZY_WHOLE_PHRASE_BOOL = convert_string_to_boolean(
+ get_or_create_env_var("DIRECT_MODE_MATCH_FUZZY_WHOLE_PHRASE_BOOL", "True")
+) # Match fuzzy whole phrase boolean
+DIRECT_MODE_ANON_STRATEGY = get_or_create_env_var(
+ "DIRECT_MODE_ANON_STRATEGY", DEFAULT_TABULAR_ANONYMISATION_STRATEGY
+) # Anonymisation strategy for tabular data
+DIRECT_MODE_FUZZY_MISTAKES = int(
+ get_or_create_env_var(
+ "DIRECT_MODE_FUZZY_MISTAKES", str(DEFAULT_FUZZY_SPELLING_MISTAKES_NUM)
+ )
+) # Number of fuzzy spelling mistakes allowed
+DIRECT_MODE_SIMILARITY_THRESHOLD = float(
+ get_or_create_env_var(
+ "DIRECT_MODE_SIMILARITY_THRESHOLD", str(DEFAULT_DUPLICATE_DETECTION_THRESHOLD)
+ )
+) # Similarity threshold for duplicate detection
+DIRECT_MODE_MIN_WORD_COUNT = int(
+ get_or_create_env_var("DIRECT_MODE_MIN_WORD_COUNT", str(DEFAULT_MIN_WORD_COUNT))
+) # Minimum word count for duplicate detection
+DIRECT_MODE_MIN_CONSECUTIVE_PAGES = int(
+ get_or_create_env_var(
+ "DIRECT_MODE_MIN_CONSECUTIVE_PAGES", str(DEFAULT_MIN_CONSECUTIVE_PAGES)
+ )
+) # Minimum consecutive pages for duplicate detection
+DIRECT_MODE_GREEDY_MATCH = convert_string_to_boolean(
+ get_or_create_env_var(
+ "DIRECT_MODE_GREEDY_MATCH", str(USE_GREEDY_DUPLICATE_DETECTION)
+ )
+) # Use greedy matching for duplicate detection
+DIRECT_MODE_COMBINE_PAGES = convert_string_to_boolean(
+ get_or_create_env_var("DIRECT_MODE_COMBINE_PAGES", str(DEFAULT_COMBINE_PAGES))
+) # Combine pages for duplicate detection
+DIRECT_MODE_REMOVE_DUPLICATE_ROWS = convert_string_to_boolean(
+ get_or_create_env_var(
+ "DIRECT_MODE_REMOVE_DUPLICATE_ROWS", str(REMOVE_DUPLICATE_ROWS)
+ )
+) # Remove duplicate rows in tabular data
+
+# Textract Batch Operations Options
+DIRECT_MODE_TEXTRACT_ACTION = get_or_create_env_var(
+ "DIRECT_MODE_TEXTRACT_ACTION", ""
+) # Textract action for batch operations
+DIRECT_MODE_JOB_ID = get_or_create_env_var(
+ "DIRECT_MODE_JOB_ID", ""
+) # Job ID for Textract operations
+
+# Lambda-specific configuration options
+LAMBDA_POLL_INTERVAL = int(
+ get_or_create_env_var("LAMBDA_POLL_INTERVAL", "30")
+) # Polling interval in seconds for Textract job status
+LAMBDA_MAX_POLL_ATTEMPTS = int(
+ get_or_create_env_var("LAMBDA_MAX_POLL_ATTEMPTS", "120")
+) # Maximum number of polling attempts for Textract job completion
+LAMBDA_PREPARE_IMAGES = convert_string_to_boolean(
+ get_or_create_env_var("LAMBDA_PREPARE_IMAGES", "True")
+) # Prepare images for OCR processing
+LAMBDA_EXTRACT_SIGNATURES = convert_string_to_boolean(
+ get_or_create_env_var("LAMBDA_EXTRACT_SIGNATURES", "False")
+) # Extract signatures during Textract analysis
+LAMBDA_DEFAULT_USERNAME = get_or_create_env_var(
+ "LAMBDA_DEFAULT_USERNAME", "lambda_user"
+) # Default username for Lambda operations
+
+
+### ALLOW LIST
+
+GET_DEFAULT_ALLOW_LIST = convert_string_to_boolean(
+ get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "False")
+)
+
+ALLOW_LIST_PATH = get_or_create_env_var(
+ "ALLOW_LIST_PATH", ""
+) # config/default_allow_list.csv
+
+S3_ALLOW_LIST_PATH = get_or_create_env_var(
+ "S3_ALLOW_LIST_PATH", ""
+) # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
+
+if ALLOW_LIST_PATH:
+ OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
+else:
+ OUTPUT_ALLOW_LIST_PATH = "config/default_allow_list.csv"
+
+### DENY LIST
+
+GET_DEFAULT_DENY_LIST = convert_string_to_boolean(
+ get_or_create_env_var("GET_DEFAULT_DENY_LIST", "False")
+)
+
+S3_DENY_LIST_PATH = get_or_create_env_var(
+ "S3_DENY_LIST_PATH", ""
+) # default_deny_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
+
+DENY_LIST_PATH = get_or_create_env_var(
+ "DENY_LIST_PATH", ""
+) # config/default_deny_list.csv
+
+if DENY_LIST_PATH:
+ OUTPUT_DENY_LIST_PATH = DENY_LIST_PATH
+else:
+ OUTPUT_DENY_LIST_PATH = "config/default_deny_list.csv"
+
+### WHOLE PAGE REDACTION LIST
+
+GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST = convert_string_to_boolean(
+    get_or_create_env_var("GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST", "False")
+)
+
+S3_WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var(
+ "S3_WHOLE_PAGE_REDACTION_LIST_PATH", ""
+) # default_whole_page_redaction_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
+
+WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var(
+ "WHOLE_PAGE_REDACTION_LIST_PATH", ""
+) # config/default_whole_page_redaction_list.csv
+
+if WHOLE_PAGE_REDACTION_LIST_PATH:
+ OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = WHOLE_PAGE_REDACTION_LIST_PATH
+else:
+ OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = (
+ "config/default_whole_page_redaction_list.csv"
+ )
+
+###
+# COST CODE OPTIONS
+###
+
+SHOW_COSTS = convert_string_to_boolean(get_or_create_env_var("SHOW_COSTS", "False"))
+
+GET_COST_CODES = convert_string_to_boolean(
+ get_or_create_env_var("GET_COST_CODES", "False")
+)
+
+DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")
+
+COST_CODES_PATH = get_or_create_env_var(
+ "COST_CODES_PATH", ""
+) # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
+
+S3_COST_CODES_PATH = get_or_create_env_var(
+ "S3_COST_CODES_PATH", ""
+) # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
+
+# A default path in case s3 cost code location is provided but no local cost code location given
+if COST_CODES_PATH:
+ OUTPUT_COST_CODES_PATH = COST_CODES_PATH
+else:
+ OUTPUT_COST_CODES_PATH = "config/cost_codes.csv"
+
+ENFORCE_COST_CODES = convert_string_to_boolean(
+ get_or_create_env_var("ENFORCE_COST_CODES", "False")
+)
+# If you have cost codes listed, is it compulsory to choose one before redacting?
+
+if ENFORCE_COST_CODES:
+ GET_COST_CODES = True
+
+
+###
+# WHOLE DOCUMENT API OPTIONS
+###
+
+SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS", "False")
+) # This feature not currently implemented
+
+TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var(
+ "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", ""
+)
+
+TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var(
+ "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER", "input"
+)
+
+TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var(
+ "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", "output"
+)
+
+LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = convert_string_to_boolean(
+ get_or_create_env_var("LOAD_PREVIOUS_TEXTRACT_JOBS_S3", "False")
+)
+# Whether or not to load previous Textract jobs from S3
+
+TEXTRACT_JOBS_S3_LOC = get_or_create_env_var(
+ "TEXTRACT_JOBS_S3_LOC", "output"
+) # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
+
+TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var(
+ "TEXTRACT_JOBS_S3_INPUT_LOC", "input"
+) # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
+
+TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var(
+ "TEXTRACT_JOBS_LOCAL_LOC", "output"
+) # Local subfolder where the Textract jobs are stored
+
+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(
+ get_or_create_env_var("DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7")
+) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
+
+
+###
+# Config vars output format
+###
+
+# Convert string environment variables to string or list
+CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
+CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
+CSV_USAGE_LOG_HEADERS = _get_env_list(CSV_USAGE_LOG_HEADERS)
+
+DYNAMODB_ACCESS_LOG_HEADERS = _get_env_list(DYNAMODB_ACCESS_LOG_HEADERS)
+DYNAMODB_FEEDBACK_LOG_HEADERS = _get_env_list(DYNAMODB_FEEDBACK_LOG_HEADERS)
+DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMODB_USAGE_LOG_HEADERS)
+if CHOSEN_COMPREHEND_ENTITIES:
+ CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
+if FULL_COMPREHEND_ENTITY_LIST:
+ FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
+if FULL_LLM_ENTITY_LIST:
+ FULL_LLM_ENTITY_LIST = _get_env_list(FULL_LLM_ENTITY_LIST)
+if CHOSEN_LLM_ENTITIES:
+ CHOSEN_LLM_ENTITIES = _get_env_list(CHOSEN_LLM_ENTITIES)
+if CHOSEN_REDACT_ENTITIES:
+ CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
+if FULL_ENTITY_LIST:
+ FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
+
+SHOW_CUSTOM_VLM_ENTITIES = convert_string_to_boolean(
+ get_or_create_env_var("SHOW_CUSTOM_VLM_ENTITIES", "False")
+)
+
+if SHOW_CUSTOM_VLM_ENTITIES and (
+ SHOW_VLM_MODEL_OPTIONS
+ or SHOW_INFERENCE_SERVER_VLM_OPTIONS
+ or SHOW_BEDROCK_VLM_MODELS
+):
+ FULL_ENTITY_LIST.extend(["CUSTOM_VLM_FACES", "CUSTOM_VLM_SIGNATURE"])
+ FULL_COMPREHEND_ENTITY_LIST.extend(["CUSTOM_VLM_FACES", "CUSTOM_VLM_SIGNATURE"])
+ FULL_LLM_ENTITY_LIST.extend(["CUSTOM_VLM_FACES", "CUSTOM_VLM_SIGNATURE"])
+
+if DEFAULT_TEXT_COLUMNS:
+ DEFAULT_TEXT_COLUMNS = _get_env_list(DEFAULT_TEXT_COLUMNS)
+if DEFAULT_EXCEL_SHEETS:
+ DEFAULT_EXCEL_SHEETS = _get_env_list(DEFAULT_EXCEL_SHEETS)
+
+if CUSTOM_ENTITIES:
+ CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
+
+if DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX:
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = _get_env_list(
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX
+ )
+
+if ALLOWED_ORIGINS:
+ ALLOWED_ORIGINS = _get_env_list(ALLOWED_ORIGINS)
+
+if ALLOWED_HOSTS:
+ ALLOWED_HOSTS = _get_env_list(ALLOWED_HOSTS)
+
+if textract_language_choices:
+ textract_language_choices = _get_env_list(textract_language_choices)
+if aws_comprehend_language_choices:
+ aws_comprehend_language_choices = _get_env_list(aws_comprehend_language_choices)
+
+if MAPPED_LANGUAGE_CHOICES:
+ MAPPED_LANGUAGE_CHOICES = _get_env_list(MAPPED_LANGUAGE_CHOICES)
+if LANGUAGE_CHOICES:
+ LANGUAGE_CHOICES = _get_env_list(LANGUAGE_CHOICES)
+
+LANGUAGE_MAP = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))
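+# Illustrative result: LANGUAGE_MAP maps display names to language codes,
+# e.g. {"english": "en", "french": "fr", "german": "de", ...}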
diff --git a/tools/custom_csvlogger.py b/tools/custom_csvlogger.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd05839d7a4fd28fcf0963fd5d72a70e7876d609
--- /dev/null
+++ b/tools/custom_csvlogger.py
@@ -0,0 +1,335 @@
+from __future__ import annotations
+
+import csv
+import os
+import time
+import uuid
+from collections.abc import Sequence
+from datetime import datetime
+from pathlib import Path
+
+# from multiprocessing import Lock
+from threading import Lock
+from typing import TYPE_CHECKING, Any
+
+import boto3
+import botocore
+from gradio import utils
+from gradio_client import utils as client_utils
+
+from tools.config import AWS_ACCESS_KEY, AWS_REGION, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
+
+if TYPE_CHECKING:
+ from gradio.components import Component
+
+from gradio.flagging import FlaggingCallback
+
+
+class CSVLogger_custom(FlaggingCallback):
+ """
+    A custom version of the default CSVLogger FlaggingCallback implementation in gradio>=5.0. Each flagged
+    sample (both the input and output data) is logged to a CSV file with headers on the machine running
+    the gradio app. Unlike ClassicCSVLogger, this implementation is concurrent-safe and creates a new
+    dataset file every time the headers of the CSV (derived from the labels of the components) change. It
+    also only creates columns for "username" and "flag" if the flag_option and username are provided, respectively.
+
+ Example:
+ import gradio as gr
+ def image_classifier(inp):
+ return {'cat': 0.3, 'dog': 0.7}
+    demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
+                        flagging_callback=CSVLogger_custom())
+ Guides: using-flagging
+ """
+
+ def __init__(
+ self,
+ simplify_file_data: bool = True,
+ verbose: bool = False,
+ dataset_file_name: str | None = None,
+ ):
+ """
+ Parameters:
+ simplify_file_data: If True, the file data will be simplified before being written to the CSV file. If CSVLogger is being used to cache examples, this is set to False to preserve the original FileData class
+ verbose: If True, prints messages to the console about the dataset file creation
+ dataset_file_name: The name of the dataset file to be created (should end in ".csv"). If None, the dataset file will be named "dataset1.csv" or the next available number.
+ """
+ self.simplify_file_data = simplify_file_data
+ self.verbose = verbose
+ self.dataset_file_name = dataset_file_name
+ self.lock = Lock()
+
+ def setup(
+ self,
+ components: Sequence[Component],
+ flagging_dir: str | Path,
+ ):
+ self.components = components
+ self.flagging_dir = Path(flagging_dir)
+ self.first_time = True
+
+ def _create_dataset_file(
+ self,
+ additional_headers: list[str] | None = None,
+ replacement_headers: list[str] | None = None,
+ ):
+ os.makedirs(self.flagging_dir, exist_ok=True)
+
+ if replacement_headers:
+ if additional_headers is None:
+ additional_headers = list()
+
+ if len(replacement_headers) != len(self.components):
+ raise ValueError(
+ f"replacement_headers must have the same length as components "
+ f"({len(replacement_headers)} provided, {len(self.components)} expected)"
+ )
+ headers = replacement_headers + additional_headers + ["timestamp"]
+ else:
+ if additional_headers is None:
+ additional_headers = []
+ headers = (
+ [
+ getattr(component, "label", None) or f"component {idx}"
+ for idx, component in enumerate(self.components)
+ ]
+ + additional_headers
+ + ["timestamp"]
+ )
+
+ headers = utils.sanitize_list_for_csv(headers)
+ dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
+
+ if self.dataset_file_name:
+ self.dataset_filepath = self.flagging_dir / self.dataset_file_name
+ elif dataset_files:
+ try:
+ from tools.secure_regex_utils import (
+ safe_extract_latest_number_from_filename,
+ )
+
+ latest_file = max(
+ dataset_files,
+ key=lambda f: safe_extract_latest_number_from_filename(f.stem) or 0,
+ )
+ latest_num = (
+ safe_extract_latest_number_from_filename(latest_file.stem) or 0
+ )
+
+ with open(latest_file, newline="", encoding="utf-8") as csvfile:
+ reader = csv.reader(csvfile)
+ existing_headers = next(reader, None)
+
+ if existing_headers != headers:
+ new_num = latest_num + 1
+ self.dataset_filepath = self.flagging_dir / f"dataset{new_num}.csv"
+ else:
+ self.dataset_filepath = latest_file
+ except Exception:
+ self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+ else:
+ self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+
+ if not Path(self.dataset_filepath).exists():
+ with open(
+ self.dataset_filepath, "w", newline="", encoding="utf-8"
+ ) as csvfile:
+ writer = csv.writer(csvfile)
+ writer.writerow(utils.sanitize_list_for_csv(headers))
+ if self.verbose:
+ print("Created dataset file at:", self.dataset_filepath)
+ elif self.verbose:
+ print("Using existing dataset file at:", self.dataset_filepath)
+
+ def flag(
+ self,
+ flag_data: list[Any],
+ flag_option: str | None = None,
+ username: str | None = None,
+ save_to_csv: bool = True,
+ save_to_dynamodb: bool = False,
+ dynamodb_table_name: str | None = None,
+        dynamodb_headers: list[str] | None = None,  # Optional explicit headers for the DynamoDB item
+ replacement_headers: list[str] | None = None,
+ ) -> int:
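+        """
+        Log one flagged sample to the dataset CSV and, optionally, to an AWS DynamoDB table.
+
+        On the first call, the dataset file is created (or reused) with headers derived from
+        the component labels or replacement_headers. Returns the number of data rows in the
+        CSV after writing, or -1 when save_to_csv is False.
+        """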
+ if self.first_time:
+ additional_headers = list()
+ if flag_option is not None:
+ additional_headers.append("flag")
+ if username is not None:
+ additional_headers.append("username")
+ additional_headers.append("id")
+ self._create_dataset_file(
+ additional_headers=additional_headers,
+ replacement_headers=replacement_headers,
+ )
+ self.first_time = False
+
+ csv_data = list()
+ for idx, (component, sample) in enumerate(
+ zip(self.components, flag_data, strict=False)
+ ):
+ save_dir = (
+ self.flagging_dir
+ / client_utils.strip_invalid_filename_characters(
+ getattr(component, "label", None) or f"component {idx}"
+ )
+ )
+ if utils.is_prop_update(sample):
+ csv_data.append(str(sample))
+ else:
+ data = (
+ component.flag(sample, flag_dir=save_dir)
+ if sample is not None
+ else ""
+ )
+ if self.simplify_file_data:
+ data = utils.simplify_file_data_in_str(data)
+ csv_data.append(data)
+
+ if flag_option is not None:
+ csv_data.append(flag_option)
+ if username is not None:
+ csv_data.append(username)
+
+ generated_id = str(uuid.uuid4())
+ csv_data.append(generated_id)
+
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
+ :-3
+ ] # Correct format for Amazon Athena
+ csv_data.append(timestamp)
+
+ # Build the headers
+ headers = [
+ getattr(component, "label", None) or f"component {idx}"
+ for idx, component in enumerate(self.components)
+ ]
+ if flag_option is not None:
+ headers.append("flag")
+ if username is not None:
+ headers.append("username")
+ headers.append("id")
+ headers.append("timestamp")
+
+ line_count = -1
+
+ if save_to_csv:
+ with self.lock:
+ with open(
+ self.dataset_filepath, "a", newline="", encoding="utf-8"
+ ) as csvfile:
+ writer = csv.writer(csvfile)
+ writer.writerow(utils.sanitize_list_for_csv(csv_data))
+ with open(self.dataset_filepath, encoding="utf-8") as csvfile:
+ line_count = len(list(csv.reader(csvfile))) - 1
+
+ if save_to_dynamodb is True:
+
+ if RUN_AWS_FUNCTIONS:
+ try:
+ # print("Connecting to DynamoDB via existing SSO connection")
+ dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
+
+ dynamodb.meta.client.list_tables()
+
+ except Exception as e:
+ print("No SSO credentials found:", e)
+ if AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ # print(
+ # "Trying to get DynamoDB credentials from environment variables"
+ # )
+ dynamodb = boto3.resource(
+ "dynamodb",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=AWS_REGION,
+ )
+ else:
+ raise Exception(
+ "AWS credentials for DynamoDB logging not found"
+ )
+ else:
+ raise Exception("AWS credentials for DynamoDB logging not found")
+
+ if dynamodb_table_name is None:
+ raise ValueError(
+ "You must provide a dynamodb_table_name if save_to_dynamodb is True"
+ )
+
+            # Prefer explicitly provided dynamodb_headers, then replacement_headers,
+            # then the headers built above from the component labels.
+            if not dynamodb_headers:
+                if replacement_headers:
+                    dynamodb_headers = replacement_headers
+                elif headers:
+                    dynamodb_headers = headers
+                else:
+                    raise ValueError(
+                        "Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table."
+                    )
+
+ if flag_option is not None:
+ if "flag" not in dynamodb_headers:
+ dynamodb_headers.append("flag")
+ if username is not None:
+ if "username" not in dynamodb_headers:
+ dynamodb_headers.append("username")
+ if "timestamp" not in dynamodb_headers:
+ dynamodb_headers.append("timestamp")
+ if "id" not in dynamodb_headers:
+ dynamodb_headers.append("id")
+
+            # Check whether the table exists; create it if not
+ try:
+ table = dynamodb.Table(dynamodb_table_name)
+ table.load()
+ except botocore.exceptions.ClientError as e:
+ if e.response["Error"]["Code"] == "ResourceNotFoundException":
+
+ attribute_definitions = [
+ {
+ "AttributeName": "id",
+ "AttributeType": "S",
+ } # Only define key attributes here
+ ]
+
+ table = dynamodb.create_table(
+ TableName=dynamodb_table_name,
+ KeySchema=[
+ {"AttributeName": "id", "KeyType": "HASH"} # Partition key
+ ],
+ AttributeDefinitions=attribute_definitions,
+ BillingMode="PAY_PER_REQUEST",
+ )
+ # Wait until the table exists
+ table.meta.client.get_waiter("table_exists").wait(
+ TableName=dynamodb_table_name
+ )
+ time.sleep(5)
+ print(f"Table '{dynamodb_table_name}' created successfully.")
+ else:
+ raise
+
+ # Prepare the DynamoDB item to upload
+
+ try:
+ item = {
+ "id": str(generated_id), # UUID primary key
+ "timestamp": timestamp,
+ }
+
+ # Map the headers to values
+ item.update(
+ {
+ header: str(value)
+ for header, value in zip(dynamodb_headers, csv_data)
+ }
+ )
+
+ table.put_item(Item=item)
+
+ # print("Successfully uploaded log to DynamoDB")
+ except Exception as e:
+ print("Could not upload log to DynamobDB due to", e)
+
+ return line_count
diff --git a/tools/custom_image_analyser_engine.py b/tools/custom_image_analyser_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7b0acd4b1a4c975ef9c22e83b5c955de77ae4fd
--- /dev/null
+++ b/tools/custom_image_analyser_engine.py
@@ -0,0 +1,12872 @@
+import ast
+import base64
+import copy
+import io
+import json
+import math
+import os
+import re
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+from copy import deepcopy
+from dataclasses import dataclass
+from datetime import datetime
+from functools import partial
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import botocore
+import cv2
+import gradio as gr
+import numpy as np
+import pandas as pd
+import pytesseract
+import requests
+import spaces
+from pdfminer.layout import LTChar
+from PIL import Image, ImageDraw, ImageFont
+from presidio_analyzer import AnalyzerEngine, RecognizerResult
+
+from tools.config import (
+ AWS_LLM_PII_OPTION,
+ AWS_PII_OPTION,
+ CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE,
+ CLOUD_LLM_PII_MODEL_CHOICE,
+ CONVERT_LINE_TO_WORD_LEVEL,
+ DEFAULT_INFERENCE_SERVER_VLM_MODEL,
+ DEFAULT_LANGUAGE,
+ DEFAULT_NEW_BATCH_CHAR_COUNT,
+ DEFAULT_NEW_BATCH_WORD_COUNT,
+ FULL_COMPREHEND_ENTITY_LIST,
+ HYBRID_OCR_CONFIDENCE_THRESHOLD,
+ HYBRID_OCR_MAX_NEW_TOKENS,
+ HYBRID_OCR_MAX_WORDS,
+ HYBRID_OCR_PADDING,
+ IMAGES_DPI,
+ INFERENCE_SERVER_API_URL,
+ INFERENCE_SERVER_DISABLE_THINKING,
+ INFERENCE_SERVER_LLM_PII_MODEL_CHOICE,
+ INFERENCE_SERVER_MODEL_NAME,
+ INFERENCE_SERVER_PII_OPTION,
+ INFERENCE_SERVER_TIMEOUT,
+ LINE_TO_WORD_SEGMENT_MAX_WORKERS,
+ LLM_MAX_NEW_TOKENS,
+ LLM_TEMPERATURE,
+ LOAD_PADDLE_AT_STARTUP,
+ LOCAL_OCR_MODEL_OPTIONS,
+ LOCAL_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ MAX_NEW_TOKENS,
+ MAX_SPACES_GPU_RUN_TIME,
+ MAX_WORKERS,
+ MERGE_BOUNDING_BOXES,
+ OUTPUT_FOLDER,
+ PADDLE_DET_DB_UNCLIP_RATIO,
+ PADDLE_FONT_PATH,
+ PADDLE_MODEL_PATH,
+ PADDLE_USE_TEXTLINE_ORIENTATION,
+ PREPARE_PAGE_FOR_HYBRID_VLM_BEFORE_PADDLE,
+ PREPROCESS_LOCAL_OCR_IMAGES,
+ REPORT_VLM_OUTPUTS_TO_GUI,
+ SAVE_EXAMPLE_HYBRID_IMAGES,
+ SAVE_PAGE_OCR_VISUALISATIONS,
+ SAVE_PREPROCESS_IMAGES,
+ SAVE_TEXTRACT_BEDROCK_HYBRID_EXAMPLES,
+ SAVE_VLM_INPUT_IMAGES,
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL,
+ TESSERACT_SEGMENTATION_LEVEL,
+ TESSERACT_WORD_LEVEL_OCR,
+ USE_LLAMA_SWAP,
+ USE_TRANSFORMERS_VLM_MODEL_AS_LLM,
+ VLM_DEFAULT_STREAM,
+ VLM_HYBRID_MIN_IMAGE_SIZE,
+ VLM_MAX_ASPECT_RATIO,
+ VLM_MAX_DPI,
+ VLM_MAX_IMAGE_SIZE,
+ VLM_MIN_DPI,
+ VLM_MIN_IMAGE_SIZE,
+)
+from tools.helper_functions import clean_unicode_text, get_system_font_path
+from tools.llm_funcs import _extract_choice_message_text
+from tools.load_spacy_model_custom_recognisers import custom_entities
+from tools.presidio_analyzer_custom import recognizer_result_from_dict
+from tools.run_vlm import (
+ extract_text_from_image_vlm,
+ full_page_ocr_people_vlm_prompt,
+ full_page_ocr_signature_vlm_prompt,
+ full_page_ocr_vlm_prompt,
+ model_default_do_sample,
+ model_default_max_new_tokens,
+ model_default_min_p,
+ model_default_presence_penalty,
+ model_default_prompt,
+ model_default_repetition_penalty,
+ model_default_seed,
+ model_default_temperature,
+ model_default_top_k,
+ model_default_top_p,
+)
+from tools.secure_path_utils import validate_folder_containment
+from tools.secure_regex_utils import safe_sanitize_text
+from tools.word_segmenter import AdaptiveSegmenter
+
+# AWS Comprehend billing: 1 unit = 100 characters (entity recognition, PII, etc.)
+COMPREHEND_CHARACTERS_PER_UNIT = 100
+
+# Phrase-ending punctuation marks (batch boundaries)
+PHRASE_ENDING_PUNCTUATION = {".", "!", "?", ";", ":"}
+
+# When Bedrock VLM word count differs from Textract by this many or less, we still
+# accept Bedrock text and derive word-level boxes from the Textract line bbox via
+# line-to-word segmentation.
+MAX_WORD_COUNT_DIFF_FOR_LINE_DERIVED_WORDS = 6
+
+
+def ends_with_phrase_punctuation(word: str) -> bool:
+ """Check if a word ends with phrase-ending punctuation."""
+ if not word:
+ return False
+ # Check if the word ends with any phrase-ending punctuation
+ return any(word.rstrip().endswith(punct) for punct in PHRASE_ENDING_PUNCTUATION)
+
+
+if LOAD_PADDLE_AT_STARTUP:
+ # Set PaddleOCR font path BEFORE importing to prevent font downloads during import
+ if (
+ PADDLE_FONT_PATH
+ and PADDLE_FONT_PATH.strip()
+ and os.path.exists(PADDLE_FONT_PATH)
+ ):
+ os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = PADDLE_FONT_PATH
+ else:
+ system_font_path = get_system_font_path()
+ if system_font_path:
+ os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = system_font_path
+
+ try:
+ from paddleocr import PaddleOCR
+
+ print("PaddleOCR imported successfully")
+ except Exception as e:
+ print(f"Error importing PaddleOCR: {e}")
+ PaddleOCR = None
+else:
+ PaddleOCR = None
+
+
+# --- Language utilities ---
+def _normalize_lang(language: str) -> str:
+ return language.strip().lower().replace("-", "_") if language else "en"
+
+
+def _tesseract_lang_code(language: str) -> str:
+ """Map a user language input to a Tesseract traineddata code."""
+ lang = _normalize_lang(language)
+
+ mapping = {
+ # Common
+ "en": "eng",
+ "eng": "eng",
+ "fr": "fra",
+ "fre": "fra",
+ "fra": "fra",
+ "de": "deu",
+ "ger": "deu",
+ "deu": "deu",
+ "es": "spa",
+ "spa": "spa",
+ "it": "ita",
+ "ita": "ita",
+ "nl": "nld",
+ "dut": "nld",
+ "nld": "nld",
+ "pt": "por",
+ "por": "por",
+ "ru": "rus",
+ "rus": "rus",
+ "ar": "ara",
+ "ara": "ara",
+ # Nordics
+ "sv": "swe",
+ "swe": "swe",
+ "no": "nor",
+ "nb": "nor",
+ "nn": "nor",
+ "nor": "nor",
+ "fi": "fin",
+ "fin": "fin",
+ "da": "dan",
+ "dan": "dan",
+ # Eastern/Central
+ "pl": "pol",
+ "pol": "pol",
+ "cs": "ces",
+ "cz": "ces",
+ "ces": "ces",
+ "hu": "hun",
+ "hun": "hun",
+ "ro": "ron",
+ "rum": "ron",
+ "ron": "ron",
+ "bg": "bul",
+ "bul": "bul",
+ "el": "ell",
+ "gre": "ell",
+ "ell": "ell",
+ # Asian
+ "ja": "jpn",
+ "jp": "jpn",
+ "jpn": "jpn",
+ "zh": "chi_sim",
+ "zh_cn": "chi_sim",
+ "zh_hans": "chi_sim",
+ "chi_sim": "chi_sim",
+ "zh_tw": "chi_tra",
+ "zh_hk": "chi_tra",
+ "zh_tr": "chi_tra",
+ "chi_tra": "chi_tra",
+ "hi": "hin",
+ "hin": "hin",
+ "bn": "ben",
+ "ben": "ben",
+ "ur": "urd",
+ "urd": "urd",
+ "fa": "fas",
+ "per": "fas",
+ "fas": "fas",
+ }
+
+ return mapping.get(lang, "eng")
+
+
+def _paddle_lang_code(language: str) -> str:
+ """Map a user language input to a PaddleOCR language code.
+
+ PaddleOCR supports codes like: 'en', 'ch', 'chinese_cht', 'korean', 'japan', 'german', 'fr', 'it', 'es',
+ as well as script packs like 'arabic', 'cyrillic', 'latin'.
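+
+    For example (per the mapping below), "ru" resolves to the "cyrillic" script pack and any
+    unrecognised code falls back to "en".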
+ """
+ lang = _normalize_lang(language)
+
+ mapping = {
+ "en": "en",
+ "fr": "fr",
+ "de": "german",
+ "es": "es",
+ "it": "it",
+ "pt": "pt",
+ "nl": "nl",
+ "ru": "cyrillic", # Russian is covered by cyrillic models
+ "uk": "cyrillic",
+ "bg": "cyrillic",
+ "sr": "cyrillic",
+ "ar": "arabic",
+ "tr": "tr",
+ "fa": "arabic", # fallback to arabic script pack
+ "zh": "ch",
+ "zh_cn": "ch",
+ "zh_tw": "chinese_cht",
+ "zh_hk": "chinese_cht",
+ "ja": "japan",
+ "jp": "japan",
+ "ko": "korean",
+ "hi": "latin", # fallback; dedicated Hindi not always available
+ }
+
+ return mapping.get(lang, "en")
+
+
+@dataclass
+class OCRResult:
+ text: str
+ left: int
+ top: int
+ width: int
+ height: int
+ conf: float = None
+ line: int = None
+ model: str = (
+ None # Track which OCR model was used (e.g., "Tesseract", "Paddle", "VLM")
+ )
+
+
+@dataclass
+class CustomImageRecognizerResult:
+ entity_type: str
+ start: int
+ end: int
+ score: float
+ left: int
+ top: int
+ width: int
+ height: int
+ text: str
+ color: tuple = (0, 0, 0)
+
+
+class ImagePreprocessor:
+ """ImagePreprocessor class. Parent class for image preprocessing objects."""
+
+ def __init__(self, use_greyscale: bool = True) -> None:
+ self.use_greyscale = use_greyscale
+
+ def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
+ return image, {}
+
+ def convert_image_to_array(self, image: Image.Image) -> np.ndarray:
+ if isinstance(image, np.ndarray):
+ img = image
+ else:
+ if self.use_greyscale:
+ image = image.convert("L")
+ img = np.asarray(image)
+ return img
+
+ @staticmethod
+ def _get_bg_color(
+ image: np.ndarray, is_greyscale: bool, invert: bool = False
+ ) -> Union[int, Tuple[int, int, int]]:
+ # Note: Modified to expect numpy array for bincount
+ if invert:
+ image = 255 - image # Simple inversion for greyscale numpy array
+
+ if is_greyscale:
+ bg_color = int(np.bincount(image.flatten()).argmax())
+ else:
+ # This part would need more complex logic for color numpy arrays
+ # For this pipeline, we only use greyscale, so it's fine.
+ # A simple alternative:
+ from scipy import stats
+
+ bg_color = tuple(stats.mode(image.reshape(-1, 3), axis=0)[0][0])
+ return bg_color
+
+ @staticmethod
+ def _get_image_contrast(image: np.ndarray) -> Tuple[float, float]:
+ contrast = np.std(image)
+ mean_intensity = np.mean(image)
+ return contrast, mean_intensity
+
+
+class BilateralFilter(ImagePreprocessor):
+ """Applies bilateral filtering."""
+
+ def __init__(
+ self, diameter: int = 9, sigma_color: int = 75, sigma_space: int = 75
+ ) -> None:
+ super().__init__(use_greyscale=True)
+ self.diameter = diameter
+ self.sigma_color = sigma_color
+ self.sigma_space = sigma_space
+
+ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, dict]:
+ # Modified to accept and return numpy array for consistency in the pipeline
+ filtered_image = cv2.bilateralFilter(
+ image, self.diameter, self.sigma_color, self.sigma_space
+ )
+ metadata = {
+ "diameter": self.diameter,
+ "sigma_color": self.sigma_color,
+ "sigma_space": self.sigma_space,
+ }
+ return filtered_image, metadata
+
+
+class SegmentedAdaptiveThreshold(ImagePreprocessor):
+ """Applies adaptive thresholding."""
+
+ def __init__(
+ self,
+ block_size: int = 21,
+ contrast_threshold: int = 40,
+ c_low_contrast: int = 5,
+ c_high_contrast: int = 10,
+ bg_threshold: int = 127,
+ ) -> None:
+ super().__init__(use_greyscale=True)
+ self.block_size = (
+ block_size if block_size % 2 == 1 else block_size + 1
+ ) # Ensure odd
+ self.c_low_contrast = c_low_contrast
+ self.c_high_contrast = c_high_contrast
+ self.bg_threshold = bg_threshold
+ self.contrast_threshold = contrast_threshold
+
+ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, dict]:
+ # Modified to accept and return numpy array
+ background_color = self._get_bg_color(image, True)
+ contrast, _ = self._get_image_contrast(image)
+ c = (
+ self.c_low_contrast
+ if contrast <= self.contrast_threshold
+ else self.c_high_contrast
+ )
+
+ if background_color < self.bg_threshold: # Dark background, light text
+ adaptive_threshold_image = cv2.adaptiveThreshold(
+ image,
+ 255,
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+ cv2.THRESH_BINARY_INV,
+ self.block_size,
+ -c,
+ )
+ else: # Light background, dark text
+ adaptive_threshold_image = cv2.adaptiveThreshold(
+ image,
+ 255,
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+ cv2.THRESH_BINARY,
+ self.block_size,
+ c,
+ )
+ metadata = {"C": c, "background_color": background_color, "contrast": contrast}
+ return adaptive_threshold_image, metadata
+
+
+class ImageRescaling(ImagePreprocessor):
+ """Rescales images based on their size."""
+
+ def __init__(self, target_dpi: int = 300, assumed_input_dpi: int = 96) -> None:
+ super().__init__(use_greyscale=True)
+ self.target_dpi = target_dpi
+ self.assumed_input_dpi = assumed_input_dpi
+
+ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, dict]:
+ # Modified to accept and return numpy array
+ scale_factor = self.target_dpi / self.assumed_input_dpi
+ metadata = {"scale_factor": 1.0}
+
+ if scale_factor != 1.0:
+ width = int(image.shape[1] * scale_factor)
+ height = int(image.shape[0] * scale_factor)
+ dimensions = (width, height)
+
+ # Use better interpolation for upscaling vs downscaling
+ interpolation = cv2.INTER_CUBIC if scale_factor > 1.0 else cv2.INTER_AREA
+ rescaled_image = cv2.resize(image, dimensions, interpolation=interpolation)
+ metadata["scale_factor"] = scale_factor
+ return rescaled_image, metadata
+
+ return image, metadata
+
+
+class ContrastSegmentedImageEnhancer(ImagePreprocessor):
+ """Class containing all logic to perform contrastive segmentation."""
+
+ def __init__(
+ self,
+ bilateral_filter: Optional[BilateralFilter] = None,
+ adaptive_threshold: Optional[SegmentedAdaptiveThreshold] = None,
+ image_rescaling: Optional[ImageRescaling] = None,
+ low_contrast_threshold: int = 40,
+ ) -> None:
+ super().__init__(use_greyscale=True)
+ self.bilateral_filter = bilateral_filter or BilateralFilter()
+ self.adaptive_threshold = adaptive_threshold or SegmentedAdaptiveThreshold()
+ self.image_rescaling = image_rescaling or ImageRescaling()
+ self.low_contrast_threshold = low_contrast_threshold
+
+    def _improve_contrast(self, image: np.ndarray) -> Tuple[np.ndarray, float, float]:
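+        """Apply CLAHE when contrast is at or below the low-contrast threshold; returns the
+        (possibly adjusted) image plus the original and adjusted contrast values."""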
+ contrast, mean_intensity = self._get_image_contrast(image)
+ if contrast <= self.low_contrast_threshold:
+ # Using CLAHE as a generally more robust alternative
+ clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+ adjusted_image = clahe.apply(image)
+ adjusted_contrast, _ = self._get_image_contrast(adjusted_image)
+ else:
+ adjusted_image = image
+ adjusted_contrast = contrast
+ return adjusted_image, contrast, adjusted_contrast
+
+ def _deskew(self, image_np: np.ndarray) -> np.ndarray:
+ """
+ Corrects the skew of an image.
+ This method works best on a grayscaled image.
+ """
+ # We'll work with a copy for angle detection
+ gray = (
+ cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
+ if len(image_np.shape) == 3
+ else image_np.copy()
+ )
+
+ # Invert the image for contour finding
+ thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+
+ coords = np.column_stack(np.where(thresh > 0))
+ angle = cv2.minAreaRect(coords)[-1]
+
+ # Adjust the angle for rotation
+ if angle < -45:
+ angle = -(90 + angle)
+ else:
+ angle = -angle
+
+ # Don't rotate if the angle is negligible
+ if abs(angle) < 0.1:
+ return image_np
+
+ h, w = image_np.shape[:2]
+ center = (w // 2, h // 2)
+ M = cv2.getRotationMatrix2D(center, angle, 1.0)
+
+ # Use the original numpy image for the rotation to preserve quality
+ rotated = cv2.warpAffine(
+ image_np, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
+ )
+
+ return rotated
+
+ def preprocess_image(
+ self,
+ image: Image.Image,
+ perform_deskew: bool = False,
+ perform_binarization: bool = False,
+ ) -> Tuple[Image.Image, dict]:
+ """
+ A pipeline for OCR preprocessing.
+ Order: Deskew -> Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
+ """
+ # 1. Convert PIL image to NumPy array for OpenCV processing
+ # Assuming the original image is RGB
+ image_np = np.array(image.convert("RGB"))
+ # OpenCV uses BGR, so we convert RGB to BGR
+ image_np_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+
+ # --- REVISED PIPELINE ---
+
+ # 2. Deskew the image (critical new step)
+ # This is best done early on the full-quality image.
+ if perform_deskew:
+ deskewed_image_np = self._deskew(image_np_bgr)
+ else:
+ deskewed_image_np = image_np_bgr
+
+ # 3. Convert to greyscale
+        # convert_image_to_array can also do this, but convert explicitly here for clarity:
+ gray_image_np = cv2.cvtColor(deskewed_image_np, cv2.COLOR_BGR2GRAY)
+
+ # 4. Rescale image to optimal DPI
+        # image_rescaling accepts a greyscale numpy array
+ rescaled_image_np, scale_metadata = self.image_rescaling.preprocess_image(
+ gray_image_np
+ )
+
+ # 5. Apply filtering for noise reduction
+ # Suggestion: A Median filter is often very effective for scanned docs
+ # filtered_image_np = cv2.medianBlur(rescaled_image_np, 3)
+ # Or using your existing bilateral filter:
+ filtered_image_np, _ = self.bilateral_filter.preprocess_image(rescaled_image_np)
+
+ # 6. Improve contrast
+ adjusted_image_np, _, _ = self._improve_contrast(filtered_image_np)
+
+ # 7. Adaptive Thresholding (Binarization) - Final optional step
+ if perform_binarization:
+ final_image_np, threshold_metadata = (
+ self.adaptive_threshold.preprocess_image(adjusted_image_np)
+ )
+ else:
+ final_image_np = adjusted_image_np
+ threshold_metadata = {}
+
+ # Combine metadata
+ final_metadata = {**scale_metadata, **threshold_metadata}
+
+ # Convert final numpy array back to PIL Image for return
+ # The final image is greyscale, so it's safe to use 'L' mode
+ return Image.fromarray(final_image_np).convert("L"), final_metadata
+
+
+def rescale_ocr_data(ocr_data, scale_factor: float):
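+    """
+    Map pytesseract image_to_data box coordinates from a rescaled image back to the
+    original image by dividing left/top/width/height by scale_factor (in place).
+    """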
+
+ # We loop from 0 to the number of detected words.
+ num_boxes = len(ocr_data["text"])
+ for i in range(num_boxes):
+ # We only want to process actual words, not empty boxes Tesseract might find
+ if int(ocr_data["conf"][i]) > -1: # -1 confidence is for structural elements
+ # Get coordinates from the processed image using the index 'i'
+ x_proc = ocr_data["left"][i]
+ y_proc = ocr_data["top"][i]
+ w_proc = ocr_data["width"][i]
+ h_proc = ocr_data["height"][i]
+
+ # Apply the inverse transformation (division)
+ x_orig = int(x_proc / scale_factor)
+ y_orig = int(y_proc / scale_factor)
+ w_orig = int(w_proc / scale_factor)
+ h_orig = int(h_proc / scale_factor)
+
+ # --- THE MAPPING STEP ---
+ # Update the dictionary values in-place using the same index 'i'
+ ocr_data["left"][i] = x_orig
+ ocr_data["top"][i] = y_orig
+ ocr_data["width"][i] = w_orig
+ ocr_data["height"][i] = h_orig
+
+ return ocr_data
+
+
+def filter_entities_for_language(
+ entities: List[str], valid_language_entities: List[str], language: str
+) -> List[str]:
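+    """
+    Restrict the requested entity list to those supported for the given language.
+
+    Prints a console warning when nothing usable remains and, for non-English languages,
+    reports the filtered entity list in the UI via gr.Info.
+    """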
+
+ if not valid_language_entities:
+ print(f"No valid entities supported for language: {language}")
+ # raise Warning(f"No valid entities supported for language: {language}")
+ if not entities:
+ print(f"No entities provided for language: {language}")
+ # raise Warning(f"No entities provided for language: {language}")
+
+ filtered_entities = [
+ entity for entity in entities if entity in valid_language_entities
+ ]
+
+ if not filtered_entities:
+ print(f"No relevant entities supported for language: {language}")
+ # raise Warning(f"No relevant entities supported for language: {language}")
+
+ if language != "en":
+ gr.Info(
+ f"Using {str(filtered_entities)} entities for local model analysis for language: {language}"
+ )
+
+ return filtered_entities
+
+
+def _get_tesseract_psm(segmentation_level: str) -> int:
+ """
+ Get the appropriate Tesseract PSM (Page Segmentation Mode) value based on segmentation level.
+
+ Args:
+ segmentation_level: "word" or "line"
+
+ Returns:
+ PSM value for Tesseract configuration
+ """
+ if segmentation_level.lower() == "line":
+ return 6 # Uniform block of text
+ elif segmentation_level.lower() == "word":
+ return 11 # Sparse text (word-level)
+ else:
+ print(
+ f"Warning: Unknown segmentation level '{segmentation_level}', defaulting to word-level (PSM 11)"
+ )
+ return 11
+
+
+def _extract_page_number_for_vlm_log(image_name: Optional[str]) -> Optional[int]:
+ """
+ Best-effort 0-based page index from an image basename for VLM log filenames.
+
+ Tries end-anchored patterns first, then the last ``_page_`` occurrence
+ anywhere in the name (e.g. ``doc.pdf_0_page_0000.png``).
+ """
+ if not image_name:
+ return None
+ end_patterns = (
+ r"_page_(\d+)\.(?:png|jpg|jpeg|webp|tif|tiff)$",
+ r"_page_(\d+)\.png$",
+ r"_(\d+)\.png$",
+ r"page_(\d+)\.png$",
+ )
+ for pattern in end_patterns:
+ match = re.search(pattern, image_name, re.IGNORECASE)
+ if match:
+ return int(match.group(1))
+ last_page: Optional[int] = None
+ for match in re.finditer(r"(?i)_page_(\d+)", image_name):
+ last_page = int(match.group(1))
+ return last_page
+
+
+def save_vlm_prompt_response(
+ prompt: str,
+ response_text: str,
+ output_folder: str,
+ model_choice: str,
+ image_name: Optional[str] = None,
+ page_number: Optional[int] = None,
+ temperature: Optional[float] = None,
+ max_new_tokens: Optional[int] = None,
+ top_p: Optional[float] = None,
+ model_type: str = "VLM",
+ task_suffix: Optional[str] = None,
+ input_tokens: Optional[int] = None,
+ output_tokens: Optional[int] = None,
+ image_width: Optional[int] = None,
+ image_height: Optional[int] = None,
+) -> str:
+ """
+ Save VLM prompt and response to a text file for traceability.
+
+ Args:
+ prompt: Prompt sent to VLM
+ response_text: Response text from VLM
+ output_folder: Output folder path
+ model_choice: Model used
+ image_name: Optional image name (without extension) for the filename
+ page_number: Optional 0-based page index for the filename (overrides parsing
+ ``image_name``). Displayed in the log body as 1-based when set or parsed.
+ temperature: Temperature used (if applicable)
+ max_new_tokens: Max tokens used (if applicable)
+ top_p: Top-p parameter used (if applicable)
+ model_type: Type of model (e.g., "VLM", "Bedrock", "Inference Server", "Gemini", "Azure/OpenAI")
+ task_suffix: Optional suffix to add to filename (e.g., "_person", "_sig") to distinguish task types
+ input_tokens: Input token count (API usage where available; local/estimated for Transformers)
+ output_tokens: Output token count (same)
+ image_width: Pixel width of the image sent to the VLM (after any resize/pad in the pipeline)
+ image_height: Pixel height of the image sent to the VLM
+
+ Returns:
+ Path to the saved file
+ """
+ # Create VLM logs subfolder
+ vlm_logs_folder = os.path.join(output_folder, "vlm_prompts_responses")
+ os.makedirs(vlm_logs_folder, exist_ok=True)
+
+ # Add task suffix to filename if provided
+ suffix_str = f"_{task_suffix}" if task_suffix else ""
+
+ effective_page: Optional[int] = page_number
+ if effective_page is None and image_name:
+ effective_page = _extract_page_number_for_vlm_log(image_name)
+
+ # Filenames always include a page segment: _page_NNNN or _page_unknown
+ if image_name:
+ safe_image_name = "".join(
+ c for c in image_name if c.isalnum() or c in (" ", "-", "_", ".")
+ ).strip()
+ safe_image_name = safe_image_name.replace(" ", "_")
+ safe_image_name = safe_image_name.rsplit(".", 1)[0]
+ if isinstance(effective_page, int):
+ page_part = f"_page_{effective_page:04d}"
+ else:
+ page_part = "_page_unknown"
+ filename = f"vlm_{safe_image_name}{page_part}{suffix_str}_{model_type.lower().replace(' ', '_')}.txt"
+ else:
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ if isinstance(effective_page, int):
+ page_part = f"_page_{effective_page:04d}"
+ else:
+ page_part = "_page_unknown"
+ filename = f"vlm_{model_type.lower().replace(' ', '_')}{page_part}{suffix_str}_{timestamp}.txt"
+ filepath = os.path.join(vlm_logs_folder, filename)
+
+ # Write prompt and response to file
+ with open(filepath, "w", encoding="utf-8") as f:
+ f.write("=" * 80 + "\n")
+ f.write("VLM OCR - PROMPT AND RESPONSE LOG\n")
+ f.write("=" * 80 + "\n\n")
+
+ f.write(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+ if image_name:
+ f.write(f"Image: {image_name}\n")
+ if isinstance(effective_page, int):
+ f.write(f"Page: {effective_page + 1}\n")
+ if image_width is not None and image_height is not None:
+ f.write(
+ f"Image input size (pixels): {image_width} x {image_height} (width x height)\n"
+ )
+ elif image_width is not None or image_height is not None:
+ f.write(
+ f"Image input size (pixels): width={image_width}, height={image_height}\n"
+ )
+ f.write(
+ f"Input tokens: {input_tokens if input_tokens is not None else '(not reported)'}\n"
+ )
+ f.write(
+ f"Output tokens: {output_tokens if output_tokens is not None else '(not reported)'}\n"
+ )
+ f.write(f"Model: {model_choice}\n")
+ f.write(f"Model Type: {model_type}\n")
+ if temperature is not None:
+ f.write(f"Temperature: {temperature}\n")
+ if max_new_tokens is not None:
+ f.write(f"Max New Tokens: {max_new_tokens}\n")
+ if top_p is not None:
+ f.write(f"Top-p: {top_p}\n")
+ f.write("\n" + "=" * 80 + "\n")
+ f.write("PROMPT\n")
+ f.write("=" * 80 + "\n\n")
+ f.write(prompt)
+ f.write("\n\n" + "=" * 80 + "\n")
+ f.write("VLM RESPONSE\n")
+ f.write("=" * 80 + "\n\n")
+ f.write(response_text)
+ f.write("\n\n" + "=" * 80 + "\n")
+ f.write("END OF LOG\n")
+ f.write("=" * 80 + "\n")
+
+ return filepath
+
+
+def _exif_resolution_to_float(value: Any) -> Optional[float]:
+ """Convert EXIF/TIFF resolution (RATIONAL) to float."""
+ if value is None:
+ return None
+ try:
+ if hasattr(value, "numerator") and hasattr(value, "denominator"):
+ d = float(value.denominator)
+ if d == 0:
+ return None
+ return float(value.numerator) / d
+ if isinstance(value, tuple) and len(value) == 2:
+ d = float(value[1])
+ if d == 0:
+ return None
+ return float(value[0]) / d
+ return float(value)
+ except (TypeError, ValueError):
+ return None
+
+
+def _best_effort_pil_dpi(image: Image.Image, fallback: float = 72.0) -> float:
+ """
+ Best-effort DPI from PIL metadata before defaulting to ``fallback``.
+
+ Tries, in order: ``info['dpi']``, JPEG JFIF density/unit, EXIF X/Y resolution,
+ TIFF ``tag_v2`` resolution tags.
+ """
+
+ def _positive_max_pair(a: Any, b: Any) -> Optional[float]:
+ try:
+ x = float(a) if a is not None else 0.0
+ y = float(b) if b is not None else 0.0
+ m = max(x, y)
+ return m if m > 0 else None
+ except (TypeError, ValueError):
+ return None
+
+ dpi_raw = image.info.get("dpi")
+ if dpi_raw is not None:
+ if isinstance(dpi_raw, tuple) and len(dpi_raw) >= 2:
+ v = _positive_max_pair(dpi_raw[0], dpi_raw[1])
+ if v is not None:
+ return v
+ else:
+ try:
+ f = float(dpi_raw)
+ if f > 0:
+ return f
+ except (TypeError, ValueError):
+ pass
+
+ jfif_unit = image.info.get("jfif_unit")
+ jden = image.info.get("jfif_density")
+ if jden is not None and isinstance(jden, (tuple, list)) and len(jden) >= 2:
+ v = _positive_max_pair(jden[0], jden[1])
+ if v is not None:
+ if jfif_unit == 1:
+ return v
+ if jfif_unit == 2:
+ return v * 2.54
+
+ try:
+ exif = image.getexif()
+ if exif:
+ xres = _exif_resolution_to_float(exif.get(282))
+ yres = _exif_resolution_to_float(exif.get(283))
+ if xres is not None and yres is not None and xres > 0 and yres > 0:
+ d = max(xres, yres)
+ unit = exif.get(296, 2)
+ if unit == 2 or unit is None:
+ return d
+ if unit == 3:
+ return d * 2.54
+ except Exception:
+ pass
+
+ try:
+ tv = getattr(image, "tag_v2", None)
+ if tv is not None:
+ xres = _exif_resolution_to_float(tv.get(282))
+ yres = _exif_resolution_to_float(tv.get(283))
+ if xres is not None and yres is not None and xres > 0 and yres > 0:
+ d = max(xres, yres)
+ unit = tv.get(296)
+ if unit == 2 or unit is None:
+ return d
+ if unit == 3:
+ return d * 2.54
+ except Exception:
+ pass
+
+ # No DPI metadata found (or parse failed) across known PIL/JFIF/EXIF/TIFF fields.
+ # Make this explicit in logs so "reported DPI" isn't mistaken for a detected value.
+ try:
+ w, h = image.size
+ except Exception:
+ w, h = None, None
+ try:
+ img_format = getattr(image, "format", None)
+ except Exception:
+ img_format = None
+ print(
+ "VLM image preparation: DPI metadata not found; "
+ f"using fallback {fallback:.1f} DPI"
+ + (
+ f" ({w}x{h}{', ' + str(img_format) if img_format else ''})"
+ if w and h
+ else ""
+ )
+ )
+ return fallback
+
+
+def _save_image_with_config_dpi(image: Image.Image, path: str, **kwargs: Any) -> None:
+ """
+ Write a PIL image to disk with DPI metadata from ``IMAGES_DPI`` (PNG pHYs, JPEG JFIF).
+ Extra kwargs are forwarded to ``Image.save`` (e.g. format=, optimize=).
+ """
+ _d = max(1, int(round(float(IMAGES_DPI))))
+ kw = dict(kwargs)
+ kw.pop("dpi", None)
+ image.save(path, dpi=(_d, _d), **kw)
+
+
+def _prepare_image_for_vlm(
+ image: Image.Image,
+ ocr_method: Optional[str] = None,
+ max_image_size: Optional[int] = VLM_MAX_IMAGE_SIZE,
+ hybrid_vlm: bool = False,
+) -> Image.Image:
+ """
+ Prepare image for VLM: enforce pixel count and reported DPI bounds.
+
+ Scaling by factor ``s`` updates effective DPI as ``reported_dpi * s`` (same physical
+ document size). Chooses ``s`` so that:
+
+ - ``VLM_MIN_IMAGE_SIZE`` (full page) or ``VLM_HYBRID_MIN_IMAGE_SIZE`` (hybrid crops)
+ <= width*height*s^2 <= ``max_image_size``
+ - ``VLM_MIN_DPI`` <= reported_dpi * s <= ``VLM_MAX_DPI``
+
+ If constraints conflict, caps (max pixels / max DPI) take precedence and a warning is printed.
+
+ Args:
+ image: PIL Image to prepare
+        ocr_method: If it contains ``bedrock`` (case-insensitive), the max pixel budget is
+            capped at 33554432 (Bedrock's limit), or ``VLM_MAX_IMAGE_SIZE`` if that is smaller.
+ max_image_size: Upper bound on total pixels (default ``VLM_MAX_IMAGE_SIZE``).
+ hybrid_vlm: If True, use ``VLM_HYBRID_MIN_IMAGE_SIZE`` as minimum pixels; otherwise
+ ``VLM_MIN_IMAGE_SIZE`` (whole-page VLM).
+
+ Returns:
+ Resized RGB-safe image when needed; DPI metadata updated after resize.
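+
+    Illustrative example (assuming VLM_MAX_IMAGE_SIZE = 4_000_000 and DPI bounds that do
+    not conflict): a 2480x3508 page reported at 300 DPI has
+    s_max_px = sqrt(4_000_000 / 8_699_840) ~= 0.68, so it is downscaled to roughly
+    1682x2379 at an effective ~203 DPI.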
+ """
+ if image is None:
+ return image
+
+ if ocr_method and "bedrock" in ocr_method.lower():
+ max_image_size = min(
+ 33554432, VLM_MAX_IMAGE_SIZE
+ ) # Bedrock has a specific max pixel budget - it will fail if exceeded
+
+ if max_image_size is None or max_image_size <= 0:
+ max_image_size = VLM_MAX_IMAGE_SIZE
+
+ min_image_size = VLM_HYBRID_MIN_IMAGE_SIZE if hybrid_vlm else VLM_MIN_IMAGE_SIZE
+ min_image_size = max(0, int(min_image_size))
+
+ dpi_lo = min(VLM_MIN_DPI, VLM_MAX_DPI)
+ dpi_hi = max(VLM_MIN_DPI, VLM_MAX_DPI)
+
+ width, height = image.size
+ area = float(width * height)
+ if area <= 0:
+ return image
+
+ current_dpi = _best_effort_pil_dpi(image, fallback=float(IMAGES_DPI))
+ if current_dpi <= 0:
+ current_dpi = float(IMAGES_DPI)
+
+ # Effective DPI after uniform scale s is current_dpi * s.
+ s_min_dpi = dpi_lo / current_dpi
+ s_max_dpi = dpi_hi / current_dpi
+
+ s_min_px = math.sqrt(min_image_size / area) if min_image_size > 0 else 0.0
+ s_max_px = math.sqrt(max_image_size / area) if max_image_size > 0 else float("inf")
+
+ s_lo = max(s_min_dpi, s_min_px)
+ s_hi = min(s_max_dpi, s_max_px)
+
+ if s_lo > s_hi:
+ print(
+ f"VLM image preparation warning: constraints conflict "
+ f"(DPI {dpi_lo:.1f}-{dpi_hi:.1f}, pixels {min_image_size}-{max_image_size}, "
+ f"reported DPI {current_dpi:.1f}, {width}x{height}). "
+ f"Using scale {s_hi:.4f} (capping size/DPI)."
+ )
+ s = s_hi
+ else:
+ s = 1.0
+ if s < s_lo:
+ s = s_lo
+ elif s > s_hi:
+ s = s_hi
+
+ if abs(s - 1.0) < 1e-6:
+ return image
+
+ new_w = max(1, int(round(width * s)))
+ new_h = max(1, int(round(height * s)))
+ new_pixels = new_w * new_h
+ achieved_dpi = current_dpi * s
+ metadata_dpi = min(dpi_hi, max(dpi_lo, achieved_dpi))
+
+ if abs(s - 1.0) > 0.02:
+ print(
+ f"VLM image preparation: {width}x{height} ({int(area):,} px, ~{current_dpi:.1f} DPI) "
+ f"-> {new_w}x{new_h} (achieved ~{achieved_dpi:.1f} DPI, {new_pixels:,} px, config requirement ~{metadata_dpi:.1f} DPI), scale {s:.4f}"
+ )
+
+ resample = Image.Resampling.LANCZOS if s < 1.0 else Image.Resampling.BICUBIC
+ image = image.resize((new_w, new_h), resample)
+ try:
+ image.info["dpi"] = (metadata_dpi, metadata_dpi)
+ except Exception:
+ pass
+ return image
+
+
+def _pad_image_for_vlm_aspect_ratio(
+ image: Image.Image,
+ max_aspect: Optional[float] = None,
+) -> Image.Image:
+ """
+ Pad image so aspect ratio max(w/h, h/w) <= max_aspect (default: ``VLM_MAX_ASPECT_RATIO``).
+ Used for Bedrock, inference-server, Gemini, Azure/OpenAI, and local transformers VLM to avoid
+ API or model issues on very long/thin hybrid crops. Returns RGB image.
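+
+    For example, with max_aspect=20 a 4000x100 crop is pasted, centred, onto a
+    4000x200 white canvas rather than being stretched.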
+ """
+ if max_aspect is None:
+ max_aspect = VLM_MAX_ASPECT_RATIO
+ if image is None:
+ return image
+ try:
+ w, h = image.size
+ if w < 1 or h < 1:
+ return image.convert("RGB") if image.mode != "RGB" else image
+ current = max(w / float(h), h / float(w))
+ if current <= max_aspect:
+ return image.convert("RGB") if image.mode != "RGB" else image
+ img = image.convert("RGB") if image.mode != "RGB" else image
+ if w >= h:
+ new_h = max(int(math.ceil(w / max_aspect)), 1)
+ if new_h != h:
+ print(
+ f"VLM aspect ratio padding: width >= height. Original size: {w}x{h}. New size: {w}x{new_h}. Applied extra padding to height to achieve aspect ratio <= {max_aspect}."
+ )
+ new_w, new_h = w, new_h
+ else:
+ new_w = max(int(math.ceil(h / max_aspect)), 1)
+ if new_w != w:
+ print(
+ f"VLM aspect ratio padding: height > width. Original size: {w}x{h}. New size: {new_w}x{h}. Applied extra padding to width to achieve aspect ratio <= {max_aspect}."
+ )
+ new_w, new_h = new_w, h
+ canvas = Image.new("RGB", (new_w, new_h), color=(255, 255, 255))
+ scale = min(new_w / float(w), new_h / float(h))
+ if scale < 1.0:
+ rw = max(1, int(round(w * scale)))
+ rh = max(1, int(round(h * scale)))
+ paste_img = img.resize((rw, rh), Image.Resampling.LANCZOS)
+ else:
+ paste_img = img
+ rw, rh = w, h
+ ox = max((new_w - rw) // 2, 0)
+ oy = max((new_h - rh) // 2, 0)
+ canvas.paste(paste_img, (ox, oy))
+ return canvas
+ except Exception:
+ return image.convert("RGB") if image.mode != "RGB" else image
+
+
+def _prepare_hybrid_line_crop_for_vlm(image: Image.Image) -> Image.Image:
+ """
+ Resize/DPI-budget and aspect-pad a line crop for hybrid local VLM or hybrid inference-server.
+
+ Matches the image pipeline in ``_vlm_ocr_predict`` immediately before ``extract_text_from_image_vlm``.
+ Caller should supply an RGB image (e.g. after line crop).
+ """
+ image = _prepare_image_for_vlm(image, hybrid_vlm=True)
+ try:
+ image = _pad_image_for_vlm_aspect_ratio(image)
+ except Exception:
+ pass
+ return image
+
+
+def _call_inference_server_vlm_api(
+ image: Image.Image,
+ prompt: str,
+ api_url: str = None,
+ model_name: str = None,
+ max_new_tokens: int = None,
+ temperature: float = None,
+ top_p: float = None,
+ top_k: int = None,
+ repetition_penalty: float = None,
+ timeout: int = None,
+ stream: bool = VLM_DEFAULT_STREAM,
+ seed: int = None,
+ do_sample: bool = None,
+ min_p: float = None,
+ presence_penalty: float = None,
+ use_llama_swap: bool = USE_LLAMA_SWAP,
+ disable_thinking: bool = INFERENCE_SERVER_DISABLE_THINKING,
+ apply_aspect_ratio_padding: bool = True,
+) -> Tuple[str, int, int, int, int]:
+ """
+ Calls a inference-server API endpoint with an image and text prompt.
+
+ This function converts a PIL Image to base64 and sends it to the inference-server
+ API endpoint using the OpenAI-compatible chat completions format.
+
+ Args:
+ image: PIL Image to process
+ prompt: Text prompt for the VLM
+ api_url: Base URL of the inference-server API (defaults to INFERENCE_SERVER_API_URL from config)
+ model_name: Optional model name to use (defaults to INFERENCE_SERVER_MODEL_NAME from config)
+ max_new_tokens: Maximum number of tokens to generate
+ temperature: Sampling temperature
+ top_p: Nucleus sampling parameter
+ top_k: Top-k sampling parameter
+ repetition_penalty: Penalty for token repetition
+ timeout: Request timeout in seconds (defaults to INFERENCE_SERVER_TIMEOUT from config)
+ stream: Whether to stream the response
+ seed: Random seed for generation
+ do_sample: If True, use sampling (do_sample=True).
+ If False, use greedy decoding (do_sample=False).
+ min_p: Minimum probability threshold for token sampling.
+ presence_penalty: Penalty for token presence.
+ use_llama_swap: Whether to use llama-swap for the model (defaults to USE_LLAMA_SWAP from config).
+ If True and model_name is provided, the model name will be included in the payload.
+ disable_thinking: When True, adds chat_template_kwargs={"enable_thinking": False} to the
+            request payload. This is the vLLM-native equivalent of the thinking-disable
+            handling in the local transformers path (VLM_DISABLE_QWEN3_5_THINKING). Defaults to
+ INFERENCE_SERVER_DISABLE_THINKING from config.
+ apply_aspect_ratio_padding: When False, send the image unchanged (caller already ran
+ ``_pad_image_for_vlm_aspect_ratio``). Full-page callers keep the default True.
+ Returns:
+ Tuple of response text, input tokens, output tokens, and image width/height
+ in pixels (as encoded for the API, including padding when applied).
+ On success, also prints prompt/completion token counts and output tok/s to stdout.
+
+ Raises:
+ ConnectionError: If the API request fails
+ ValueError: If the response format is invalid
+ """
+ if api_url is None:
+ api_url = INFERENCE_SERVER_API_URL
+ if model_name is None:
+ model_name = (
+ INFERENCE_SERVER_MODEL_NAME if INFERENCE_SERVER_MODEL_NAME else None
+ )
+ if timeout is None:
+ timeout = INFERENCE_SERVER_TIMEOUT
+
+ # Pad image so aspect ratio <= VLM_MAX_ASPECT_RATIO; hybrid crops can be very long/thin
+ if apply_aspect_ratio_padding:
+ try:
+ image = _pad_image_for_vlm_aspect_ratio(image)
+ except Exception as e:
+ print(
+ f"Warning: could not pad image for inference-server VLM aspect ratio: {e}"
+ )
+
+ # Convert PIL Image to base64
+ buffer = io.BytesIO()
+ image.save(buffer, format="PNG")
+ image_bytes = buffer.getvalue()
+ image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+ # Prepare the request payload in OpenAI-compatible format
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+ },
+ {"type": "text", "text": prompt},
+ ],
+ }
+ ]
+
+ payload = {
+ "messages": messages,
+ "stream": stream,
+ }
+
+ # Add model name if specified and use llama-swap
+ if model_name and use_llama_swap:
+ payload["model"] = model_name
+
+ # Disable thinking for Qwen3/Qwen3.5 models served by vLLM. vLLM applies the chat template
+    # server-side and honours enable_thinking=False via chat_template_kwargs, the server-side
+    # equivalent of the thinking-disable handling in the local transformers path.
+ if disable_thinking:
+ payload["chat_template_kwargs"] = {"enable_thinking": False}
+
+ if do_sample is not None:
+ payload["do_sample"] = do_sample
+ if temperature is not None:
+ payload["temperature"] = temperature
+ if top_p is not None:
+ payload["top_p"] = top_p
+ if min_p is not None:
+ payload["min_p"] = min_p
+ if top_k is not None:
+ payload["top_k"] = top_k
+ if repetition_penalty is not None:
+ payload["repeat_penalty"] = repetition_penalty
+ if presence_penalty is not None:
+ payload["presence_penalty"] = presence_penalty
+ if max_new_tokens is not None:
+ payload["max_tokens"] = max_new_tokens
+ if seed is not None:
+ payload["seed"] = seed
+
+    # Handle deterministic (greedy) generation
+    if do_sample is False:
+        # Greedy decoding (deterministic): always pick the highest-probability token.
+        # This emulates transformers' do_sample=False behavior.
+        payload["temperature"] = 0  # Temperature=0 makes it deterministic
+        payload["top_k"] = 1  # Only consider the top token (greedy)
+        payload["top_p"] = 1.0  # Consider all tokens (top_k=1 overrides this anyway)
+        payload["min_p"] = 0.0  # No minimum-probability filtering
+        payload["presence_penalty"] = 0.0  # Neutral (no) presence penalty for OpenAI-style APIs
+        payload["repeat_penalty"] = 1.0  # Neutral (no) repetition penalty
+
+ endpoint = f"{api_url}/v1/chat/completions"
+
+ # Retry logic: try up to 5 times for connection errors
+ max_retries = 5
+ retry_delay = 2 # seconds between retries
+
+ for attempt in range(max_retries):
+ try:
+ if stream:
+ # Handle streaming response
+ response = requests.post(
+ endpoint,
+ json=payload,
+ headers={"Content-Type": "application/json"},
+ stream=True,
+ timeout=timeout,
+ )
+ response.raise_for_status()
+
+ stream_start = time.perf_counter()
+
+ # Some OpenAI-compatible servers stream *cumulative* delta.content (full text
+ # generated so far on each chunk). Printing every chunk reprints completed lines.
+ # Others stream incremental tokens only. Support both.
+ accumulated_response = ""
+ # Track only the current in-progress line (since the last newline) so GUI line
+ # reporting doesn't repeatedly emit the entire response.
+ line_buffer = ""
+ accumulated_response_line = ""
+ output_tokens = 0
+ final_chunk = None
+
+ for line in response.iter_lines():
+ if not line: # Skip empty lines
+ continue
+
+ line = line.decode("utf-8")
+ if line.startswith("data: "):
+ data = line[6:] # Remove 'data: ' prefix
+ if data.strip() == "[DONE]":
+ break
+ try:
+ chunk = json.loads(data)
+ # Store the last chunk in case it contains usage info
+ final_chunk = chunk
+
+ if "choices" in chunk and len(chunk["choices"]) > 0:
+ delta = chunk["choices"][0].get("delta", {})
+ token = delta.get("content", "")
+ if not token:
+ continue
+ if token == accumulated_response:
+ continue
+ if accumulated_response and token.startswith(
+ accumulated_response
+ ):
+ new_part = token[len(accumulated_response) :]
+ if new_part:
+ print(new_part, end="", flush=True)
+ accumulated_response = token
+ else:
+ print(token, end="", flush=True)
+ accumulated_response += token
+ output_tokens += 1
+
+ # Maintain line-only buffer for GUI reporting.
+ try:
+ # Prefer the delta we just received; supports both cumulative and incremental streams.
+ if accumulated_response and token.startswith(
+ accumulated_response
+ ):
+ # This is a cumulative chunk; line_buffer should only get the new part.
+ line_buffer += (
+ new_part if "new_part" in locals() else ""
+ )
+ else:
+ # This is an incremental token (or reset); append the token to current line.
+ line_buffer += token
+ except Exception:
+ # If anything is inconsistent, fall back to using the full accumulated response.
+ line_buffer = accumulated_response
+
+ if "\n" in line_buffer:
+ parts = line_buffer.split("\n")
+ complete_lines = parts[:-1]
+ line_buffer = parts[-1] if parts else ""
+ accumulated_response_line = line_buffer
+ if REPORT_VLM_OUTPUTS_TO_GUI:
+ for _ln in complete_lines:
+ if _ln.strip():
+ try:
+ gr.Info(_ln, duration=2)
+ except Exception:
+ pass
+ else:
+ accumulated_response_line = line_buffer
+ except json.JSONDecodeError:
+ continue
+
+ print() # newline after stream finishes
+
+ text = accumulated_response
+ stream_elapsed_s = time.perf_counter() - stream_start
+
+ # Try to extract token usage from final chunk if available
+ input_tokens = 0
+ if final_chunk and "usage" in final_chunk:
+ usage = final_chunk["usage"]
+ input_tokens = usage.get("prompt_tokens", 0)
+ # Use the actual output tokens from usage if available, otherwise use our count
+ output_tokens_from_usage = usage.get("completion_tokens", 0)
+ if output_tokens_from_usage > 0:
+ output_tokens = output_tokens_from_usage
+ else:
+ # Estimate input tokens based on prompt length and image
+ # Rough approximation: prompt tokens + image tokens (estimate based on image size)
+ prompt_word_count = len(prompt.split())
+ # Estimate image tokens: roughly 1 token per 100 pixels (very rough approximation)
+ image_tokens_estimate = max(
+ 100, (image.size[0] * image.size[1]) // 100
+ )
+ input_tokens = prompt_word_count + image_tokens_estimate
+
+ if stream_elapsed_s > 0 and output_tokens > 0:
+ gen_tok_s = output_tokens / stream_elapsed_s
+ print(
+ f"Inference-server VLM: prompt_tokens={input_tokens}, "
+ f"completion_tokens={output_tokens}, speed={gen_tok_s:.2f} tok/s",
+ flush=True,
+ )
+ else:
+ print(
+ f"Inference-server VLM: prompt_tokens={input_tokens}, "
+ f"completion_tokens={output_tokens}, speed=n/a",
+ flush=True,
+ )
+
+ iw, ih = image.size
+ return text, input_tokens, output_tokens, iw, ih
+
+ else:
+ # Handle non-streaming response
+ req_start = time.perf_counter()
+ response = requests.post(
+ endpoint,
+ json=payload,
+ headers={"Content-Type": "application/json"},
+ timeout=timeout,
+ )
+ response.raise_for_status()
+
+ result = response.json()
+
+ # Ensure the response has the expected format
+ if "choices" not in result or len(result["choices"]) == 0:
+ raise ValueError(
+ "Invalid response format from inference-server: no choices found"
+ )
+
+ choice = result["choices"][0]
+ content = _extract_choice_message_text(choice)
+
+ if not (content and str(content).strip()):
+ raise ValueError(
+ "Invalid response format from inference-server: no content in message"
+ )
+
+ # Extract token usage from response
+ input_tokens = 0
+ output_tokens = 0
+ if "usage" in result:
+ usage = result["usage"]
+ input_tokens = usage.get("prompt_tokens", 0)
+ output_tokens = usage.get("completion_tokens", 0)
+
+ req_elapsed_s = time.perf_counter() - req_start
+ if req_elapsed_s > 0 and output_tokens > 0:
+ gen_tok_s = output_tokens / req_elapsed_s
+ print(
+ f"Inference-server VLM: prompt_tokens={input_tokens}, "
+ f"completion_tokens={output_tokens}, speed={gen_tok_s:.2f} tok/s",
+ flush=True,
+ )
+ else:
+ print(
+ f"Inference-server VLM: prompt_tokens={input_tokens}, "
+ f"completion_tokens={output_tokens}, speed=n/a",
+ flush=True,
+ )
+
+ iw, ih = image.size
+ return content, input_tokens, output_tokens, iw, ih
+
+ except (
+ requests.exceptions.RequestException,
+ requests.exceptions.HTTPError,
+ ) as e:
+ # Retry on connection errors or HTTP errors (like 500 Server Error)
+ if attempt < max_retries - 1:
+ print(
+ f"Inference-server VLM API call failed (attempt {attempt + 1}/{max_retries}): {str(e)}"
+ )
+ print(f"Retrying in {retry_delay} seconds...")
+ time.sleep(retry_delay)
+ retry_delay *= 2 # Exponential backoff
+ continue
+ else:
+ # Final attempt failed, raise the error
+ raise ConnectionError(
+ f"Failed to connect to inference-server at {api_url} after {max_retries} attempts: {str(e)}"
+ )
+ except json.JSONDecodeError as e:
+ # Don't retry on JSON decode errors - these are likely permanent issues
+ raise ValueError(f"Invalid JSON response from inference-server: {str(e)}")
+ except Exception as e:
+ # Don't retry on other exceptions - these are likely permanent issues
+ raise RuntimeError(f"Error calling inference-server API: {str(e)}")
+
+
+def _call_bedrock_vlm_api(
+ image: Image.Image,
+ prompt: str,
+ model_choice: str = None,
+ bedrock_runtime=None,
+ max_new_tokens: int = None,
+ temperature: float = None,
+ top_p: float = None,
+ timeout: int = 60,
+ max_retries: int = 5,
+ retry_delay_seconds: float = 2.0,
+) -> Tuple[str, int, int, int, int]:
+ """
+ Calls AWS Bedrock API with an image and text prompt for vision models.
+
+ Args:
+ image: PIL Image to process
+ prompt: Text prompt for the VLM
+ model_choice: Bedrock model ID (e.g., "anthropic.claude-3-5-sonnet-20241022-v2:0")
+ bedrock_runtime: boto3 Bedrock runtime client
+ max_new_tokens: Maximum number of tokens to generate
+ temperature: Sampling temperature
+ top_p: Nucleus sampling parameter
+ timeout: Request timeout in seconds
+ max_retries: Maximum number of retry attempts on failure (default 5)
+ retry_delay_seconds: Delay in seconds between retries (default 2.0)
+
+ Returns:
+ Response text, input/output tokens, and image width/height in pixels
+ (after aspect-ratio padding sent to the API).
+
+ Raises:
+ ConnectionError: If the API request fails after all retries
+ ValueError: If the response format is invalid
+ """
+ if bedrock_runtime is None:
+ raise ValueError("bedrock_runtime client is required for Bedrock VLM calls")
+ if model_choice is None:
+ raise ValueError("model_choice is required for Bedrock VLM calls")
+
+ # Bedrock Converse API requires image aspect ratio <= 20:1. Pad to VLM_MAX_ASPECT_RATIO first.
+ try:
+ image = _pad_image_for_vlm_aspect_ratio(image)
+ except Exception as aspect_error:
+ print(
+ f"Warning: could not adjust image aspect ratio for Bedrock VLM: {aspect_error}"
+ )
+ # Final safeguard: never send aspect > 10:1 (AWS Converse limit)
+ try:
+ w, h = image.size
+ if w > 0 and h > 0:
+ aspect = max(w / float(h), h / float(w))
+ if aspect > 10.0:
+ image = _pad_image_for_vlm_aspect_ratio(image, max_aspect=10.0)
+ print(
+ f"Bedrock VLM: re-padded image to satisfy aspect ratio (was {aspect:.1f}:1)."
+ )
+ except Exception:
+ pass
+
+    # Encode the (possibly padded) image as PNG bytes to send to Bedrock
+    buffer = io.BytesIO()
+    image.save(buffer, format="PNG")
+    image_bytes = buffer.getvalue()
+
+ # Prepare messages for Bedrock converse API
+ # Bedrock supports images in the content array
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"image": {"format": "png", "source": {"bytes": image_bytes}}},
+ {"text": prompt},
+ ],
+ }
+ ]
+
+ # Build inference config
+ inference_config = {
+ "maxTokens": max_new_tokens if max_new_tokens is not None else 4096,
+ }
+ if temperature is not None:
+ inference_config["temperature"] = temperature
+ if top_p is not None:
+ inference_config["topP"] = top_p
+
+ last_error = None
+ for attempt in range(1, max_retries + 1):
+ try:
+ # Call Bedrock converse API
+ api_response = bedrock_runtime.converse(
+ modelId=model_choice,
+ messages=messages,
+ inferenceConfig=inference_config,
+ )
+
+ # Extract response text
+ output_message = api_response["output"]["message"]
+ if "content" in output_message and len(output_message["content"]) > 0:
+ # Handle reasoning content if present
+ if "reasoningContent" in output_message["content"][0]:
+ # Extract the output text (skip reasoning)
+ if len(output_message["content"]) > 1:
+ text = output_message["content"][1]["text"]
+ else:
+ text = ""
+ else:
+ text = output_message["content"][0]["text"]
+ else:
+ raise ValueError("No content in Bedrock response")
+
+ # Extract token usage from API response
+ input_tokens = 0
+ output_tokens = 0
+ if "usage" in api_response:
+ usage = api_response["usage"]
+ input_tokens = usage.get("inputTokens", 0)
+ output_tokens = usage.get("outputTokens", 0)
+
+ iw, ih = image.size
+ return text, input_tokens, output_tokens, iw, ih
+
+ except Exception as e:
+ last_error = e
+ if attempt < max_retries:
+ print(
+ f"Bedrock API attempt {attempt}/{max_retries} failed: {e}. "
+ f"Retrying in {retry_delay_seconds}s..."
+ )
+ time.sleep(retry_delay_seconds)
+ else:
+ raise ConnectionError(
+ f"Failed to call Bedrock API after {max_retries} attempts: {str(last_error)}"
+ ) from last_error
+
+
+def _call_gemini_vlm_api(
+ image: Image.Image,
+ prompt: str,
+ client=None,
+ config=None,
+ model_choice: str = None,
+ max_new_tokens: int = None,
+ temperature: float = None,
+ timeout: int = 60,
+) -> Tuple[str, int, int]:
+ """
+ Calls Gemini API with an image and text prompt for vision models.
+
+ Args:
+ image: PIL Image to process
+ prompt: Text prompt for the VLM
+ client: Gemini ai.Client instance
+ config: Gemini types.GenerateContentConfig instance
+ model_choice: Gemini model name (e.g., "gemini-1.5-pro")
+ max_new_tokens: Maximum number of tokens to generate
+ temperature: Sampling temperature
+ timeout: Request timeout in seconds
+
+ Returns:
+ Tuple[str, int, int]: The generated text response, input tokens, output tokens
+
+ Raises:
+ ConnectionError: If the API request fails
+ ValueError: If the response format is invalid
+ """
+ if client is None:
+ raise ValueError("Gemini client is required for Gemini VLM calls")
+ if model_choice is None:
+ raise ValueError("model_choice is required for Gemini VLM calls")
+
+ try:
+ image = _pad_image_for_vlm_aspect_ratio(image)
+ except Exception as aspect_error:
+ print(
+ f"Warning: could not adjust image aspect ratio for Gemini VLM: {aspect_error}"
+ )
+
+ # Encode the image as PNG bytes; Gemini accepts the raw bytes via types.Part.from_bytes,
+ # so no base64 encoding is needed here.
+ buffer = io.BytesIO()
+ image.save(buffer, format="PNG")
+ image_bytes = buffer.getvalue()
+
+ # Prepare content for Gemini API
+ # Gemini expects content as a list with parts containing image and text
+ try:
+ # Use the client to generate content with image
+ # Gemini API expects the image as part of the content
+ try:
+ import google.genai.types as types
+ except ImportError:
+ raise ImportError(
+ "Google GenAI types not available. Please install google-genai package."
+ )
+
+ # Create content with image and text
+ # For Gemini, we can pass image bytes directly or use inline_data
+ parts = [
+ types.Part.from_bytes(data=image_bytes, mime_type="image/png"),
+ types.Part.from_text(text=prompt),
+ ]
+
+ # Update config if needed
+ if config is None:
+ config = types.GenerateContentConfig(
+ temperature=temperature if temperature is not None else 0.7,
+ max_output_tokens=(
+ max_new_tokens if max_new_tokens is not None else 4096
+ ),
+ )
+ else:
+ # Update existing config
+ if temperature is not None:
+ config.temperature = temperature
+ if max_new_tokens is not None:
+ config.max_output_tokens = max_new_tokens
+
+ response = client.models.generate_content(
+ model=model_choice, contents=parts, config=config
+ )
+
+ # Extract text from response
+ text = ""
+ if hasattr(response, "text"):
+ text = response.text
+ elif hasattr(response, "candidates") and len(response.candidates) > 0:
+ if hasattr(response.candidates[0], "content"):
+ if hasattr(response.candidates[0].content, "parts"):
+ text_parts = []
+ for part in response.candidates[0].content.parts:
+ if hasattr(part, "text"):
+ text_parts.append(part.text)
+ text = "".join(text_parts)
+
+ if not text:
+ raise ValueError("No text content in Gemini response")
+
+ # Extract token usage from response
+ input_tokens = 0
+ output_tokens = 0
+ try:
+ if hasattr(response, "usage_metadata"):
+ usage = response.usage_metadata
+ if hasattr(usage, "prompt_token_count"):
+ input_tokens = usage.prompt_token_count
+ if hasattr(usage, "candidates_token_count"):
+ output_tokens = usage.candidates_token_count
+ except Exception:
+ pass # Token usage not available, return 0
+
+ return text, input_tokens, output_tokens
+
+ except Exception as e:
+ raise ConnectionError(f"Failed to call Gemini API: {str(e)}")
+
+
+def _call_azure_openai_vlm_api(
+ image: Image.Image,
+ prompt: str,
+ client=None,
+ model_choice: str = None,
+ max_new_tokens: int = None,
+ temperature: float = None,
+ timeout: int = 60,
+) -> Tuple[str, int, int]:
+ """
+ Calls Azure/OpenAI API with an image and text prompt for vision models.
+
+ Args:
+ image: PIL Image to process
+ prompt: Text prompt for the VLM
+ client: OpenAI client instance
+ model_choice: Model name (e.g., "gpt-4o", "gpt-4-vision-preview")
+ max_new_tokens: Maximum number of tokens to generate
+ temperature: Sampling temperature
+ timeout: Request timeout in seconds
+
+ Returns:
+ Tuple[str, int, int]: The generated text response, input tokens, output tokens
+
+ Raises:
+ ConnectionError: If the API request fails
+ ValueError: If the response format is invalid
+ """
+ if client is None:
+ raise ValueError("OpenAI client is required for Azure/OpenAI VLM calls")
+ if model_choice is None:
+ raise ValueError("model_choice is required for Azure/OpenAI VLM calls")
+
+ try:
+ image = _pad_image_for_vlm_aspect_ratio(image)
+ except Exception as aspect_error:
+ print(
+ f"Warning: could not adjust image aspect ratio for Azure/OpenAI VLM: {aspect_error}"
+ )
+
+ # Convert PIL Image to base64
+ buffer = io.BytesIO()
+ image.save(buffer, format="PNG")
+ image_bytes = buffer.getvalue()
+ image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+ # Prepare messages in OpenAI format
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+ },
+ {"type": "text", "text": prompt},
+ ],
+ }
+ ]
+
+ try:
+ # Call OpenAI chat completions API
+ response = client.chat.completions.create(
+ model=model_choice,
+ messages=messages,
+ temperature=temperature if temperature is not None else 0.7,
+ max_completion_tokens=(
+ max_new_tokens if max_new_tokens is not None else 4096
+ ),
+ )
+
+ # Extract text from response
+ text = ""
+ if response.choices and len(response.choices) > 0:
+ message = response.choices[0].message
+ if hasattr(message, "content") and message.content:
+ text = message.content
+ else:
+ raise ValueError("No content in OpenAI response")
+ else:
+ raise ValueError("No choices in OpenAI response")
+
+ # Extract token usage from response
+ input_tokens = 0
+ output_tokens = 0
+ try:
+ if hasattr(response, "usage"):
+ usage = response.usage
+ if hasattr(usage, "prompt_tokens"):
+ input_tokens = usage.prompt_tokens
+ if hasattr(usage, "completion_tokens"):
+ output_tokens = usage.completion_tokens
+ except Exception:
+ pass # Token usage not available, return 0
+
+ return text, input_tokens, output_tokens
+
+ except Exception as e:
+ raise ConnectionError(f"Failed to call Azure/OpenAI API: {str(e)}")
+
+
+def _repair_vlm_json_common_quote_issues(s: str) -> str:
+ """
+ Best-effort repair for minor JSON issues seen in VLM outputs.
+
+ Common cases:
+ - Stray quote after numeric conf: {"conf": 0.85"} -> {"conf": 0.85}
+ - Conf glued to extra text: {"conf": 0.85"some text..."} -> {"conf": 0.85}
+ - A second trailing "conf : .6}" fragment appended inside an object
+ """
+ if not s or not isinstance(s, str):
+ return s
+
+ out = s
+
+ # Remove a stray quote (and any non-delimiter junk) immediately after numeric conf/confidence.
+ # Keep the numeric value and let json.loads succeed.
+ out = re.sub(
+ r'("conf(?:idence)?"\s*:\s*)(-?\d+(?:\.\d+)?)(?:"[^,}\]]*)',
+ r"\1\2",
+ out,
+ )
+
+ # Drop malformed trailing fragments like: , conf : .6}"
+ # (unquoted key, often followed by extra braces/quotes).
+ out = re.sub(
+ r",\s*conf\s*:\s*-?(?:\d+(?:\.\d+)?|\.\d+)\s*\}?\s*\"?\s*",
+ "",
+ out,
+ flags=re.IGNORECASE,
+ )
+
+ # If the model accidentally emits single quotes around keys/strings, prefer leaving as-is
+ # (other code paths already rely on strict JSON). Avoid aggressive rewriting here.
+ return out
+
+
+def _best_effort_extract_text_conf_from_messy_jsonish(
+ raw: str,
+) -> Optional[Dict[str, Any]]:
+ """
+ Last-resort extractor for hybrid VLM OCR single-line payloads when JSON is too broken.
+ Pulls the first `text` string and the first numeric `conf/confidence` value.
+ """
+ if not raw or not isinstance(raw, str):
+ return None
+
+ s = raw.strip()
+
+ # Extract text (prefer quoted JSON-like "text": "...")
+ text_match = re.search(
+ r'"text"\s*:\s*"(?P(?:[^"\\\\]|\\\\.)*)"',
+ s,
+ flags=re.IGNORECASE,
+ )
+ if not text_match:
+ text_match = re.search(
+ r"\btext\b\s*[:=]\s*\"(?P(?:[^\"\\\\]|\\\\.)*)\"",
+ s,
+ flags=re.IGNORECASE,
+ )
+ if not text_match:
+ return None
+ text_val = text_match.group("text")
+ try:
+ # Unescape common sequences
+ text_val = bytes(text_val, "utf-8").decode("unicode_escape")
+ except Exception:
+ pass
+ text_val = str(text_val).strip()
+ if not text_val:
+ return None
+
+ # Extract first confidence number after conf/confidence (tolerate unquoted key)
+ conf_match = re.search(
+ r"(?:\"?conf(?:idence)?\"?)\s*:\s*(?P-?(?:\d+(?:\.\d+)?|\.\d+))",
+ s,
+ flags=re.IGNORECASE,
+ )
+ out: Dict[str, Any] = {"text": text_val}
+ if conf_match:
+ out["confidence"] = conf_match.group("conf")
+ return out
+
+
+def _extract_last_text_dict_from_vlm_response(raw: str) -> Optional[Dict[str, Any]]:
+ """
+ Extract the last JSON object that contains a "text" key from a VLM response.
+ Handles reasoning/"thinking" blocks (e.g. <think>...</think>) that may contain multiple
+ candidate dicts; the final answer is assumed to be the last valid dict in the correct format.
+
+ Returns:
+ The parsed dict with "text" (and optionally "confidence"), or None if none found.
+ """
+ if not raw or not isinstance(raw, str):
+ return None
+ last_valid = None
+ i = 0
+ while i < len(raw):
+ start = raw.find("{", i)
+ if start == -1:
+ break
+ depth = 1
+ j = start + 1
+ while j < len(raw) and depth > 0:
+ if raw[j] == "{":
+ depth += 1
+ elif raw[j] == "}":
+ depth -= 1
+ j += 1
+ if depth != 0:
+ i = start + 1
+ continue
+ snippet = raw[start:j]
+ try:
+ snippet = _repair_vlm_json_common_quote_issues(snippet)
+ obj = json.loads(snippet)
+ if isinstance(obj, dict):
+ norm = _normalize_single_line_text_dict(obj)
+ if norm is not None:
+ last_valid = norm
+ except (json.JSONDecodeError, TypeError):
+ fallback = _best_effort_extract_text_conf_from_messy_jsonish(snippet)
+ if fallback is not None:
+ norm = _normalize_single_line_text_dict(fallback)
+ if norm is not None:
+ last_valid = norm
+ i = j
+ return last_valid
+
+
+def _extract_and_combine_text_dicts_from_vlm_response(
+ raw: str,
+) -> Optional[Dict[str, Any]]:
+ """
+ Extract all JSON objects that contain a "text" key from a VLM response, then combine them.
+ If the VLM returns each word in its own dict (e.g. [{"text": "Hello", "confidence": 0.9}, ...]),
+ the text from each entry is joined with spaces and confidence values are averaged.
+
+ Returns:
+ A single dict with "text" (combined) and "confidence" (average), or None if no valid dict found.
+ """
+ if not raw or not isinstance(raw, str):
+ return None
+ collected = []
+ i = 0
+ while i < len(raw):
+ start = raw.find("{", i)
+ if start == -1:
+ break
+ depth = 1
+ j = start + 1
+ while j < len(raw) and depth > 0:
+ if raw[j] == "{":
+ depth += 1
+ elif raw[j] == "}":
+ depth -= 1
+ j += 1
+ if depth != 0:
+ i = start + 1
+ continue
+ snippet = raw[start:j]
+ try:
+ snippet = _repair_vlm_json_common_quote_issues(snippet)
+ obj = json.loads(snippet)
+ if isinstance(obj, dict):
+ norm = _normalize_single_line_text_dict(obj)
+ if norm is not None:
+ collected.append(norm)
+ except (json.JSONDecodeError, TypeError):
+ fallback = _best_effort_extract_text_conf_from_messy_jsonish(snippet)
+ if fallback is not None:
+ norm = _normalize_single_line_text_dict(fallback)
+ if norm is not None:
+ collected.append(norm)
+ i = j
+ if not collected:
+ return None
+ if len(collected) == 1:
+ return collected[0]
+ # Multiple entries: combine text and average confidence
+ texts = []
+ confidences = []
+ for entry in collected:
+ t = entry.get("text")
+ if t is not None and isinstance(t, str) and t.strip():
+ texts.append(t.strip())
+ c = entry.get("confidence", entry.get("conf"))
+ if c is not None:
+ try:
+ confidences.append(float(c))
+ except (TypeError, ValueError):
+ pass
+ combined_text = " ".join(texts)
+ avg_confidence = sum(confidences) / len(confidences) if confidences else 1.0
+ return {"text": combined_text, "confidence": avg_confidence}
+
+
+def _vlm_ocr_predict(
+ image: Image.Image,
+ prompt: str = model_default_prompt,
+) -> Dict[str, Any]:
+ """
+ VLM OCR prediction function that mimics PaddleOCR's interface.
+
+ Args:
+ image: PIL Image to process
+ prompt: Text prompt for the VLM
+
+ Returns:
+ Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
+ """
+ try:
+ # Validate image exists and is not None
+ if image is None:
+ print("VLM OCR error: Image is None")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Validate image has valid size (at least 10x10 pixels)
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"VLM OCR error: Image is too small ({width}x{height} pixels). Minimum size is 10x10."
+ )
+ return {"rec_texts": [], "rec_scores": []}
+ except Exception as size_error:
+ print(f"VLM OCR error: Could not get image size: {size_error}")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Ensure image is in RGB mode (convert if needed)
+ try:
+ if image.mode != "RGB":
+ # print(f"VLM OCR: Converting image from {image.mode} to RGB mode")
+ image = image.convert("RGB")
+ # Update width/height after conversion (should be same, but ensure consistency)
+ width, height = image.size
+ except Exception as convert_error:
+ print(f"VLM OCR error: Could not convert image to RGB: {convert_error}")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Same pipeline as hybrid inference-server line crops
+ try:
+ image = _prepare_hybrid_line_crop_for_vlm(image)
+ width, height = image.size
+ except Exception as prep_error:
+ print(f"VLM OCR error: Could not prepare image for VLM: {prep_error}")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Use the VLM to extract text
+ # Pass None for parameters to prioritize model-specific defaults from run_vlm.py
+ # If model defaults are not available, general defaults will be used (matching current values)
+ extracted_text, _, _ = extract_text_from_image_vlm(
+ text=prompt,
+ image=image,
+ max_new_tokens=None, # Use model default if available, otherwise MAX_NEW_TOKENS from config
+ temperature=None, # Use model default if available, otherwise 0.7
+ top_p=None, # Use model default if available, otherwise 0.9
+ min_p=None, # Use model default if available, otherwise 0.0
+ top_k=None, # Use model default if available, otherwise 50
+ repetition_penalty=None, # Use model default if available, otherwise 1.3
+ presence_penalty=None, # Use model default if available, otherwise None (only supported by Qwen3-VL models)
+ )
+
+ # Check if extracted_text is None or empty
+ if extracted_text is None:
+ # print("VLM OCR warning: extract_text_from_image_vlm returned None")
+ return {"rec_texts": [], "rec_scores": []}
+
+ if not isinstance(extracted_text, str):
+ # print(f"VLM OCR warning: extract_text_from_image_vlm returned unexpected type: {type(extracted_text)}")
+ return {"rec_texts": [], "rec_scores": []}
+
+ if not extracted_text.strip():
+ # print("VLM OCR warning: Extracted text is empty after stripping")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Parse VLM response: expect dictionary format {"text": "...", "confidence": ...}
+ # If VLM returns multiple dicts (e.g. one per word), combine text and average confidence
+ parsed = _extract_and_combine_text_dicts_from_vlm_response(extracted_text)
+ if parsed is None:
+ return {"rec_texts": [], "rec_scores": []}
+
+ text_content = parsed.get("text")
+ confidence = parsed.get("confidence")
+ if text_content is None or not isinstance(text_content, str):
+ return {"rec_texts": [], "rec_scores": []}
+ # Clamp confidence to [0, 1]; default 1.0 if missing or invalid
+ try:
+ score = float(confidence) if confidence is not None else 1.0
+ score = max(0.0, min(1.0, score))
+ except (TypeError, ValueError):
+ score = 1.0
+
+ cleaned_text = re.sub(r"[\r\n]+", " ", text_content).strip()
+ words = cleaned_text.split()
+
+ # Enforce an output length of at most HYBRID_OCR_MAX_WORDS words (truncate if over)
+ if len(words) > HYBRID_OCR_MAX_WORDS:
+ words = words[:HYBRID_OCR_MAX_WORDS]
+
+ result = {
+ "rec_texts": words,
+ "rec_scores": [score] * len(words),
+ }
+ return result
+
+ except Exception:
+ # print(f"VLM OCR error: {e}")
+ # print(f"VLM OCR error traceback: {traceback.format_exc()}")
+ return {"rec_texts": [], "rec_scores": []}
+
+
+@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
+def _process_page_result_with_hybrid_vlm_ocr(
+ page_results: list,
+ image: Image.Image,
+ img_width: int,
+ img_height: int,
+ input_image_width: int,
+ input_image_height: int,
+ confidence_threshold: float,
+ image_name: str,
+ output_folder: str,
+ padding: int = 0,
+):
+ """
+ Processes OCR page results using a hybrid system that combines PaddleOCR for initial recognition
+ and VLM for low-confidence lines. When PaddleOCR's recognition confidence for a detected line is
+ below the specified threshold, the line is re-processed using a higher-quality (but slower) VLM
+ model and the result is used to replace the low-confidence recognition. Results are kept in
+ PaddleOCR's standard output format for downstream compatibility.
+
+ Args:
+ page_results (list): The list of page result dicts from PaddleOCR to process. Each dict should
+ contain keys like 'rec_texts', 'rec_scores', 'rec_polys', and optionally 'image_width',
+ 'image_height', and 'rec_models'.
+ image (PIL.Image.Image): The PIL Image object of the full page to allow line cropping.
+ img_width (int): The width of the (possibly preprocessed) image in pixels.
+ img_height (int): The height of the (possibly preprocessed) image in pixels.
+ input_image_width (int): The original image width (before any resizing/preprocessing).
+ input_image_height (int): The original image height (before any resizing/preprocessing).
+ confidence_threshold (float): Lines recognized by PaddleOCR with confidence lower than this
+ threshold will be replaced using the VLM.
+ image_name (str): The name of the source image, used for logging/debugging.
+ output_folder (str): The output folder path for saving example images.
+ padding (int): Padding to add around line crops.
+
+ Returns:
+ Modified page_results with VLM replacements for low-confidence lines.
+ """
+ if len(page_results) > 1:
+ print(
+ f"Hybrid Paddle+VLM: PaddleOCR returned {len(page_results)} result dicts for one image "
+ f"({image_name!r}); applying line-level VLM re-OCR only to the first dict to avoid duplicate VLM calls."
+ )
+ _hybrid_page_iter = page_results[:1] if len(page_results) > 1 else page_results
+
+ def _normalize_paddle_result_lists(rec_texts, rec_scores, rec_polys):
+ """
+ Normalizes PaddleOCR result lists to ensure they all have the same length.
+ Pads missing entries with appropriate defaults:
+ - rec_texts: empty string ""
+ - rec_scores: 0.0 (low confidence)
+ - rec_polys: empty list []
+
+ Args:
+ rec_texts: List of recognized text strings
+ rec_scores: List of confidence scores
+ rec_polys: List of bounding box polygons
+
+ Returns:
+ Tuple of (normalized_rec_texts, normalized_rec_scores, normalized_rec_polys, max_length)
+ """
+ len_texts = len(rec_texts)
+ len_scores = len(rec_scores)
+ len_polys = len(rec_polys)
+ max_length = max(len_texts, len_scores, len_polys)
+
+ # Only normalize if there's a mismatch
+ if max_length > 0 and (
+ len_texts != max_length
+ or len_scores != max_length
+ or len_polys != max_length
+ ):
+ print(
+ f"Warning: List length mismatch detected - rec_texts: {len_texts}, "
+ f"rec_scores: {len_scores}, rec_polys: {len_polys}. "
+ f"Padding to length {max_length}."
+ )
+
+ # Pad rec_texts
+ if len_texts < max_length:
+ rec_texts = list(rec_texts) + [""] * (max_length - len_texts)
+
+ # Pad rec_scores
+ if len_scores < max_length:
+ rec_scores = list(rec_scores) + [0.0] * (max_length - len_scores)
+
+ # Pad rec_polys
+ if len_polys < max_length:
+ rec_polys = list(rec_polys) + [[]] * (max_length - len_polys)
+
+ return rec_texts, rec_scores, rec_polys, max_length
+
+ # Helper function to create safe filename (inlined to avoid needing instance_self)
+ def _create_safe_filename_with_confidence(
+ original_text: str,
+ new_text: str,
+ conf: int,
+ new_conf: int,
+ ocr_type: str = "OCR",
+ ) -> str:
+ """Creates a safe filename using confidence values when text sanitization fails."""
+
+ # Helper to sanitize text similar to _sanitize_filename
+ def _sanitize_text_for_filename(
+ text: str,
+ max_length: int = 20,
+ fallback_prefix: str = "unknown_text",
+ ) -> str:
+ """Sanitizes text for use in filenames."""
+ sanitized = safe_sanitize_text(text)
+ # Remove leading/trailing underscores and spaces
+ sanitized = sanitized.strip("_ ")
+ # If empty after sanitization, use a default value
+ if not sanitized:
+ sanitized = fallback_prefix
+ # Limit to max_length characters
+ if len(sanitized) > max_length:
+ sanitized = sanitized[:max_length]
+ sanitized = sanitized.rstrip("_")
+ # Final check: if still empty or too short, use fallback
+ if not sanitized or len(sanitized) < 3:
+ sanitized = fallback_prefix
+ return sanitized
+
+ # Try to sanitize both texts
+ safe_original = _sanitize_text_for_filename(
+ original_text, max_length=15, fallback_prefix=f"orig_conf_{conf}"
+ )
+ safe_new = _sanitize_text_for_filename(
+ new_text, max_length=15, fallback_prefix=f"new_conf_{new_conf}"
+ )
+
+ # If both sanitizations resulted in fallback names, create a confidence-based name
+ if safe_original.startswith("orig_conf") and safe_new.startswith("new_conf"):
+ return f"{ocr_type}_conf_{conf}_to_conf_{new_conf}"
+
+ return f"{safe_original}_conf_{conf}_to_{safe_new}_conf_{new_conf}"
+
+ # Process each page result in paddle_results (see _hybrid_page_iter when len > 1)
+ for page_result in _hybrid_page_iter:
+ # Extract text recognition results from the paddle format
+ rec_texts = page_result.get("rec_texts", list())
+ rec_scores = page_result.get("rec_scores", list())
+ rec_polys = page_result.get("rec_polys", list())
+
+ # Normalize lists to ensure they all have the same length
+ rec_texts, rec_scores, rec_polys, num_lines = _normalize_paddle_result_lists(
+ rec_texts, rec_scores, rec_polys
+ )
+
+ # Update page_result with normalized lists
+ page_result["rec_texts"] = rec_texts
+ page_result["rec_scores"] = rec_scores
+ page_result["rec_polys"] = rec_polys
+
+ # Initialize rec_models list with "Paddle" as default for all lines
+ if (
+ "rec_models" not in page_result
+ or len(page_result.get("rec_models", [])) != num_lines
+ ):
+ rec_models = ["Paddle"] * num_lines
+ page_result["rec_models"] = rec_models
+ else:
+ rec_models = page_result["rec_models"]
+
+ # Since we're using the exact image PaddleOCR processed, coordinates are directly in image space
+ # No coordinate conversion needed - coordinates match the image dimensions exactly
+
+ # Process each line
+ # print(f"Processing {num_lines} lines from PaddleOCR results...")
+
+ for i in range(num_lines):
+ line_text = rec_texts[i]
+ line_conf = float(rec_scores[i]) * 100 # Convert to percentage
+ bounding_box = rec_polys[i]
+
+ # Skip if bounding box is empty (from padding)
+ # Handle numpy arrays, lists, and None values safely
+ if bounding_box is None:
+ continue
+
+ # Convert to list first to handle numpy arrays safely
+ if hasattr(bounding_box, "tolist"):
+ box = bounding_box.tolist()
+ else:
+ box = bounding_box
+
+ # Check if box is empty (handles both list and numpy array cases)
+ if not box or (isinstance(box, list) and len(box) == 0):
+ continue
+
+ # Skip empty lines
+ if not line_text.strip():
+ continue
+
+ # Convert polygon to bounding box
+ x_coords = [p[0] for p in box]
+ y_coords = [p[1] for p in box]
+ line_left_paddle = float(min(x_coords))
+ line_top_paddle = float(min(y_coords))
+ line_right_paddle = float(max(x_coords))
+ line_bottom_paddle = float(max(y_coords))
+ line_width_paddle = line_right_paddle - line_left_paddle
+ line_height_paddle = line_bottom_paddle - line_top_paddle
+
+ # Since we're using the exact image PaddleOCR processed, coordinates are already in image space
+ # No conversion needed - use coordinates directly
+ line_left = line_left_paddle
+ line_top = line_top_paddle
+ line_width = line_width_paddle
+ line_height = line_height_paddle
+
+ # The recognition model for this line defaults to "Paddle" (set in rec_models above)
+
+ # Count words in PaddleOCR output
+ paddle_words = line_text.split()
+ paddle_word_count = len(paddle_words)
+
+ # If confidence is low, use VLM for a second opinion
+ if line_conf <= confidence_threshold:
+
+ # Ensure minimum line height for VLM processing
+ # If line_height is too small, use a minimum height based on typical text line height
+ min_line_height = max(
+ line_height, 20
+ ) # Minimum 20 pixels for text line
+
+ # Calculate crop coordinates with padding
+ # Convert floats to integers and apply padding, clamping to image bounds
+ crop_left = max(0, int(round(line_left - padding)))
+ crop_top = max(0, int(round(line_top - padding)))
+ crop_right = min(
+ img_width, int(round(line_left + line_width + padding))
+ )
+ crop_bottom = min(
+ img_height, int(round(line_top + min_line_height + padding))
+ )
+
+ # Ensure crop dimensions are valid
+ if crop_right <= crop_left or crop_bottom <= crop_top:
+ # Invalid crop, keep original PaddleOCR result
+ continue
+
+ # Crop the line image
+ cropped_image = image.crop(
+ (crop_left, crop_top, crop_right, crop_bottom)
+ )
+
+ # Check if cropped image is too small for VLM processing
+ crop_width = crop_right - crop_left
+ crop_height = crop_bottom - crop_top
+ if crop_width < 10 or crop_height < 10:
+ continue
+
+ # Ensure cropped image is in RGB mode before passing to VLM
+ if cropped_image.mode != "RGB":
+ cropped_image = cropped_image.convert("RGB")
+
+ # Save input image for debugging if environment variable is set
+ if SAVE_VLM_INPUT_IMAGES:
+ try:
+ vlm_debug_dir = os.path.join(
+ output_folder,
+ "hybrid_paddle_vlm_visualisations/hybrid_analysis_input_images",
+ )
+ os.makedirs(vlm_debug_dir, exist_ok=True)
+ line_text_safe = safe_sanitize_text(line_text)
+ line_text_shortened = line_text_safe[:20]
+ image_name_safe = safe_sanitize_text(image_name)
+ image_name_shortened = image_name_safe[:20]
+ filename = f"{image_name_shortened}_{line_text_shortened}_hybrid_analysis_input_image.png"
+ filepath = os.path.join(vlm_debug_dir, filename)
+ _save_image_with_config_dpi(cropped_image, filepath)
+ # print(f"Saved VLM input image to: {filepath}")
+ except Exception as save_error:
+ print(f"Warning: Could not save VLM input image: {save_error}")
+
+ # Use VLM for OCR on this line with error handling
+ vlm_result = None
+ vlm_rec_texts = []
+ vlm_rec_scores = []
+
+ try:
+ vlm_result = _vlm_ocr_predict(cropped_image)
+ vlm_rec_texts = (
+ vlm_result.get("rec_texts", []) if vlm_result else []
+ )
+ vlm_rec_scores = (
+ vlm_result.get("rec_scores", []) if vlm_result else []
+ )
+ except Exception:
+ # Ensure we keep original PaddleOCR result on error
+ vlm_rec_texts = []
+ vlm_rec_scores = []
+
+ if vlm_rec_texts and vlm_rec_scores:
+ # Combine VLM words into a single text string
+ vlm_text = " ".join(vlm_rec_texts)
+ vlm_word_count = len(vlm_rec_texts)
+ vlm_conf = float(
+ np.median(vlm_rec_scores)
+ ) # Keep as 0-1 range for paddle format
+
+ # Only replace if the VLM and PaddleOCR word counts are within the allowed difference
+ word_count_allowed_difference = 4
+ if (
+ abs(vlm_word_count - paddle_word_count)
+ <= word_count_allowed_difference
+ ):
+ text_output = f" Re-OCR'd line: '{line_text}' (conf: {line_conf:.1f}, words: {paddle_word_count}) "
+ text_output += f"-> '{vlm_text}' (conf: {vlm_conf*100:.1f}, words: {vlm_word_count}) [VLM]"
+ print(text_output)
+
+ if REPORT_VLM_OUTPUTS_TO_GUI:
+ try:
+ gr.Info(text_output, duration=2)
+ except Exception:
+ # gr.Info may not be available in worker process, ignore
+ pass
+
+ # For exporting example image comparisons
+ safe_filename = _create_safe_filename_with_confidence(
+ line_text,
+ vlm_text,
+ int(line_conf),
+ int(vlm_conf * 100),
+ "VLM",
+ )
+
+ if SAVE_EXAMPLE_HYBRID_IMAGES:
+ # Normalize and validate image_name to prevent path traversal attacks
+ normalized_image_name = os.path.normpath(
+ image_name + "_hybrid_paddle_vlm"
+ )
+ if (
+ ".." in normalized_image_name
+ or "/" in normalized_image_name
+ or "\\" in normalized_image_name
+ ):
+ normalized_image_name = "safe_image"
+
+ hybrid_ocr_examples_folder = (
+ output_folder
+ + f"/hybrid_ocr_examples/{normalized_image_name}"
+ )
+ # Validate the constructed path is safe
+ if not validate_folder_containment(
+ hybrid_ocr_examples_folder, OUTPUT_FOLDER
+ ):
+ raise ValueError(
+ f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
+ )
+
+ if not os.path.exists(hybrid_ocr_examples_folder):
+ os.makedirs(hybrid_ocr_examples_folder)
+ output_image_path = (
+ hybrid_ocr_examples_folder + f"/{safe_filename}.png"
+ )
+ # print(f"Saving example image to {output_image_path}")
+ _save_image_with_config_dpi(
+ cropped_image, output_image_path
+ )
+
+ # Replace with VLM result in paddle_results format
+ # Update rec_texts, rec_scores, and rec_models for this line
+ rec_texts[i] = vlm_text
+ rec_scores[i] = vlm_conf
+ rec_models[i] = "VLM"
+ # Ensure page_result is updated with the modified rec_models list
+ page_result["rec_models"] = rec_models
+ else:
+ print(
+ f" Line: '{line_text}' (conf: {line_conf:.1f}, words: {paddle_word_count}) -> "
+ f"VLM result '{vlm_text}' (conf: {vlm_conf*100:.1f}, words: {vlm_word_count}) "
+ f"word count mismatch. Keeping PaddleOCR result."
+ )
+ else:
+ # VLM returned empty or no results - keep original PaddleOCR result
+ pass
+
+ # Debug: print a summary of model labels before returning
+ for page_idx, page_result in enumerate(page_results):
+ rec_models = page_result.get("rec_models", [])
+ vlm_line_count = sum(1 for m in rec_models if m == "VLM")
+ paddle_line_count = sum(1 for m in rec_models if m == "Paddle")
+ print(
+ f"Hybrid Paddle+VLM summary (page result {page_idx}): "
+ f"{vlm_line_count} line(s) replaced by VLM, {paddle_line_count} line(s) kept from PaddleOCR."
+ )
+
+ return page_results
+
+
+def _convert_single_line_to_word_level_standalone(
+ line_text: str,
+ line_left: int,
+ line_top: int,
+ line_width: int,
+ line_height: int,
+ line_conf: float,
+ image: Image.Image,
+ image_width: int,
+ image_height: int,
+ output_folder: str,
+ image_name: str = None,
+ line_model: str = "Bedrock VLM",
+) -> Dict[str, List]:
+ """
+ Converts a single line (text + line bbox) to word-level bounding boxes using
+ AdaptiveSegmenter. Used by the hybrid Textract+Bedrock path when word count
+ differs by <= MAX_WORD_COUNT_DIFF_FOR_LINE_DERIVED_WORDS so we keep Bedrock
+ text but derive word boxes from the line.
+
+ Returns dict with keys "text", "left", "top", "width", "height", "conf", "model"
+ (all lists, coordinates in full image space).
+ """
+ output = {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ }
+ if not (line_text or "").strip():
+ return output
+ if image is None or output_folder is None:
+ return output
+
+ if hasattr(image, "size"):
+ image_np = np.array(image)
+ if len(image_np.shape) == 3:
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+ elif len(image_np.shape) == 2:
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR)
+ else:
+ image_np = image.copy()
+ if len(image_np.shape) == 2:
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR)
+
+ actual_height, actual_width = image_np.shape[:2]
+ if actual_width != image_width or actual_height != image_height:
+ image_width = actual_width
+ image_height = actual_height
+
+ line_left = int(max(0, min(line_left, image_width - 1)))
+ line_top = int(max(0, min(line_top, image_height - 1)))
+ line_width = int(max(1, min(line_width, image_width - line_left)))
+ line_height = int(max(1, min(line_height, image_height - line_top)))
+ if line_left >= image_width or line_top >= image_height:
+ return output
+ if line_left + line_width > image_width:
+ line_width = image_width - line_left
+ if line_top + line_height > image_height:
+ line_height = image_height - line_top
+ if line_width <= 0 or line_height <= 0:
+ return output
+
+ try:
+ line_image = image_np[
+ line_top : line_top + line_height,
+ line_left : line_left + line_width,
+ ]
+ except IndexError:
+ return output
+ if line_image is None or line_image.size == 0 or len(line_image.shape) < 2:
+ return output
+
+ conf_val = line_conf if isinstance(line_conf, (int, float)) else 100
+ try:
+ conf_val = max(0, min(100, float(conf_val)))
+ except (TypeError, ValueError):
+ conf_val = 100
+
+ single_line_data = {
+ "text": [line_text],
+ "left": [0],
+ "top": [0],
+ "width": [line_width],
+ "height": [line_height],
+ "conf": [conf_val],
+ "line": [0],
+ }
+ segmenter = AdaptiveSegmenter(output_folder=output_folder)
+ try:
+ word_output, _ = segmenter.segment(
+ single_line_data, line_image, image_name=image_name
+ )
+ except Exception:
+ word_output = None
+
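+ # Fallback when AdaptiveSegmenter yields nothing: estimate word boxes from character counts.
+ # Width model used below: each character is assumed to be char_space_ratio (2x) as wide as a
+ # space, so line_width ~= space_width * (num_chars * 2 + num_spaces); each word's width is then
+ # proportional to its character count. These boxes are rough estimates, not measured positions.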
+ if not word_output or not word_output.get("text"):
+ words = line_text.split()
+ if words:
+ num_chars = len("".join(words))
+ num_spaces = len(words) - 1
+ char_space_ratio = 2.0
+ denom = (num_chars * char_space_ratio + num_spaces) or 1
+ estimated_space_width = line_width / denom
+ avg_char_width = estimated_space_width * char_space_ratio
+ current_left = 0
+ for word in words:
+ word_width = len(word) * avg_char_width
+ clamped_left = max(0, min(current_left, line_width))
+ clamped_width = max(0, min(word_width, line_width - clamped_left))
+ output["text"].append(word)
+ output["left"].append(line_left + clamped_left)
+ output["top"].append(line_top)
+ output["width"].append(clamped_width)
+ output["height"].append(line_height)
+ output["conf"].append(conf_val)
+ output["model"].append(line_model)
+ current_left += word_width + estimated_space_width
+ return output
+
+ for j in range(len(word_output["text"])):
+ output["text"].append(word_output["text"][j])
+ output["left"].append(line_left + word_output["left"][j])
+ output["top"].append(line_top + word_output["top"][j])
+ output["width"].append(word_output["width"][j])
+ output["height"].append(word_output["height"][j])
+ output["conf"].append(
+ word_output["conf"][j]
+ if j < len(word_output.get("conf") or [])
+ else conf_val
+ )
+ output["model"].append(line_model)
+ return output
+
+
+def _process_textract_page_with_hybrid_bedrock_vlm(
+ page_line_level_ocr_results: Dict[str, Any],
+ page_line_level_ocr_results_with_words: Dict[str, Any],
+ image: Image.Image,
+ img_width: int,
+ img_height: int,
+ confidence_threshold: float,
+ padding: int,
+ bedrock_runtime: Any,
+ model_choice: str,
+ output_folder: str,
+ image_name: str,
+) -> Tuple[Dict[str, Any], int, int, str]:
+ """
+ For a single page's Textract results, re-run Bedrock VLM on lines whose
+ line-level confidence is below the threshold. Uses the actual line-level
+ page OCR object (page_line_level_ocr_results) for confidence and bbox;
+ the OCR results-with-words object is updated only at the end, when corrected
+ text/confidence/words are mapped back for successfully re-OCR'd lines.
+
+ Returns:
+ Tuple of (page_line_level_ocr_results_with_words, vlm_input_tokens,
+ vlm_output_tokens, vlm_model_name) for usage logging.
+ """
+ _empty_return = (page_line_level_ocr_results_with_words, 0, 0, model_choice or "")
+ if image is None or not page_line_level_ocr_results_with_words:
+ print("Image is None or no page line level OCR results with words found")
+ return _empty_return
+ results = page_line_level_ocr_results_with_words.get("results") or {}
+ if not results:
+ print("No results found")
+ return _empty_return
+ if bedrock_runtime is None or not model_choice:
+ print("Bedrock runtime is None or model choice is not set")
+ print(f"Bedrock runtime: {bedrock_runtime}")
+ print(f"Model choice: {model_choice}")
+ return _empty_return
+ line_level_results = page_line_level_ocr_results.get("results") or []
+ if not line_level_results:
+ return _empty_return
+
+ # Build line-level items from the actual line-level OCR (OCRResult list)
+ # Match by result.line -> key "text_line_{line}" in the with_words dict
+ line_level_items = []
+ for result in line_level_results:
+ line_num = (
+ getattr(result, "line", None)
+ if hasattr(result, "line")
+ else result.get("line") if isinstance(result, dict) else None
+ )
+ if line_num is None or line_num < 1:
+ continue
+ key = f"text_line_{line_num}"
+ if key not in results:
+ continue
+ if isinstance(result, dict):
+ conf = result.get("conf", result.get("confidence"))
+ else:
+ conf = getattr(result, "conf", None)
+ if conf is None:
+ conf = 0
+ try:
+ line_conf = float(conf)
+ except (TypeError, ValueError):
+ line_conf = 0
+ if isinstance(result, dict):
+ left = result.get("left", 0)
+ top = result.get("top", 0)
+ w = result.get("width", 0)
+ h = result.get("height", 0)
+ else:
+ left = getattr(result, "left", 0)
+ top = getattr(result, "top", 0)
+ w = getattr(result, "width", 0)
+ h = getattr(result, "height", 0)
+ bbox = (left, top, left + w, top + h)
+ line_level_items.append((key, line_conf, bbox))
+
+ # Ensure RGB for Bedrock
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+
+ # Optional: folder and log file for saving example images and prompt/response (when config set)
+ save_examples = SAVE_TEXTRACT_BEDROCK_HYBRID_EXAMPLES
+ hybrid_examples_folder = None
+ inference_log_path = None
+ if save_examples and output_folder and image_name:
+ normalized_image_name = os.path.normpath(
+ image_name + "_textract_bedrock_hybrid"
+ )
+ if (
+ ".." in normalized_image_name
+ or "/" in normalized_image_name
+ or "\\" in normalized_image_name
+ ):
+ normalized_image_name = "safe_image"
+ hybrid_examples_folder = os.path.join(
+ output_folder, "textract_bedrock_hybrid_examples", normalized_image_name
+ )
+ if validate_folder_containment(hybrid_examples_folder, OUTPUT_FOLDER):
+ if not os.path.exists(hybrid_examples_folder):
+ os.makedirs(hybrid_examples_folder)
+ page_no = page_line_level_ocr_results_with_words.get("page", "?")
+ inference_log_path = os.path.join(
+ hybrid_examples_folder, f"page_{page_no}_inference_log.jsonl"
+ )
+ else:
+ save_examples = False
+
+ # Build list of (key, line_conf, bbox, cropped) for lines below threshold
+ tasks = []
+ for key, line_conf, bbox in line_level_items:
+ if line_conf > confidence_threshold:
+ continue
+ left, top, right, bottom = bbox
+ crop_left = max(0, int(left) - padding)
+ crop_top = max(0, int(top) - padding)
+ crop_right = min(img_width, int(right) + padding)
+ crop_bottom = min(img_height, int(bottom) + padding)
+ if crop_right <= crop_left or crop_bottom <= crop_top:
+ continue
+ cropped = image.crop((crop_left, crop_top, crop_right, crop_bottom))
+ if cropped.size[0] < 10 or cropped.size[1] < 10:
+ continue
+ tasks.append((key, line_conf, bbox, cropped))
+
+ def _run_one_line_vlm(
+ task: Tuple[str, float, Tuple, Image.Image],
+ ) -> Dict[str, Any]:
+ """Run Bedrock VLM on one line crop. Returns dict with key, line_conf, bbox, cropped, and either vlm_result or error."""
+ key, line_conf, bbox, cropped = task
+ prompt_used = model_default_prompt
+ try:
+ vlm_result = _bedrock_vlm_ocr_predict(
+ cropped,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ return_prompt_and_response=save_examples,
+ )
+ if save_examples:
+ prompt_used = vlm_result.get("prompt", prompt_used)
+ return {
+ "key": key,
+ "line_conf": line_conf,
+ "bbox": bbox,
+ "cropped": cropped,
+ "vlm_result": vlm_result,
+ "prompt_used": prompt_used,
+ "raw_response": (
+ vlm_result.get("raw_response") if save_examples else None
+ ),
+ "error": None,
+ }
+ except Exception as e:
+ return {
+ "key": key,
+ "line_conf": line_conf,
+ "bbox": bbox,
+ "cropped": cropped,
+ "vlm_result": None,
+ "prompt_used": prompt_used,
+ "raw_response": None,
+ "error": str(e),
+ }
+
+ # Run VLM inference in parallel for all low-confidence lines
+ hybrid_vlm_input_tokens = 0
+ hybrid_vlm_output_tokens = 0
+ updates = []
+ vlm_results_list = []
+ if tasks:
+ max_workers_hybrid = min(MAX_WORKERS, len(tasks))
+ with ThreadPoolExecutor(max_workers=max_workers_hybrid) as executor:
+ vlm_results_list = list(executor.map(_run_one_line_vlm, tasks))
+
+ # Process each VLM result (post-processing and optional logging on main thread)
+ for res in vlm_results_list:
+ key = res["key"]
+ line_conf = res["line_conf"]
+ bbox = res["bbox"]
+ cropped = res["cropped"]
+ prompt_used = res["prompt_used"]
+ raw_response = res["raw_response"]
+ if res["error"] is not None:
+ print(f"Hybrid Textract-Bedrock VLM failed for line {key}: {res['error']}")
+ if save_examples and hybrid_examples_folder and inference_log_path:
+ try:
+ safe_name = (
+ f"{safe_sanitize_text(key)}_conf_{int(line_conf)}_error.png"
+ )
+ crop_path = os.path.join(hybrid_examples_folder, safe_name)
+ _save_image_with_config_dpi(cropped, crop_path)
+ log_entry = {
+ "key": key,
+ "line_conf": line_conf,
+ "prompt": prompt_used,
+ "error": res["error"],
+ "raw_response": None,
+ }
+ with open(inference_log_path, "a", encoding="utf-8") as log_f:
+ log_f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
+ except Exception as save_err:
+ print(f"Could not save hybrid example for {key}: {save_err}")
+ continue
+
+ vlm_result = res["vlm_result"]
+ hybrid_vlm_input_tokens += vlm_result.get("vlm_input_tokens", 0)
+ hybrid_vlm_output_tokens += vlm_result.get("vlm_output_tokens", 0)
+ rec_texts = vlm_result.get("rec_texts", [])
+ rec_scores = vlm_result.get("rec_scores", [])
+ if not rec_texts or not rec_scores:
+ if save_examples and hybrid_examples_folder and inference_log_path:
+ try:
+ safe_name = f"{safe_sanitize_text(key)}_conf_{int(line_conf)}.png"
+ crop_path = os.path.join(hybrid_examples_folder, safe_name)
+ _save_image_with_config_dpi(cropped, crop_path)
+ log_entry = {
+ "key": key,
+ "line_conf": line_conf,
+ "prompt": prompt_used,
+ "raw_response": raw_response,
+ "error": None,
+ "parsed_rec_texts": [],
+ }
+ with open(inference_log_path, "a", encoding="utf-8") as log_f:
+ log_f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
+ except Exception as save_err:
+ print(f"Could not save hybrid example for {key}: {save_err}")
+ continue
+
+ if save_examples and hybrid_examples_folder and inference_log_path:
+ try:
+ safe_name = f"{safe_sanitize_text(key)}_conf_{int(line_conf)}.png"
+ crop_path = os.path.join(hybrid_examples_folder, safe_name)
+ _save_image_with_config_dpi(cropped, crop_path)
+ log_entry = {
+ "key": key,
+ "line_conf": line_conf,
+ "prompt": prompt_used,
+ "raw_response": raw_response,
+ "error": None,
+ "parsed_rec_texts": rec_texts,
+ "parsed_rec_scores": rec_scores,
+ }
+ with open(inference_log_path, "a", encoding="utf-8") as log_f:
+ log_f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
+ except Exception as save_err:
+ print(f"Could not save hybrid example for {key}: {save_err}")
+
+ # Textract may have split punctuation into separate words (SPLIT_PUNCTUATION_FROM_WORDS),
+ # while the VLM returns punctuation attached (e.g. "ACTION."). Expand VLM words to match
+ # original word count by splitting trailing punctuation when the next original word is punct-only.
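+ # Illustrative example (hypothetical): Textract words ["URGENT", "ACTION", "."] vs VLM
+ # rec_texts ["URGENT", "ACTION."] -> expanded to ["URGENT", "ACTION", "."], so the word
+ # counts line up and the original per-word bounding boxes can be reused.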
+ line_data_for_words = results.get(key) or {}
+ original_words_list = line_data_for_words.get("words") or []
+ original_word_count = len(original_words_list)
+ expanded_texts = []
+ expanded_scores = []
+ j = 0
+ i = 0
+ while i < original_word_count and j < len(rec_texts):
+ word = rec_texts[j]
+ score = rec_scores[j] if j < len(rec_scores) else rec_scores[-1]
+ next_orig = (
+ original_words_list[i + 1] if i + 1 < original_word_count else None
+ )
+ next_orig_text = (
+ (next_orig.get("text", "") or "") if isinstance(next_orig, dict) else ""
+ )
+ next_is_punct_only = bool(next_orig_text) and not re.search(
+ r"[\w]", next_orig_text
+ )
+ word_has_trailing_punct = bool(word) and bool(re.search(r"[^\w\s]$", word))
+ if next_is_punct_only and word_has_trailing_punct and len(word) > 1:
+ match = re.search(r"^(.*?)([^\w\s]+)$", word)
+ if match:
+ main, trail = match.group(1), match.group(2)
+ expanded_texts.append(main)
+ expanded_scores.append(score)
+ expanded_texts.append(trail)
+ expanded_scores.append(score)
+ i += 2
+ j += 1
+ else:
+ expanded_texts.append(word)
+ expanded_scores.append(score)
+ i += 1
+ j += 1
+ else:
+ expanded_texts.append(word)
+ expanded_scores.append(score)
+ i += 1
+ j += 1
+ same_word_count = len(expanded_texts) == original_word_count and j == len(
+ rec_texts
+ )
+ if same_word_count:
+ rec_texts = expanded_texts
+ rec_scores = expanded_scores
+
+ new_text = " ".join(rec_texts)
+ # rec_scores from Bedrock are 0-1; store as 0-100 to match Textract
+ # Use original word-level bounding boxes so replacement is shown per word, not per line
+ new_words = []
+ for i, txt in enumerate(rec_texts):
+ score = rec_scores[i] if i < len(rec_scores) else rec_scores[-1]
+ try:
+ sc = float(score)
+ if 0 <= sc <= 1:
+ sc = sc * 100
+ sc = max(0, min(100, sc))
+ except (TypeError, ValueError):
+ sc = 100
+ # Retain original word bounding box when available (word-level replacement boxes)
+ word_bbox = bbox
+ if i < len(original_words_list):
+ orig_word = original_words_list[i]
+ if isinstance(orig_word, dict):
+ orig_bbox = orig_word.get("bounding_box")
+ if isinstance(orig_bbox, (list, tuple)) and len(orig_bbox) == 4:
+ word_bbox = orig_bbox
+ new_words.append(
+ {
+ "text": txt,
+ "confidence": round(sc, 0),
+ "bounding_box": word_bbox,
+ "model": "Bedrock VLM",
+ }
+ )
+ avg_conf = (sum(rec_scores) / len(rec_scores)) * 100 if rec_scores else 100
+ avg_conf = max(0, min(100, avg_conf))
+ # Accept VLM result if: (1) word count matches and conf > 50, or
+ # (2) word count differs by <= MAX_WORD_COUNT_DIFF and conf > 50 — then use
+ # Bedrock text + Textract line bbox and derive word-level boxes via line-to-word.
+ vlm_conf_above_50 = avg_conf > 50
+ word_count_diff = abs(original_word_count - len(rec_texts))
+ use_line_derived_words = (
+ not same_word_count
+ and vlm_conf_above_50
+ and word_count_diff <= MAX_WORD_COUNT_DIFF_FOR_LINE_DERIVED_WORDS
+ )
+
+ if same_word_count and vlm_conf_above_50:
+ updates.append((key, new_text, avg_conf, new_words, line_conf))
+ elif use_line_derived_words:
+ # Use Bedrock text and Textract line bbox; derive word-level boxes.
+ left, top, right, bottom = bbox
+ line_w = max(1, int(right) - int(left))
+ line_h = max(1, int(bottom) - int(top))
+ word_level = _convert_single_line_to_word_level_standalone(
+ new_text,
+ int(left),
+ int(top),
+ line_w,
+ line_h,
+ avg_conf,
+ image,
+ img_width,
+ img_height,
+ output_folder,
+ image_name=image_name,
+ line_model="Bedrock VLM",
+ )
+ derived_words = []
+ for idx in range(len(word_level.get("text") or [])):
+ left = word_level["left"][idx]
+ top = word_level["top"][idx]
+ width = word_level["width"][idx]
+ height = word_level["height"][idx]
+ confidence = (
+ word_level["conf"][idx]
+ if idx < len(word_level.get("conf") or [])
+ else avg_conf
+ )
+ try:
+ confidence = max(0, min(100, float(confidence)))
+ except (TypeError, ValueError):
+ confidence = avg_conf
+ derived_words.append(
+ {
+ "text": word_level["text"][idx],
+ "confidence": round(confidence, 0),
+ "bounding_box": (left, top, left + width, top + height),
+ "model": "Bedrock VLM",
+ }
+ )
+ if derived_words:
+ updates.append((key, new_text, avg_conf, derived_words, line_conf))
+ else:
+ print(
+ f" Skipping VLM result for {key}: line-to-word returned no words (original={original_word_count}, VLM={len(rec_texts)}). Keeping Textract."
+ )
+ else:
+ if not same_word_count and not use_line_derived_words:
+ print(
+ f" Skipping VLM result for {key}: word count mismatch (original={original_word_count}, VLM={len(expanded_texts)}). Keeping Textract."
+ )
+ elif not vlm_conf_above_50:
+ print(
+ f" Skipping VLM result for {key}: VLM confidence {avg_conf:.0f} not above 50. Keeping Textract."
+ )
+
+ # Map back into ocr results with words and update line-level OCRResult objects
+ line_level_results = page_line_level_ocr_results.get("results") or []
+ for key, new_text, avg_conf, new_words, line_conf in updates:
+ line_data = results.get(key)
+ if line_data is not None and isinstance(line_data, dict):
+ line_data["text"] = new_text
+ line_data["confidence"] = round(avg_conf, 0)
+ line_data["words"] = new_words
+ line_data["model"] = "Bedrock VLM"
+ # Update corresponding line-level OCRResult (by line number from key "text_line_N")
+ try:
+ line_num = int(key.replace("text_line_", ""))
+ idx = line_num - 1
+ if 0 <= idx < len(line_level_results):
+ line_result = line_level_results[idx]
+ if hasattr(line_result, "text"):
+ line_result.text = new_text
+ line_result.conf = round(avg_conf, 0)
+ line_result.model = "Bedrock VLM"
+ elif isinstance(line_result, dict):
+ line_result["text"] = new_text
+ line_result["conf"] = round(avg_conf, 0)
+ line_result["model"] = "Bedrock VLM"
+ except (ValueError, TypeError):
+ pass
+ print(
+ f" Re-OCR'd line (Textract conf: {line_conf:.0f}) -> '{new_text}' (conf: {avg_conf:.0f}) [Bedrock VLM]"
+ )
+
+ if len(updates) == 0:
+ page_no = page_line_level_ocr_results_with_words.get("page", "?")
+ print(
+ f" Hybrid Textract + Bedrock VLM: no lines on page {page_no} met the low-confidence criteria (threshold={confidence_threshold:.0f}); no Bedrock VLM inference run for this page."
+ )
+
+ return (
+ page_line_level_ocr_results_with_words,
+ hybrid_vlm_input_tokens,
+ hybrid_vlm_output_tokens,
+ model_choice or "",
+ )
+
+
+def _inference_server_ocr_predict(
+ image: Image.Image,
+ prompt: str = model_default_prompt,
+ max_retries: int = 5,
+ model_name: str = None,
+ image_hybrid_line_prepared: bool = False,
+) -> Dict[str, Any]:
+ """
+ Inference-server OCR prediction function that mimics PaddleOCR's interface.
+ Calls an external inference-server API instead of a local model.
+
+ Args:
+ image: PIL Image to process
+ prompt: Text prompt for the VLM
+ max_retries: Maximum number of retry attempts for API calls (default: 5)
+ model_name: Name of the inference-server model to use
+ image_hybrid_line_prepared: If True, ``image`` was already processed with
+ ``_prepare_hybrid_line_crop_for_vlm`` (hybrid Paddle + inference-server path).
+
+ Returns:
+ Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
+
+ Raises:
+ Exception: If all retry attempts fail after max_retries attempts
+ """
+ try:
+ # Validate image exists and is not None
+ if image is None:
+ print("Inference-server OCR error: Image is None")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Validate image has valid size (at least 10x10 pixels)
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"Inference-server OCR error: Image is too small ({width}x{height} pixels). Minimum size is 10x10."
+ )
+ return {"rec_texts": [], "rec_scores": []}
+ except Exception as size_error:
+ print(f"Inference-server OCR error: Could not get image size: {size_error}")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Ensure image is in RGB mode (convert if needed)
+ try:
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ width, height = image.size
+ except Exception as convert_error:
+ print(
+ f"Inference-server OCR error: Could not convert image to RGB: {convert_error}"
+ )
+ return {"rec_texts": [], "rec_scores": []}
+
+ if not image_hybrid_line_prepared:
+ # Check and resize image if it exceeds maximum size or DPI limits
+ try:
+ image = _prepare_image_for_vlm(image, hybrid_vlm=True)
+ width, height = image.size
+ except Exception as prep_error:
+ print(
+ f"Inference-server OCR error: Could not prepare image for VLM: {prep_error}"
+ )
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Use the inference-server API to extract text with retry logic
+ extracted_text = None
+
+ for attempt in range(1, max_retries + 1):
+ try:
+ # Determine model_name: use provided parameter, then DEFAULT_INFERENCE_SERVER_VLM_MODEL, then INFERENCE_SERVER_MODEL_NAME
+ final_model_name = model_name
+ if final_model_name is None or final_model_name == "":
+ final_model_name = (
+ DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ if DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ else None
+ )
+ if final_model_name is None or final_model_name == "":
+ final_model_name = (
+ INFERENCE_SERVER_MODEL_NAME
+ if INFERENCE_SERVER_MODEL_NAME
+ else None
+ )
+
+ extracted_text, _vlm_input_tokens, _vlm_output_tokens, _, _ = (
+ _call_inference_server_vlm_api(
+ image=image,
+ prompt=prompt,
+ model_name=final_model_name,
+ max_new_tokens=HYBRID_OCR_MAX_NEW_TOKENS,
+ temperature=None,
+ top_p=None,
+ top_k=None,
+ repetition_penalty=None,
+ seed=None,
+ do_sample=model_default_do_sample,
+ min_p=None,
+ presence_penalty=None,
+ use_llama_swap=USE_LLAMA_SWAP,
+ apply_aspect_ratio_padding=not image_hybrid_line_prepared,
+ )
+ )
+ # If we get here, the API call succeeded
+ break
+ except Exception as api_error:
+ print(
+ f"Inference-server OCR retry attempt {attempt}/{max_retries} failed: {api_error}"
+ )
+ if attempt == max_retries:
+ # All retries exhausted, raise the exception
+ raise Exception(
+ f"Inference-server OCR failed after {max_retries} attempts. Last error: {str(api_error)}"
+ ) from api_error
+ # Continue to next retry attempt
+
+ # Check if extracted_text is None or empty
+ if extracted_text is None:
+ return {"rec_texts": [], "rec_scores": []}
+
+ if not isinstance(extracted_text, str):
+ return {"rec_texts": [], "rec_scores": []}
+
+ if extracted_text.strip():
+ # Try to parse VLM/LLM response for {"text": "...", "confidence": ...} or "conf"
+ # If multiple dicts (e.g. one per word), combine text and average confidence
+ parsed = _extract_and_combine_text_dicts_from_vlm_response(extracted_text)
+ if parsed is not None and isinstance(parsed.get("text"), str):
+ text_content = parsed.get("text")
+ # Prefer "confidence", fallback to "conf" (VLM may use either)
+ confidence = parsed.get("confidence", parsed.get("conf"))
+ try:
+ score = float(confidence) if confidence is not None else 1.0
+ # Normalise: if > 1 assume percentage (0–100), else 0–1
+ if score > 1.0:
+ score = score / 100.0
+ score = max(0.0, min(1.0, score))
+ except (TypeError, ValueError):
+ score = 1.0
+ cleaned_text = re.sub(r"[\r\n]+", " ", text_content).strip()
+ else:
+ # No parseable dict: use raw text and default confidence
+ cleaned_text = re.sub(r"[\r\n]+", " ", extracted_text).strip()
+ score = 1.0
+
+ # Split into words for compatibility with PaddleOCR format
+ words = cleaned_text.split()
+
+ # If text has more words than the line-level limit, assume something went wrong and skip it
+ if len(words) > HYBRID_OCR_MAX_WORDS:
+ print(
+ f"Inference-server OCR warning: Extracted text has {len(words)} words, which exceeds the {HYBRID_OCR_MAX_WORDS} word limit. Skipping."
+ )
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Create PaddleOCR-compatible result; use VLM/LLM confidence when available
+ result = {
+ "rec_texts": words,
+ "rec_scores": [score] * len(words),
+ }
+
+ return result
+ else:
+ return {"rec_texts": [], "rec_scores": []}
+
+ except Exception as e:
+ # Re-raise if it's the retry exhaustion exception
+ if "failed after" in str(e) and "attempts" in str(e):
+ raise
+ # Otherwise, handle other exceptions as before
+ print(f"Inference-server OCR error: {e}")
+ import traceback
+
+ print(f"Inference-server OCR error traceback: {traceback.format_exc()}")
+ return {"rec_texts": [], "rec_scores": []}
+
+
+def _bedrock_vlm_ocr_predict(
+ image: Image.Image,
+ prompt: str = model_default_prompt,
+ model_choice: str = None,
+ bedrock_runtime=None,
+ max_retries: int = 10,
+ return_prompt_and_response: bool = False,
+) -> Dict[str, Any]:
+ """
+ Bedrock VLM OCR prediction function that mimics PaddleOCR's interface.
+
+ Args:
+ image: PIL Image to process
+ prompt: Text prompt for the VLM
+ model_choice: Bedrock model ID
+ bedrock_runtime: boto3 Bedrock runtime client
+ max_retries: Maximum number of retry attempts for API calls (default: 10)
+ return_prompt_and_response: If True, add "prompt" and "raw_response" to the
+ returned dict for logging (raw_response is the raw API text before parsing).
+
+ Returns:
+ Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
+ (and optionally 'prompt', 'raw_response' when return_prompt_and_response is True).
+ """
+ extracted_text = None
+ vlm_input_tokens_used = 0
+ vlm_output_tokens_used = 0
+
+ def _add_prompt_response(d: Dict[str, Any]) -> Dict[str, Any]:
+ if return_prompt_and_response:
+ d["prompt"] = prompt
+ d["raw_response"] = extracted_text
+ d["vlm_input_tokens"] = vlm_input_tokens_used
+ d["vlm_output_tokens"] = vlm_output_tokens_used
+ return d
+
+ try:
+ # Validate image exists and is not None
+ if image is None:
+ print("Bedrock VLM OCR error: Image is None")
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+ # Validate image has valid size (at least 10x10 pixels)
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"Bedrock VLM OCR error: Image is too small ({width}x{height} pixels). Minimum size is 10x10."
+ )
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+ except Exception as size_error:
+ print(f"Bedrock VLM OCR error: Could not get image size: {size_error}")
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+ # Ensure image is in RGB mode (convert if needed)
+ try:
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ width, height = image.size
+ except Exception as convert_error:
+ print(
+ f"Bedrock VLM OCR error: Could not convert image to RGB: {convert_error}"
+ )
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+ # Check and resize image if it exceeds maximum size or DPI limits
+ # Skip resizing for AWS Bedrock VLM OCR
+ try:
+ from tools.config import BEDROCK_VLM_TEXT_EXTRACT_OPTION
+
+ image = _prepare_image_for_vlm(
+ image,
+ ocr_method=BEDROCK_VLM_TEXT_EXTRACT_OPTION,
+ hybrid_vlm=True,
+ )
+ width, height = image.size
+ except Exception as prep_error:
+ print(
+ f"Bedrock VLM OCR error: Could not prepare image for VLM: {prep_error}"
+ )
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+ # Use the Bedrock API to extract text with retry logic
+ for attempt in range(1, max_retries + 1):
+ try:
+ extracted_text, _vlm_input_tokens, _vlm_output_tokens, _, _ = (
+ _call_bedrock_vlm_api(
+ image=image,
+ prompt=prompt,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ max_new_tokens=MAX_NEW_TOKENS,
+ temperature=model_default_temperature,
+ top_p=model_default_top_p,
+ )
+ )
+ vlm_input_tokens_used = _vlm_input_tokens
+ vlm_output_tokens_used = _vlm_output_tokens
+ # If we get here, the API call succeeded
+ break
+ except Exception as api_error:
+ print(
+ f"Bedrock VLM OCR retry attempt {attempt}/{max_retries} failed: {api_error}"
+ )
+ if attempt == max_retries:
+ raise Exception(
+ f"Bedrock VLM OCR failed after {max_retries} attempts. Last error: {str(api_error)}"
+ ) from api_error
+
+ # Check if extracted_text is None or empty
+ if extracted_text is None:
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+ if not isinstance(extracted_text, str):
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+ if extracted_text.strip():
+ # If Bedrock returns multiple dicts (e.g. one per word) {"text": "...", "confidence": ...}, combine and average (same as local VLM / inference server)
+ parsed = _extract_and_combine_text_dicts_from_vlm_response(extracted_text)
+ if parsed is not None and isinstance(parsed.get("text"), str):
+ text_content = parsed.get("text", "").strip()
+ conf = parsed.get("confidence", parsed.get("conf"))
+ try:
+ score = float(conf) if conf is not None else 1.0
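+                    # Normalise: if > 1 assume percentage (0–100), else 0–1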
+ if score > 1.0:
+ score = score / 100.0
+ score = max(0.0, min(1.0, score))
+ except (TypeError, ValueError):
+ score = 1.0
+ if text_content:
+ words = re.sub(r"[\r\n]+", " ", text_content).strip().split()
+ if len(words) <= HYBRID_OCR_MAX_WORDS:
+ return _add_prompt_response(
+ {
+ "rec_texts": words,
+ "rec_scores": [score] * len(words),
+ }
+ )
+ # Reject parsed result with empty text or zero confidence (e.g. {"text": "", "conf": 0.0})
+ if not text_content or score <= 0.0:
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+ # Try to parse VLM JSON response [{'bbox': [...], 'text': '...', 'conf': 0-1}, ...]
+ lines_data = None
+ text = extracted_text.strip()
+ try:
+ text = _fix_malformed_bbox_in_json_string(text)
+ except Exception:
+ pass
+
+ try:
+ lines_data = json.loads(text)
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None:
+ json_match = re.search(r"```(?:json)?\s*(\[.*?\])", text, re.DOTALL)
+ if json_match:
+ try:
+ lines_data = json.loads(json_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
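+            # As a fallback, scan for the first balanced [...] block and try to parse it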
+ if lines_data is None and "[" in text:
+ start_idx = text.find("[")
+ bracket_count = 0
+ end_idx = start_idx
+ for i in range(start_idx, len(text)):
+ if text[i] == "[":
+ bracket_count += 1
+ elif text[i] == "]":
+ bracket_count -= 1
+ if bracket_count == 0:
+ end_idx = i
+ break
+ if end_idx > start_idx:
+ try:
+ lines_data = json.loads(text[start_idx : end_idx + 1])
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None:
+ try:
+ python_data = ast.literal_eval(text)
+ if isinstance(python_data, list):
+ lines_data = python_data
+ except Exception:
+ pass
+
+ if isinstance(lines_data, list) and len(lines_data) > 0:
+ rec_texts = []
+ rec_scores = []
+ for line_item in lines_data:
+ if not isinstance(line_item, dict):
+ continue
+ line_text = line_item.get("text_content") or line_item.get(
+ "text", ""
+ )
+ if line_text is None:
+ line_text = ""
+ line_text = str(line_text).strip()
+ if not line_text:
+ continue
+ conf = line_item.get("confidence", line_item.get("conf"))
+ try:
+ score = float(conf) if conf is not None else 1.0
+ if score > 1.0:
+ score = score / 100.0
+ score = max(0.0, min(1.0, score))
+ except (TypeError, ValueError):
+ score = 1.0
+ rec_texts.append(line_text)
+ rec_scores.append(score)
+ if rec_texts:
+ return _add_prompt_response(
+ {"rec_texts": rec_texts, "rec_scores": rec_scores}
+ )
+
+ # Fallback: treat response as plain text (e.g. different prompt)
+ cleaned_text = re.sub(r"[\r\n]+", " ", extracted_text)
+ cleaned_text = cleaned_text.strip()
+
+ words = cleaned_text.split()
+
+ if len(words) > HYBRID_OCR_MAX_WORDS:
+ print(
+ f"Bedrock VLM OCR warning: Extracted text has {len(words)} words, which exceeds the {HYBRID_OCR_MAX_WORDS} word limit. Skipping."
+ )
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+ result = {
+ "rec_texts": words,
+ "rec_scores": [1.0] * len(words),
+ }
+
+ return _add_prompt_response(result)
+ else:
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+ except Exception as e:
+ print(f"Bedrock VLM OCR error: {e}")
+ import traceback
+
+ print(f"Bedrock VLM OCR error traceback: {traceback.format_exc()}")
+ return _add_prompt_response({"rec_texts": [], "rec_scores": []})
+
+
+def _gemini_vlm_ocr_predict(
+ image: Image.Image,
+ prompt: str = model_default_prompt,
+ model_choice: str = None,
+ client=None,
+ config=None,
+ max_retries: int = 5,
+) -> Dict[str, Any]:
+ """
+ Gemini VLM OCR prediction function that mimics PaddleOCR's interface.
+
+ Args:
+ image: PIL Image to process
+ prompt: Text prompt for the VLM
+ model_choice: Gemini model name
+ client: Gemini ai.Client instance
+ config: Gemini types.GenerateContentConfig instance
+ max_retries: Maximum number of retry attempts for API calls (default: 5)
+
+ Returns:
+ Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
+ """
+ try:
+ # Validate image exists and is not None
+ if image is None:
+ print("Gemini VLM OCR error: Image is None")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Validate image has valid size (at least 10x10 pixels)
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"Gemini VLM OCR error: Image is too small ({width}x{height} pixels). Minimum size is 10x10."
+ )
+ return {"rec_texts": [], "rec_scores": []}
+ except Exception as size_error:
+ print(f"Gemini VLM OCR error: Could not get image size: {size_error}")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Ensure image is in RGB mode (convert if needed)
+ try:
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ width, height = image.size
+ except Exception as convert_error:
+ print(
+ f"Gemini VLM OCR error: Could not convert image to RGB: {convert_error}"
+ )
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Check and resize image if it exceeds maximum size or DPI limits
+ try:
+ image = _prepare_image_for_vlm(image, hybrid_vlm=True)
+ width, height = image.size
+ except Exception as prep_error:
+ print(
+ f"Gemini VLM OCR error: Could not prepare image for VLM: {prep_error}"
+ )
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Use the Gemini API to extract text with retry logic
+ extracted_text = None
+
+ for attempt in range(1, max_retries + 1):
+ try:
+ extracted_text, _, _ = _call_gemini_vlm_api(
+ image=image,
+ prompt=prompt,
+ client=client,
+ config=config,
+ model_choice=model_choice,
+ max_new_tokens=MAX_NEW_TOKENS,
+ temperature=model_default_temperature,
+ )
+ # If we get here, the API call succeeded
+ break
+ except Exception as api_error:
+ print(
+ f"Gemini VLM OCR retry attempt {attempt}/{max_retries} failed: {api_error}"
+ )
+ if attempt == max_retries:
+ raise Exception(
+ f"Gemini VLM OCR failed after {max_retries} attempts. Last error: {str(api_error)}"
+ ) from api_error
+
+ # Check if extracted_text is None or empty
+ if extracted_text is None:
+ return {"rec_texts": [], "rec_scores": []}
+
+ if not isinstance(extracted_text, str):
+ return {"rec_texts": [], "rec_scores": []}
+
+ if extracted_text.strip():
+ # Try to parse VLM JSON response [{'bbox': [...], 'text': '...', 'conf': 0-1}, ...]
+ lines_data = None
+ text = extracted_text.strip()
+ try:
+ text = _fix_malformed_bbox_in_json_string(text)
+ except Exception:
+ pass
+
+ try:
+ lines_data = json.loads(text)
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None:
+ json_match = re.search(r"```(?:json)?\s*(\[.*?\])", text, re.DOTALL)
+ if json_match:
+ try:
+ lines_data = json.loads(json_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None and "[" in text:
+ start_idx = text.find("[")
+ bracket_count = 0
+ end_idx = start_idx
+ for i in range(start_idx, len(text)):
+ if text[i] == "[":
+ bracket_count += 1
+ elif text[i] == "]":
+ bracket_count -= 1
+ if bracket_count == 0:
+ end_idx = i
+ break
+ if end_idx > start_idx:
+ try:
+ lines_data = json.loads(text[start_idx : end_idx + 1])
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None:
+ try:
+ python_data = ast.literal_eval(text)
+ if isinstance(python_data, list):
+ lines_data = python_data
+ except Exception:
+ pass
+
+ if isinstance(lines_data, list) and len(lines_data) > 0:
+ rec_texts = []
+ rec_scores = []
+ for line_item in lines_data:
+ if not isinstance(line_item, dict):
+ continue
+ line_text = line_item.get("text_content") or line_item.get(
+ "text", ""
+ )
+ if line_text is None:
+ line_text = ""
+ line_text = str(line_text).strip()
+ if not line_text:
+ continue
+ conf = line_item.get("confidence", line_item.get("conf"))
+ try:
+ score = float(conf) if conf is not None else 1.0
+ if score > 1.0:
+ score = score / 100.0
+ score = max(0.0, min(1.0, score))
+ except (TypeError, ValueError):
+ score = 1.0
+ rec_texts.append(line_text)
+ rec_scores.append(score)
+ if rec_texts:
+ return {"rec_texts": rec_texts, "rec_scores": rec_scores}
+
+ # Fallback: treat response as plain text (e.g. different prompt)
+ cleaned_text = re.sub(r"[\r\n]+", " ", extracted_text)
+ cleaned_text = cleaned_text.strip()
+
+ words = cleaned_text.split()
+
+ if len(words) > HYBRID_OCR_MAX_WORDS:
+ print(
+ f"Gemini VLM OCR warning: Extracted text has {len(words)} words, which exceeds the {HYBRID_OCR_MAX_WORDS} word limit. Skipping."
+ )
+ return {"rec_texts": [], "rec_scores": []}
+
+ result = {
+ "rec_texts": words,
+ "rec_scores": [1.0] * len(words),
+ }
+
+ return result
+ else:
+ return {"rec_texts": [], "rec_scores": []}
+
+ except Exception as e:
+ print(f"Gemini VLM OCR error: {e}")
+ import traceback
+
+ print(f"Gemini VLM OCR error traceback: {traceback.format_exc()}")
+ return {"rec_texts": [], "rec_scores": []}
+
+
+def _azure_openai_vlm_ocr_predict(
+ image: Image.Image,
+ prompt: str = model_default_prompt,
+ model_choice: str = None,
+ client=None,
+ max_retries: int = 5,
+) -> Dict[str, Any]:
+ """
+ Azure/OpenAI VLM OCR prediction function that mimics PaddleOCR's interface.
+
+ Args:
+ image: PIL Image to process
+ prompt: Text prompt for the VLM
+ model_choice: Model name (e.g., "gpt-4o", "gpt-4-vision-preview")
+ client: OpenAI client instance
+ max_retries: Maximum number of retry attempts for API calls (default: 5)
+
+ Returns:
+ Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
+ """
+ try:
+ # Validate image exists and is not None
+ if image is None:
+ print("Azure/OpenAI VLM OCR error: Image is None")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Validate image has valid size (at least 10x10 pixels)
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"Azure/OpenAI VLM OCR error: Image is too small ({width}x{height} pixels). Minimum size is 10x10."
+ )
+ return {"rec_texts": [], "rec_scores": []}
+ except Exception as size_error:
+ print(f"Azure/OpenAI VLM OCR error: Could not get image size: {size_error}")
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Ensure image is in RGB mode (convert if needed)
+ try:
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ width, height = image.size
+ except Exception as convert_error:
+ print(
+ f"Azure/OpenAI VLM OCR error: Could not convert image to RGB: {convert_error}"
+ )
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Check and resize image if it exceeds maximum size or DPI limits
+ try:
+ image = _prepare_image_for_vlm(image, hybrid_vlm=True)
+ width, height = image.size
+ except Exception as prep_error:
+ print(
+ f"Azure/OpenAI VLM OCR error: Could not prepare image for VLM: {prep_error}"
+ )
+ return {"rec_texts": [], "rec_scores": []}
+
+ # Use the Azure/OpenAI API to extract text with retry logic
+ extracted_text = None
+
+ for attempt in range(1, max_retries + 1):
+ try:
+ extracted_text, _, _ = _call_azure_openai_vlm_api(
+ image=image,
+ prompt=prompt,
+ client=client,
+ model_choice=model_choice,
+ max_new_tokens=MAX_NEW_TOKENS,
+ temperature=model_default_temperature,
+ )
+ # If we get here, the API call succeeded
+ break
+ except Exception as api_error:
+ print(
+ f"Azure/OpenAI VLM OCR retry attempt {attempt}/{max_retries} failed: {api_error}"
+ )
+ if attempt == max_retries:
+ raise Exception(
+ f"Azure/OpenAI VLM OCR failed after {max_retries} attempts. Last error: {str(api_error)}"
+ ) from api_error
+
+ # Check if extracted_text is None or empty
+ if extracted_text is None:
+ return {"rec_texts": [], "rec_scores": []}
+
+ if not isinstance(extracted_text, str):
+ return {"rec_texts": [], "rec_scores": []}
+
+ if extracted_text.strip():
+ # Try to parse VLM JSON response [{'bbox': [...], 'text': '...', 'conf': 0-1}, ...]
+ lines_data = None
+ text = extracted_text.strip()
+ try:
+ text = _fix_malformed_bbox_in_json_string(text)
+ except Exception:
+ pass
+
+ try:
+ lines_data = json.loads(text)
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None:
+ json_match = re.search(r"```(?:json)?\s*(\[.*?\])", text, re.DOTALL)
+ if json_match:
+ try:
+ lines_data = json.loads(json_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None and "[" in text:
+ start_idx = text.find("[")
+ bracket_count = 0
+ end_idx = start_idx
+ for i in range(start_idx, len(text)):
+ if text[i] == "[":
+ bracket_count += 1
+ elif text[i] == "]":
+ bracket_count -= 1
+ if bracket_count == 0:
+ end_idx = i
+ break
+ if end_idx > start_idx:
+ try:
+ lines_data = json.loads(text[start_idx : end_idx + 1])
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None:
+ try:
+ python_data = ast.literal_eval(text)
+ if isinstance(python_data, list):
+ lines_data = python_data
+ except Exception:
+ pass
+
+ if isinstance(lines_data, list) and len(lines_data) > 0:
+ rec_texts = []
+ rec_scores = []
+ for line_item in lines_data:
+ if not isinstance(line_item, dict):
+ continue
+ line_text = line_item.get("text_content") or line_item.get(
+ "text", ""
+ )
+ if line_text is None:
+ line_text = ""
+ line_text = str(line_text).strip()
+ if not line_text:
+ continue
+ conf = line_item.get("confidence", line_item.get("conf"))
+ try:
+ score = float(conf) if conf is not None else 1.0
+ if score > 1.0:
+ score = score / 100.0
+ score = max(0.0, min(1.0, score))
+ except (TypeError, ValueError):
+ score = 1.0
+ rec_texts.append(line_text)
+ rec_scores.append(score)
+ if rec_texts:
+ return {"rec_texts": rec_texts, "rec_scores": rec_scores}
+
+ # Fallback: treat response as plain text (e.g. different prompt)
+ cleaned_text = re.sub(r"[\r\n]+", " ", extracted_text)
+ cleaned_text = cleaned_text.strip()
+
+ words = cleaned_text.split()
+
+ if len(words) > HYBRID_OCR_MAX_WORDS:
+ print(
+ f"Azure/OpenAI VLM OCR warning: Extracted text has {len(words)} words, which exceeds the {HYBRID_OCR_MAX_WORDS} word limit. Skipping."
+ )
+ return {"rec_texts": [], "rec_scores": []}
+
+ result = {
+ "rec_texts": words,
+ "rec_scores": [1.0] * len(words),
+ }
+
+ return result
+ else:
+ return {"rec_texts": [], "rec_scores": []}
+
+ except Exception as e:
+ print(f"Azure/OpenAI VLM OCR error: {e}")
+ import traceback
+
+ print(f"Azure/OpenAI VLM OCR error traceback: {traceback.format_exc()}")
+ return {"rec_texts": [], "rec_scores": []}
+
+
+def plot_text_bounding_boxes(
+ image: Image.Image,
+    bounding_boxes: str,
+ image_name: str = "initial_vlm_output_bounding_boxes.png",
+ image_folder: str = "inference_server_visualisations",
+ output_folder: str = OUTPUT_FOLDER,
+ task_type: str = "ocr",
+):
+ """
+    Plots bounding boxes on an image, drawing the extracted text next to each box, using PIL and normalised coordinates.
+
+ Args:
+ image: The PIL Image object.
+        bounding_boxes: The raw VLM response text containing a list of bounding box dicts,
+            each with the extracted text and its position in normalised [x1, y1, x2, y2] format.
+ image_name: The name of the image for debugging.
+ image_folder: The folder name (relative to output_folder) where the image will be saved.
+ output_folder: The folder where the image will be saved.
+ task_type: The type of task the bounding boxes are for ("ocr", "person", "signature").
+ """
+
+ # Load the image
+ img = image
+ width, height = img.size
+ # Create a drawing object
+ draw = ImageDraw.Draw(img)
+
+ # Parsing out the markdown fencing
+ bounding_boxes = parse_json(bounding_boxes)
+
+ font = ImageFont.load_default()
+
+ # Iterate over the bounding boxes
+ for i, bbox_dict in enumerate(ast.literal_eval(bounding_boxes)):
+ color = "green"
+
+ # Extract the bounding box coordinates (preserve the original dict for text extraction)
+ if "bb" in bbox_dict:
+ bbox_coords = bbox_dict["bb"]
+ elif "bbox" in bbox_dict:
+ bbox_coords = bbox_dict["bbox"]
+ elif "bbox_2d" in bbox_dict:
+ bbox_coords = bbox_dict["bbox_2d"]
+ else:
+ # Skip if no valid bbox found
+ continue
+
+ # Ensure bbox_coords is a list with 4 elements
+ if not isinstance(bbox_coords, list) or len(bbox_coords) != 4:
+ # Try to fix malformed bbox
+ fixed_bbox = _fix_malformed_bbox(bbox_coords)
+ if fixed_bbox is not None:
+ bbox_coords = fixed_bbox
+ else:
+ continue
+
+ # Convert normalized coordinates to absolute coordinates
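+        # Coordinates are assumed to be normalised to the 0-999 range used by the full-page VLM prompts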
+ abs_y1 = int(bbox_coords[1] / 999 * height)
+ abs_x1 = int(bbox_coords[0] / 999 * width)
+ abs_y2 = int(bbox_coords[3] / 999 * height)
+ abs_x2 = int(bbox_coords[2] / 999 * width)
+
+ if abs_x1 > abs_x2:
+ abs_x1, abs_x2 = abs_x2, abs_x1
+
+ if abs_y1 > abs_y2:
+ abs_y1, abs_y2 = abs_y2, abs_y1
+
+ # Draw the bounding box
+ draw.rectangle(((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=1)
+
+ # Draw the text - extract from the original dictionary, not the coordinates
+ text_to_draw = "No text"
+ if "text" in bbox_dict:
+ text_to_draw = bbox_dict["text"]
+ elif "text_content" in bbox_dict:
+ text_to_draw = bbox_dict["text_content"]
+
+ draw.text((abs_x1, abs_y2), text_to_draw, fill=color, font=font)
+
+ try:
+ debug_dir = os.path.join(
+ output_folder,
+ image_folder,
+ )
+ # Security: Validate that the constructed path is safe
+ normalized_debug_dir = os.path.normpath(os.path.abspath(debug_dir))
+ if not validate_folder_containment(normalized_debug_dir, OUTPUT_FOLDER):
+ raise ValueError(
+ f"Unsafe image folder path: {debug_dir}. Must be contained within {OUTPUT_FOLDER}"
+ )
+ os.makedirs(normalized_debug_dir, exist_ok=True)
+ # Increment the number at the end of image_name before .png
+ # This converts zero-indexed input to one-indexed output
+ incremented_image_name = image_name
+ if image_name.endswith(".png"):
+ # Find the number pattern at the end before .png
+ # Matches patterns like: _0.png, _00.png, 0.png, 00.png, etc.
+ pattern = r"(\d+)(\.png)$"
+ match = re.search(pattern, image_name)
+ if match:
+ number_str = match.group(1)
+ number = int(number_str)
+ incremented_number = number + 1
+ # Preserve the same number of digits (padding with zeros if needed)
+ incremented_str = str(incremented_number).zfill(len(number_str))
+ incremented_image_name = re.sub(
+ pattern, lambda m: incremented_str + m.group(2), image_name
+ )
+
+ image_name_safe = safe_sanitize_text(incremented_image_name)
+ image_name_shortened = image_name_safe[:50]
+ task_type_suffix = f"_{task_type}" if task_type != "ocr" else ""
+ filename = (
+ f"{image_name_shortened}_initial_bounding_box_output{task_type_suffix}.png"
+ )
+ filepath = os.path.join(normalized_debug_dir, filename)
+ _save_image_with_config_dpi(img, filepath)
+ except Exception as e:
+ print(f"Error saving image with bounding boxes: {e}")
+
+
+def parse_json(json_output):
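+    """
+    Strip markdown code fencing from a model response: return the content between a
+    "```json" line and the next closing "```", or the input unchanged if no "```json"
+    line is found.
+    """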
+ # Parsing out the markdown fencing
+ lines = json_output.splitlines()
+ for i, line in enumerate(lines):
+ if line == "```json":
+ json_output = "\n".join(
+ lines[i + 1 :]
+ ) # Remove everything before "```json"
+ json_output = json_output.split("```")[
+ 0
+ ] # Remove everything after the closing "```"
+ break # Exit the loop once "```json" is found
+ return json_output
+
+
+def _fix_malformed_bbox_in_json_string(json_string):
+ """
+ Fixes malformed bounding box values in a JSON string before parsing.
+
+ Handles cases like:
+ - "bb": "779, 767, 874, 789], "text" (missing opening bracket, missing closing quote)
+ - "bb": "[779, 767, 874, 789]" (stringified array)
+ - "bb": "779, 767, 874, 789" (no brackets)
+
+ Args:
+ json_string: The raw JSON string that may contain malformed bbox values
+
+ Returns:
+ str: The JSON string with malformed bbox values fixed
+ """
+ import re
+
+ # Pattern 1: Match malformed bbox like: "bb": "779, 767, 874, 789], "text"
+ # The issue: missing opening bracket, missing closing quote after the bracket
+ # Matches: "bb": " followed by numbers, ], then , "
+ pattern1 = (
+ r'("(?:bb|bbox|bbox_2d)"\s*:\s*)"(\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+)\]\s*,\s*"'
+ )
+
+ def fix_bbox_match1(match):
+ key_part = match.group(1) # "bb": "
+ bbox_str = match.group(2) # "779, 767, 874, 789"
+
+ # Format as proper JSON array (no quotes around it)
+ fixed_bbox = "[" + bbox_str.strip() + "]"
+
+ # Return the fixed version: "bb": [779, 767, 874, 789], "
+ return key_part + fixed_bbox + ', "'
+
+ # Pattern 2: Match malformed bbox like: "bb": "779, 767, 874, 789]"
+ # Missing opening bracket, but has closing quote
+ pattern2 = r'("(?:bb|bbox|bbox_2d)"\s*:\s*)"(\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+)\]"'
+
+ def fix_bbox_match2(match):
+ key_part = match.group(1)
+ bbox_str = match.group(2)
+ fixed_bbox = "[" + bbox_str.strip() + "]"
+ return key_part + fixed_bbox + '"'
+
+ # Pattern 3: Match malformed bbox like: "bb": "779, 767, 874, 789] (end of object, no quote)
+ pattern3 = (
+ r'("(?:bb|bbox|bbox_2d)"\s*:\s*)"(\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+)\]\s*\}'
+ )
+
+ def fix_bbox_match3(match):
+ key_part = match.group(1)
+ bbox_str = match.group(2)
+ fixed_bbox = "[" + bbox_str.strip() + "]"
+ return key_part + fixed_bbox + "}"
+
+ # Apply the fixes in order
+ fixed_json = re.sub(pattern1, fix_bbox_match1, json_string)
+ fixed_json = re.sub(pattern2, fix_bbox_match2, fixed_json)
+ fixed_json = re.sub(pattern3, fix_bbox_match3, fixed_json)
+
+ return fixed_json
+
+
+def _repair_vlm_json_stray_coordinate_strings(
+ json_string: str,
+ default_text: str = "[UNKNOWN]",
+ default_conf: float = 0.9,
+) -> str:
+ """
+ Fix invalid JSON where the VLM repeats bbox coords as a lone quoted string
+ instead of \"text\" / \"conf\", e.g.:
+ {\"bbox\": [870, 290, 913, 316], \"870, 290, 913, 316\"}
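+    which (with the default text/conf) is rewritten to:
+      {\"bbox\": [870, 290, 913, 316], \"text\": \"[UNKNOWN]\", \"conf\": 0.9}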
+ """
+ if not json_string or not default_text:
+ return json_string
+ text_lit = json.dumps(str(default_text))
+ conf_lit = json.dumps(float(default_conf))
+ # After a closing `]` (end of bbox array), comma, quoted string of four ints only
+ pattern = re.compile(r'(\])\s*,\s*"(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)"')
+
+ def _repl(m):
+ return f'{m.group(1)}, "text": {text_lit}, "conf": {conf_lit}'
+
+ prev = None
+ s = json_string
+ while prev != s:
+ prev = s
+ s = pattern.sub(_repl, s)
+ return s
+
+
+def _repair_vlm_json_missing_text_key_after_bbox(json_string: str) -> str:
+ """
+ Fix invalid JSON where the VLM omits the \"text\" key before the OCR string.
+
+ Broken (not valid JSON — bare string where a key:value pair is required):
+ {\"bbox\": [42, 70, 223, 115], \"Method\", \"conf\": 0.882}
+ Fixed:
+ {\"bbox\": [42, 70, 223, 115], \"text\": \"Method\", \"conf\": 0.882}
+
+ Common Bedrock/VLM mistake after a correct first line or two.
+ """
+ if not json_string or not isinstance(json_string, str):
+ return json_string
+ # After closing bbox array `]`, comma, a quoted string, comma, then "conf":
+ # Insert "text": before that string (only when the string is not already "text").
+ pattern = re.compile(
+ r'\]\s*,\s*"((?:[^"\\]|\\.)*)"\s*,\s*"conf"\s*:',
+ )
+
+ def _repl(m) -> str:
+ inner = m.group(1)
+ text_lit = json.dumps(inner)
+ return f'], "text": {text_lit}, "conf":'
+
+ prev = None
+ s = json_string
+ while prev != s:
+ prev = s
+ s = pattern.sub(_repl, s)
+ return s
+
+
+def _preprocess_vlm_ocr_json_string(
+ raw: Optional[str],
+ implied_label: Optional[str] = None,
+) -> str:
+ """Chain bbox fixes and stray-coordinate repair before json.loads."""
+ if not raw or not isinstance(raw, str):
+ return ""
+ s = raw.strip()
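+    # Illustrative example: '... "bb": "10, 20, 110, 45], "text": ...' is repaired to
+    # '... "bb": [10, 20, 110, 45], "text": ...' before json.loads is attempted.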
+ s = _fix_malformed_bbox_in_json_string(s)
+ label = implied_label if implied_label else "[UNKNOWN]"
+ s = _repair_vlm_json_stray_coordinate_strings(s, default_text=label)
+ s = _repair_vlm_json_missing_text_key_after_bbox(s)
+ s = _repair_vlm_json_common_quote_issues(s)
+ return s
+
+
+CUSTOM_VLM_CANONICAL_LABELS = frozenset({"[FACE]", "[SIGNATURE]"})
+
+
+def _get_vlm_item_conf_field(item: dict):
+ """Best-effort confidence field from common or fuzzy VLM keys."""
+ if not isinstance(item, dict):
+ return None
+ for k in ("confidence", "conf", "confidence_level", "confidence_score"):
+ if item.get(k) is not None:
+ return item.get(k)
+ # Fuzzy match: any key containing "conf" (covers confidence_level, conf_score, etc.)
+ for key, val in item.items():
+ if val is None:
+ continue
+ lk = str(key).lower()
+ if "conf" in lk:
+ return val
+ return None
+
+
+def _extract_vlm_line_text(item: dict) -> str:
+ """
+ Best-effort string for general OCR line items when the model uses alternate keys.
+ Order avoids grabbing non-OCR fields like 'label' used for classes.
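+    e.g. {"transcription": "Invoice #123", "label": "header"} -> "Invoice #123".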
+ """
+ for key in ("text", "text_content", "content", "transcription"):
+ val = item.get(key)
+ if val is None:
+ continue
+ if isinstance(val, str):
+ s = val.strip()
+ if s:
+ return s
+ else:
+ s = str(val).strip()
+ if s:
+ return s
+ # Fuzzy match: accept any key that contains "text", but avoid obvious non-text fields.
+ for key, val in item.items():
+ if val is None:
+ continue
+ lk = str(key).lower()
+ if "text" not in lk:
+ continue
+ if lk in ("context", "texture", "text_direction"):
+ continue
+ if "label" in lk:
+ continue
+ if isinstance(val, str):
+ s = val.strip()
+ if s:
+ return s
+ else:
+ s = str(val).strip()
+ if s:
+ return s
+ return ""
+
+
+def _get_vlm_item_bbox_field(item: dict):
+ """Raw bbox value from common VLM keys (may be list or malformed string)."""
+ if not isinstance(item, dict):
+ return None
+ if item.get("bbox_2d") is not None:
+ return item.get("bbox_2d")
+ if item.get("bbox") is not None:
+ return item.get("bbox")
+ if item.get("bb") is not None:
+ return item.get("bb")
+ # Fuzzy match: any key containing bbox/bounding_box.
+ for key, val in item.items():
+ if val is None:
+ continue
+ lk = str(key).lower()
+ if "bbox" in lk or "bounding_box" in lk or "boundingbox" in lk:
+ return val
+ return None
+
+
+def _normalize_single_line_text_dict(obj: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+ """
+ Map content/conf aliases to text/confidence for hybrid / single-line VLM dicts.
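+    e.g. {"content": "John Smith", "conf_score": 92} -> {"text": "John Smith", "confidence": 92}.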
+ """
+ if not isinstance(obj, dict):
+ return None
+ text = obj.get("text")
+ if text is None or (isinstance(text, str) and not text.strip()):
+ for alt in ("content", "transcription", "text_content"):
+ v = obj.get(alt)
+ if v is not None:
+ if isinstance(v, str) and v.strip():
+ text = v.strip()
+ break
+ if not isinstance(v, str) and str(v).strip():
+ text = str(v).strip()
+ break
+ if text is None:
+ return None
+ if not isinstance(text, str):
+ text = str(text)
+ conf = _get_vlm_item_conf_field(obj)
+ out = {"text": text}
+ if conf is not None:
+ out["confidence"] = conf
+ return out
+
+
+def _fix_malformed_bbox(bbox):
+ """
+ Attempts to fix malformed bounding box values.
+
+ Handles cases where bbox is:
+ - A string like "779, 767, 874, 789]" (missing opening bracket)
+ - A string like "[779, 767, 874, 789]" (should be parsed)
+ - A string like "779, 767, 874, 789" (no brackets at all)
+ - Already a valid list (returns as-is)
+
+ Args:
+ bbox: The bounding box value (could be list, string, or other)
+
+ Returns:
+ list: A list of 4 numbers [x1, y1, x2, y2], or None if parsing fails
+ """
+ # If it's already a valid list, return it
+ if isinstance(bbox, list) and len(bbox) == 4:
+ return bbox
+
+ # If it's not a string, we can't fix it
+ if not isinstance(bbox, str):
+ return None
+
+ try:
+ # Remove any leading/trailing whitespace
+ bbox_str = bbox.strip()
+
+ # Remove quotes if present
+ if bbox_str.startswith('"') and bbox_str.endswith('"'):
+ bbox_str = bbox_str[1:-1]
+ elif bbox_str.startswith("'") and bbox_str.endswith("'"):
+ bbox_str = bbox_str[1:-1]
+
+ # Try to extract numbers from various formats
+ # Pattern 1: "779, 767, 874, 789]" (missing opening bracket)
+ # Pattern 2: "[779, 767, 874, 789]" (has brackets)
+ # Pattern 3: "779, 767, 874, 789" (no brackets)
+
+ # Remove brackets if present
+ if bbox_str.startswith("["):
+ bbox_str = bbox_str[1:]
+ if bbox_str.endswith("]"):
+ bbox_str = bbox_str[:-1]
+
+ # Split by comma and extract numbers
+ parts = [part.strip() for part in bbox_str.split(",")]
+
+ if len(parts) != 4:
+ return None
+
+ # Convert each part to float
+ coords = []
+ for part in parts:
+ try:
+ coords.append(float(part))
+ except (ValueError, TypeError):
+ return None
+
+ return coords
+
+ except Exception:
+ return None
+
+
+def _parse_vlm_line_item_to_geometry(
+ line_item: dict,
+ implied_label: Optional[str],
+ warn_prefix: str,
+) -> Optional[Tuple[str, List[float], float]]:
+ """
+ Parse one VLM JSON line object into text, xyxy floats, and raw confidence.
+ For person/signature passes (implied_label in CUSTOM_VLM_CANONICAL_LABELS),
+ text is always the canonical label when the bbox is valid; model text keys are ignored.
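+    For a general OCR line, e.g. {"bbox": [10, 20, 110, 45], "text": "Name", "conf": 0.95}
+    -> ("Name", [10.0, 20.0, 110.0, 45.0], 0.95).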
+ """
+ if not isinstance(line_item, dict):
+ return None
+
+ canon = None
+ if implied_label and str(implied_label).strip() in CUSTOM_VLM_CANONICAL_LABELS:
+ canon = str(implied_label).strip()
+
+ raw_bbox = _get_vlm_item_bbox_field(line_item)
+ if raw_bbox is None:
+ raw_bbox = []
+
+ fixed_bbox = _fix_malformed_bbox(raw_bbox)
+ if fixed_bbox is not None:
+ bbox = fixed_bbox
+ if not isinstance(raw_bbox, list) or len(raw_bbox) != 4:
+ dbg_txt = canon or _extract_vlm_line_text(line_item) or "?"
+ print(
+ f"{warn_prefix}: Fixed malformed bbox for line '{dbg_txt[:50]}': "
+ f"{raw_bbox} -> {fixed_bbox}"
+ )
+ elif isinstance(raw_bbox, list) and len(raw_bbox) == 4:
+ bbox = raw_bbox
+ else:
+ dbg_txt = canon or _extract_vlm_line_text(line_item) or "?"
+ print(
+ f"{warn_prefix} warning: Invalid bbox format for line '{dbg_txt[:50]}': {raw_bbox}"
+ )
+ return None
+
+ try:
+ x1 = float(bbox[0])
+ y1 = float(bbox[1])
+ x2 = float(bbox[2])
+ y2 = float(bbox[3])
+ except (ValueError, TypeError):
+ dbg_txt = canon or _extract_vlm_line_text(line_item) or "?"
+ print(
+ f"{warn_prefix} warning: Invalid bbox coordinates for line '{dbg_txt[:50]}': {bbox}"
+ )
+ return None
+
+ if x2 <= x1 or y2 <= y1:
+ dbg_txt = canon or _extract_vlm_line_text(line_item) or "?"
+ print(
+ f"{warn_prefix} warning: Invalid bbox dimensions for line '{dbg_txt[:50]}': {bbox}"
+ )
+ return None
+
+ if canon:
+ text = canon
+ conf_raw = _get_vlm_item_conf_field(line_item)
+ if conf_raw is None:
+ conf_raw = 0.9
+ else:
+ text = _extract_vlm_line_text(line_item)
+ if not text:
+ return None
+ conf_raw = _get_vlm_item_conf_field(line_item)
+ if conf_raw is None:
+ conf_raw = 100
+
+ try:
+ confidence = float(conf_raw)
+ except (TypeError, ValueError):
+ confidence = 0.9 if canon else 100.0
+
+ return (text, [x1, y1, x2, y2], confidence)
+
+
+def _vlm_page_ocr_predict(
+ image: Image.Image,
+ image_name: str = "vlm_page_ocr_input_image.png",
+ normalised_coords_range: Optional[int] = 999,
+ output_folder: str = OUTPUT_FOLDER,
+ detect_people_only: bool = False,
+ detect_signatures_only: bool = False,
+ progress: Optional[gr.Progress] = gr.Progress(),
+ page_index_0: Optional[int] = None,
+) -> Tuple[Dict[str, List], int, int, str]:
+ """
+ VLM page-level OCR prediction that returns structured line-level results with bounding boxes.
+
+ Args:
+ image: PIL Image to process (full page)
+ image_name: Name of the image for debugging
+ normalised_coords_range: If set, bounding boxes are assumed to be in normalized coordinates
+ from 0 to this value (e.g., 999 as used in the full-page VLM prompt). Coordinates will be rescaled to match the processed image size. If None, coordinates are assumed to be in absolute pixel coordinates.
+        output_folder: The folder where output images will be saved
+        detect_people_only: If True, use the face-detection prompt instead of general OCR
+        detect_signatures_only: If True, use the signature-detection prompt instead of general OCR
+
+    Returns:
+        Tuple of (ocr_dict, input_tokens, output_tokens, model_name), where ocr_dict has
+        'text', 'left', 'top', 'width', 'height', 'conf', 'model' keys matching the
+        format expected by perform_ocr
+ """
+ try:
+
+ # Validate image exists and is not None
+ if image is None:
+ print("VLM page OCR error: Image is None")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                "VLM",
+            )
+
+ # Validate image has valid size (at least 10x10 pixels)
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"VLM page OCR error: Image is too small ({width}x{height} pixels). Minimum size is 10x10."
+ )
+                return (
+                    {
+                        "text": [],
+                        "left": [],
+                        "top": [],
+                        "width": [],
+                        "height": [],
+                        "conf": [],
+                        "model": [],
+                    },
+                    0,
+                    0,
+                    "VLM",
+                )
+ except Exception as size_error:
+ print(f"VLM page OCR error: Could not get image size: {size_error}")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                "VLM",
+            )
+
+ # Ensure image is in RGB mode (convert if needed)
+ try:
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ width, height = image.size
+ except Exception as convert_error:
+ print(
+ f"VLM page OCR error: Could not convert image to RGB: {convert_error}"
+ )
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                "VLM",
+            )
+
+ # Check and resize image if it exceeds maximum size or DPI limits
+ scale_x = 1.0
+ scale_y = 1.0
+ try:
+ original_width, original_height = image.size
+ processed_image = _prepare_image_for_vlm(image)
+ # Pad so aspect ratio <= VLM_MAX_ASPECT_RATIO for hybrid/long pages
+ processed_image = _pad_image_for_vlm_aspect_ratio(processed_image)
+ processed_width, processed_height = processed_image.size
+
+ # Use float division to avoid rounding errors
+ scale_x = (
+ float(original_width) / float(processed_width)
+ if processed_width > 0
+ else 1.0
+ )
+ scale_y = (
+ float(original_height) / float(processed_height)
+ if processed_height > 0
+ else 1.0
+ )
+
+ # Debug: print scale factors to verify
+ if scale_x != 1.0 or scale_y != 1.0:
+ print(f"Scale factors: x={scale_x:.6f}, y={scale_y:.6f}")
+ print(
+ f"Original: {original_width}x{original_height}, Processed: {processed_width}x{processed_height}"
+ )
+ except Exception as prep_error:
+ print(f"VLM page OCR error: Could not prepare image for VLM: {prep_error}")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                "VLM",
+            )
+
+ # Save input image for debugging if environment variable is set
+ if SAVE_VLM_INPUT_IMAGES:
+ try:
+ vlm_debug_dir = os.path.join(
+ output_folder,
+ "vlm_visualisations/vlm_input_images",
+ )
+ os.makedirs(vlm_debug_dir, exist_ok=True)
+ # Increment the number at the end of image_name before .png
+ # This converts zero-indexed input to one-indexed output
+ incremented_image_name = image_name
+ if image_name.endswith(".png"):
+ # Find the number pattern at the end before .png
+ # Matches patterns like: _0.png, _00.png, 0.png, 00.png, etc.
+ pattern = r"(\d+)(\.png)$"
+ match = re.search(pattern, image_name)
+ if match:
+ number_str = match.group(1)
+ number = int(number_str)
+ incremented_number = number + 1
+ # Preserve the same number of digits (padding with zeros if needed)
+ incremented_str = str(incremented_number).zfill(len(number_str))
+ incremented_image_name = re.sub(
+ pattern, lambda m: incremented_str + m.group(2), image_name
+ )
+ image_name_safe = safe_sanitize_text(incremented_image_name)
+ image_name_shortened = image_name_safe[:50]
+ filename = f"{image_name_shortened}_vlm_page_input_image.png"
+ filepath = os.path.join(vlm_debug_dir, filename)
+ _save_image_with_config_dpi(processed_image, filepath)
+ # print(f"Saved VLM input image to: {filepath}")
+ except Exception as save_error:
+ print(f"Warning: Could not save VLM input image: {save_error}")
+
+ # Create prompt that requests structured JSON output with bounding boxes
+ if detect_people_only:
+ # progress(0.5, "Detecting faces on page...")
+ # print("Detecting faces on page...")
+ prompt = full_page_ocr_people_vlm_prompt
+ task_type = "face"
+ elif detect_signatures_only:
+ # progress(0.5, "Detecting signatures on page...")
+ # print("Detecting signatures on page...")
+ prompt = full_page_ocr_signature_vlm_prompt
+ task_type = "signature"
+ else:
+ prompt = full_page_ocr_vlm_prompt
+ task_type = "ocr"
+
+ # Use the VLM to extract structured text
+ # Pass explicit model_default_* values for consistency with _inference_server_page_ocr_predict
+ extracted_text, vlm_input_tokens, vlm_output_tokens = (
+ extract_text_from_image_vlm(
+ text=prompt,
+ image=processed_image,
+ max_new_tokens=model_default_max_new_tokens,
+ temperature=model_default_temperature,
+ top_p=model_default_top_p,
+ min_p=model_default_min_p,
+ top_k=model_default_top_k,
+ repetition_penalty=model_default_repetition_penalty,
+ presence_penalty=model_default_presence_penalty,
+ seed=model_default_seed,
+ do_sample=model_default_do_sample,
+ )
+ )
+
+ # Save prompt and response to file
+ if extracted_text and isinstance(extracted_text, str) and output_folder:
+ try:
+ # Determine task suffix based on detection type
+ task_suffix = None
+ if detect_people_only:
+ task_suffix = "face"
+ elif detect_signatures_only:
+ task_suffix = "sig"
+
+ # Get model name for logging
+ vlm_model_name = (
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ if SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ else "VLM"
+ )
+
+ saved_file = save_vlm_prompt_response(
+ prompt=prompt,
+ response_text=extracted_text,
+ output_folder=output_folder,
+ model_choice=vlm_model_name,
+ image_name=image_name,
+ page_number=page_index_0,
+ temperature=model_default_temperature,
+ max_new_tokens=model_default_max_new_tokens,
+ top_p=model_default_top_p,
+ model_type="VLM",
+ task_suffix=task_suffix,
+ input_tokens=vlm_input_tokens,
+ output_tokens=vlm_output_tokens,
+ image_width=processed_image.size[0],
+ image_height=processed_image.size[1],
+ )
+ print(f"Saved VLM prompt/response to: {saved_file}")
+ except Exception as save_error:
+ print(f"Warning: Could not save VLM prompt/response: {save_error}")
+
+ # Check if extracted_text is None or empty
+ if extracted_text is None or not isinstance(extracted_text, str):
+ print(
+ "VLM page OCR warning: extract_text_from_image_vlm returned None or invalid type"
+ )
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ (
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ if SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ else "VLM"
+ ),
+ )
+
+ # Try to parse JSON from the response
+ # The VLM might return JSON wrapped in markdown code blocks or with extra text
+ extracted_text = extracted_text.strip()
+
+ # Fix malformed bounding box values in the JSON string before parsing
+ # This handles cases like: "bb": "779, 767, 874, 789],
+ _inf_implied = None
+ if detect_people_only:
+ _inf_implied = "[FACE]"
+ elif detect_signatures_only:
+ _inf_implied = "[SIGNATURE]"
+ extracted_text = _preprocess_vlm_ocr_json_string(
+ extracted_text, implied_label=_inf_implied
+ )
+
+ lines_data = None
+
+ # First, try to parse the entire response as JSON
+ try:
+ lines_data = json.loads(extracted_text)
+ except json.JSONDecodeError:
+ pass
+
+ # If that fails, try to extract JSON from markdown code blocks
+ if lines_data is None:
+ json_match = re.search(
+ r"```(?:json)?\s*(\[.*?\])", extracted_text, re.DOTALL
+ )
+ if json_match:
+ try:
+ lines_data = json.loads(json_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
+ # If that fails, try to find JSON array in the text (more lenient)
+ if lines_data is None:
+ # Try to find array starting with [ and ending with ]
+ # This is a simple approach - look for balanced brackets
+ start_idx = extracted_text.find("[")
+ if start_idx >= 0:
+ bracket_count = 0
+ end_idx = start_idx
+ for i in range(start_idx, len(extracted_text)):
+ if extracted_text[i] == "[":
+ bracket_count += 1
+ elif extracted_text[i] == "]":
+ bracket_count -= 1
+ if bracket_count == 0:
+ end_idx = i
+ break
+ if end_idx > start_idx:
+ try:
+ lines_data = json.loads(extracted_text[start_idx : end_idx + 1])
+ except json.JSONDecodeError:
+ pass
+
+ # If that fails, try parsing multiple JSON arrays (may span multiple lines)
+ # This handles cases where the response has multiple JSON arrays separated by newlines
+ # Each array might be on a single line or span multiple lines
+ if lines_data is None:
+ try:
+ combined_data = []
+ # Find all JSON arrays in the text (they may span multiple lines)
+ # This approach handles both single-line and multi-line arrays
+ text = extracted_text
+ while True:
+ start_idx = text.find("[")
+ if start_idx < 0:
+ break
+
+ # Find the matching closing bracket
+ bracket_count = 0
+ end_idx = start_idx
+ for i in range(start_idx, len(text)):
+ if text[i] == "[":
+ bracket_count += 1
+ elif text[i] == "]":
+ bracket_count -= 1
+ if bracket_count == 0:
+ end_idx = i
+ break
+
+ if end_idx > start_idx:
+ try:
+ array_str = text[start_idx : end_idx + 1]
+ array_data = json.loads(array_str)
+ if isinstance(array_data, list):
+ combined_data.extend(array_data)
+ except json.JSONDecodeError:
+ pass
+
+ # Move past this array to find the next one
+ text = text[end_idx + 1 :]
+
+ if combined_data:
+ lines_data = combined_data
+ except Exception:
+ pass
+
+ # If that fails, try to interpret the response as a Python literal (handles single-quoted lists/dicts)
+ if lines_data is None:
+ try:
+ python_data = ast.literal_eval(extracted_text)
+ if isinstance(python_data, list):
+ lines_data = python_data
+ except Exception:
+ pass
+
+ # If we still couldn't parse JSON, return empty results
+ if lines_data is None:
+ print("VLM page OCR error: Could not parse JSON response")
+ print(
+ f"Response text: {extracted_text[:500]}"
+ ) # Print first 500 chars for debugging
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                "VLM",
+            )
+
+ if isinstance(lines_data, dict):
+ lines_data = [lines_data]
+ elif not isinstance(lines_data, list):
+ print(f"VLM page OCR error: Expected list, got {type(lines_data)}")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                "VLM",
+            )
+
+ if SAVE_VLM_INPUT_IMAGES:
+ plot_text_bounding_boxes(
+ processed_image,
+ extracted_text,
+ image_name=image_name,
+ image_folder="vlm_visualisations",
+ output_folder=output_folder,
+ task_type=task_type,
+ )
+
+ # Store a copy of the processed image for debug visualization (before rescaling)
+ # IMPORTANT: This must be the EXACT same image that was sent to the API
+ processed_image_for_debug = (
+ processed_image.copy() if SAVE_VLM_INPUT_IMAGES else None
+ )
+
+ # Collect all valid bounding boxes before rescaling for debug visualization
+ pre_scaled_boxes = []
+
+ # Convert VLM results to expected format
+ result = {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ }
+
+ for line_item in lines_data:
+ parsed = _parse_vlm_line_item_to_geometry(
+ line_item,
+ _inf_implied,
+ "VLM page OCR",
+ )
+ if parsed is None:
+ continue
+ text, bbox_xyxy, confidence = parsed
+ x1, y1, x2, y2 = bbox_xyxy
+
+ # If coordinates are normalized (0 to normalised_coords_range), rescale directly to processed image dimensions
+ # This matches the ocr.ipynb approach: direct normalization to image size using /999 * dimension
+ # ocr.ipynb uses: abs_x1 = int(bounding_box["bbox_2d"][0]/999 * width)
+ # abs_y1 = int(bounding_box["bbox_2d"][1]/999 * height)
+ if normalised_coords_range is not None and normalised_coords_range > 0:
+ # Direct normalization: match ocr.ipynb approach exactly
+ # Formula: (coord / normalised_coords_range) * image_dimension
+ # Note: ocr.ipynb uses 999, but we allow configurable range
+ x1 = (x1 / float(normalised_coords_range)) * processed_width
+ y1 = (y1 / float(normalised_coords_range)) * processed_height
+ x2 = (x2 / float(normalised_coords_range)) * processed_width
+ y2 = (y2 / float(normalised_coords_range)) * processed_height
+
+ # Store bounding box after normalization (if applied) but before rescaling to original image space
+ if processed_image_for_debug is not None:
+ pre_scaled_boxes.append({"bbox": (x1, y1, x2, y2), "text": text})
+
+ # Step 3: Scale coordinates back to original image space if image was resized
+ if scale_x != 1.0 or scale_y != 1.0:
+ x1 = x1 * scale_x
+ y1 = y1 * scale_y
+ x2 = x2 * scale_x
+ y2 = y2 * scale_y
+
+ # Convert from (x1, y1, x2, y2) to (left, top, width, height)
+ left = int(round(x1))
+ top = int(round(y1))
+ width = int(round(x2 - x1))
+ height = int(round(y2 - y1))
+
+ # Ensure confidence is in valid range (0-100). VLM may return 0-1; scale to 0-100.
+ try:
+ confidence = float(confidence)
+ if 0 <= confidence <= 1:
+ confidence = confidence * 100
+ confidence = max(0, min(100, confidence)) # Clamp to 0-100
+ except (ValueError, TypeError):
+ confidence = 100 # Default if invalid
+
+ result["text"].append(
+ clean_unicode_text(text, preserve_international_scripts=True)
+ )
+ result["left"].append(left)
+ result["top"].append(top)
+ result["width"].append(width)
+ result["height"].append(height)
+ result["conf"].append(int(round(confidence)))
+ result["model"].append("VLM")
+
+ # Get model name for tracking
+ vlm_model_name = (
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ if SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ else "VLM"
+ )
+
+ return result, vlm_input_tokens, vlm_output_tokens, vlm_model_name
+
+ except Exception as e:
+ print(f"VLM page OCR error: {e}")
+ import traceback
+
+ print(f"VLM page OCR error traceback: {traceback.format_exc()}")
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ "VLM",
+ )
+
+
+def _inference_server_page_ocr_predict(
+ image: Image.Image,
+ image_name: str = "inference_server_page_ocr_input_image.png",
+ normalised_coords_range: Optional[int] = 999,
+ output_folder: str = OUTPUT_FOLDER,
+ detect_people_only: bool = False,
+ detect_signatures_only: bool = False,
+ progress: Optional[gr.Progress] = gr.Progress(),
+ model_name: str = None,
+ page_index_0: Optional[int] = None,
+) -> Tuple[Dict[str, List], int, int, str]:
+ """
+ Inference-server page-level OCR prediction that returns structured line-level results with bounding boxes.
+ Calls an external inference-server API instead of a local model.
+
+ Args:
+ image: PIL Image to process (full page)
+ image_name: Name of the image for debugging
+ normalised_coords_range: If set, bounding boxes are assumed to be in normalized coordinates
+ from 0 to this value (e.g., 999 as used in the full-page VLM prompt). Coordinates will be rescaled to match the processed image size. If None, coordinates are assumed to be in absolute pixel coordinates.
+        output_folder: The folder where output images will be saved
+        detect_people_only: If True, use the face-detection prompt instead of general OCR
+        detect_signatures_only: If True, use the signature-detection prompt instead of general OCR
+
+    Returns:
+        Tuple of (ocr_dict, input_tokens, output_tokens, model_name), where ocr_dict has
+        'text', 'left', 'top', 'width', 'height', 'conf', 'model' keys matching the
+        format expected by perform_ocr
+ """
+ try:
+
+ def _empty_inference_server_page_result(
+ resolved_name: Optional[str] = None,
+ ) -> Tuple[Dict[str, List], int, int, str]:
+ """Always return (ocr_dict, in_tokens, out_tokens, model_name) for perform_ocr."""
+ nm = resolved_name
+ if nm is None or nm == "":
+ nm = (
+ DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ if DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ else None
+ )
+ if nm is None or nm == "":
+ nm = (
+ INFERENCE_SERVER_MODEL_NAME if INFERENCE_SERVER_MODEL_NAME else None
+ )
+ if nm is None or nm == "":
+ nm = "Inference Server"
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ nm,
+ )
+
+ # Validate image exists and is not None
+ if image is None:
+ print("Inference-server page OCR error: Image is None")
+ return _empty_inference_server_page_result()
+
+ # Validate image has valid size (at least 10x10 pixels)
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"Inference-server page OCR error: Image is too small ({width}x{height} pixels). Minimum size is 10x10."
+ )
+ return _empty_inference_server_page_result()
+ except Exception as size_error:
+ print(
+ f"Inference-server page OCR error: Could not get image size: {size_error}"
+ )
+ return _empty_inference_server_page_result()
+
+ # Ensure image is in RGB mode (convert if needed)
+ try:
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ width, height = image.size
+ except Exception as convert_error:
+ print(
+ f"Inference-server page OCR error: Could not convert image to RGB: {convert_error}"
+ )
+ return _empty_inference_server_page_result()
+
+ # Check and resize image if it exceeds maximum size or DPI limits
+ scale_x = 1.0
+ scale_y = 1.0
+ try:
+ original_width, original_height = image.size
+ processed_image = _prepare_image_for_vlm(image)
+ processed_width, processed_height = processed_image.size
+
+ # Use float division to avoid rounding errors
+ scale_x = (
+ float(original_width) / float(processed_width)
+ if processed_width > 0
+ else 1.0
+ )
+ scale_y = (
+ float(original_height) / float(processed_height)
+ if processed_height > 0
+ else 1.0
+ )
+
+ # Debug: print scale factors to verify
+ if scale_x != 1.0 or scale_y != 1.0:
+ print(f"Scale factors: x={scale_x:.6f}, y={scale_y:.6f}")
+ print(
+ f"Original: {original_width}x{original_height}, Processed: {processed_width}x{processed_height}"
+ )
+ except Exception as prep_error:
+ print(
+ f"Inference-server page OCR error: Could not prepare image for VLM: {prep_error}"
+ )
+ return _empty_inference_server_page_result()
+
+ # Save input image for debugging if environment variable is set
+ if SAVE_VLM_INPUT_IMAGES:
+ try:
+ vlm_debug_dir = os.path.join(
+ output_folder,
+ "inference_server_visualisations/vlm_input_images",
+ )
+ os.makedirs(vlm_debug_dir, exist_ok=True)
+ # Increment the number at the end of image_name before .png
+ # This converts zero-indexed input to one-indexed output
+ incremented_image_name = image_name
+ if image_name.endswith(".png"):
+ # Find the number pattern at the end before .png
+ # Matches patterns like: _0.png, _00.png, 0.png, 00.png, etc.
+ pattern = r"(\d+)(\.png)$"
+ match = re.search(pattern, image_name)
+ if match:
+ number_str = match.group(1)
+ number = int(number_str)
+ incremented_number = number + 1
+ # Preserve the same number of digits (padding with zeros if needed)
+ incremented_str = str(incremented_number).zfill(len(number_str))
+ incremented_image_name = re.sub(
+ pattern, lambda m: incremented_str + m.group(2), image_name
+ )
+ image_name_safe = safe_sanitize_text(incremented_image_name)
+ image_name_shortened = image_name_safe[:50]
+ filename = (
+ f"{image_name_shortened}_inference_server_page_input_image.png"
+ )
+ filepath = os.path.join(vlm_debug_dir, filename)
+ print(f"Saving inference-server input image to: {filename}")
+ _save_image_with_config_dpi(processed_image, filepath)
+ # print(f"Saved VLM input image to: {filepath}")
+ except Exception as save_error:
+ print(f"Warning: Could not save VLM input image: {save_error}")
+
+ # Create prompt that requests structured JSON output with bounding boxes
+ if detect_people_only:
+ # progress(0.5, "Detecting faces on page...")
+ # print("Detecting faces on page...")
+ prompt = full_page_ocr_people_vlm_prompt
+ task_type = "face"
+ elif detect_signatures_only:
+ # progress(0.5, "Detecting signatures on page...")
+ # print("Detecting signatures on page...")
+ prompt = full_page_ocr_signature_vlm_prompt
+ task_type = "signature"
+ else:
+ prompt = full_page_ocr_vlm_prompt
+ task_type = "ocr"
+
+ # Use the inference-server API to extract structured text
+        # Note: processed_width and processed_height were already captured just after
+        # _prepare_image_for_vlm above, so we use those values for normalization
+ # Determine model_name: use provided parameter, then DEFAULT_INFERENCE_SERVER_VLM_MODEL, then INFERENCE_SERVER_MODEL_NAME
+ final_model_name = model_name
+ if final_model_name is None or final_model_name == "":
+ final_model_name = (
+ DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ if DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ else None
+ )
+ if final_model_name is None or final_model_name == "":
+ final_model_name = (
+ INFERENCE_SERVER_MODEL_NAME if INFERENCE_SERVER_MODEL_NAME else None
+ )
+
+ (
+ extracted_text,
+ vlm_input_tokens,
+ vlm_output_tokens,
+ vlm_sent_w,
+ vlm_sent_h,
+ ) = _call_inference_server_vlm_api(
+ image=processed_image,
+ prompt=prompt,
+ model_name=final_model_name,
+ max_new_tokens=model_default_max_new_tokens,
+ temperature=None,
+ top_p=None,
+ top_k=None,
+ repetition_penalty=None,
+ seed=None,
+ do_sample=model_default_do_sample,
+ min_p=None,
+ presence_penalty=None,
+ use_llama_swap=USE_LLAMA_SWAP,
+ )
+
+ # Save prompt and response to file
+ if extracted_text and isinstance(extracted_text, str) and output_folder:
+ try:
+ # Determine task suffix based on detection type
+ task_suffix = None
+ if detect_people_only:
+ task_suffix = "face"
+ elif detect_signatures_only:
+ task_suffix = "sig"
+
+ saved_file = save_vlm_prompt_response(
+ prompt=prompt,
+ response_text=extracted_text,
+ output_folder=output_folder,
+ model_choice=final_model_name or "unknown",
+ image_name=image_name,
+ page_number=page_index_0,
+ temperature=model_default_temperature,
+ max_new_tokens=model_default_max_new_tokens,
+ top_p=model_default_top_p,
+ model_type="Inference Server",
+ task_suffix=task_suffix,
+ input_tokens=vlm_input_tokens,
+ output_tokens=vlm_output_tokens,
+ image_width=vlm_sent_w,
+ image_height=vlm_sent_h,
+ )
+ print(f"Saved inference-server VLM prompt/response to: {saved_file}")
+ except Exception as save_error:
+ print(
+ f"Warning: Could not save inference-server VLM prompt/response: {save_error}"
+ )
+
+ # Check if extracted_text is None or empty
+ if extracted_text is None or not isinstance(extracted_text, str):
+ print(
+ "Inference-server page OCR warning: API returned None or invalid type"
+ )
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ final_model_name or "Inference Server",
+ )
+
+ # Try to parse JSON from the response
+ # The API might return JSON wrapped in markdown code blocks or with extra text
+ extracted_text = extracted_text.strip()
+
+ # Fix malformed bounding box values in the JSON string before parsing
+ # This handles cases like: "bb": "779, 767, 874, 789],
+ _inf_server_implied = None
+ if detect_people_only:
+ _inf_server_implied = "[FACE]"
+ elif detect_signatures_only:
+ _inf_server_implied = "[SIGNATURE]"
+ extracted_text = _preprocess_vlm_ocr_json_string(
+ extracted_text, implied_label=_inf_server_implied
+ )
+
+ lines_data = None
+
+ # First, try to parse the entire response as JSON
+ try:
+ lines_data = json.loads(extracted_text)
+ except json.JSONDecodeError:
+ pass
+
+ # If that fails, try to extract JSON from markdown code blocks
+ if lines_data is None:
+ json_match = re.search(
+ r"```(?:json)?\s*(\[.*?\])", extracted_text, re.DOTALL
+ )
+ if json_match:
+ try:
+ lines_data = json.loads(json_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
+ # If that fails, try to find JSON array in the text (more lenient)
+ if lines_data is None:
+ # Try to find array starting with [ and ending with ]
+ start_idx = extracted_text.find("[")
+ if start_idx >= 0:
+ bracket_count = 0
+ end_idx = start_idx
+ for i in range(start_idx, len(extracted_text)):
+ if extracted_text[i] == "[":
+ bracket_count += 1
+ elif extracted_text[i] == "]":
+ bracket_count -= 1
+ if bracket_count == 0:
+ end_idx = i
+ break
+ if end_idx > start_idx:
+ try:
+ lines_data = json.loads(extracted_text[start_idx : end_idx + 1])
+ except json.JSONDecodeError:
+ pass
+
+ # If that fails, try parsing multiple JSON arrays (may span multiple lines)
+ # This handles cases where the response has multiple JSON arrays separated by newlines
+ # Each array might be on a single line or span multiple lines
+ if lines_data is None:
+ try:
+ combined_data = []
+ # Find all JSON arrays in the text (they may span multiple lines)
+ # This approach handles both single-line and multi-line arrays
+ text = extracted_text
+ while True:
+ start_idx = text.find("[")
+ if start_idx < 0:
+ break
+
+ # Find the matching closing bracket
+ bracket_count = 0
+ end_idx = start_idx
+ for i in range(start_idx, len(text)):
+ if text[i] == "[":
+ bracket_count += 1
+ elif text[i] == "]":
+ bracket_count -= 1
+ if bracket_count == 0:
+ end_idx = i
+ break
+
+ if end_idx > start_idx:
+ try:
+ array_str = text[start_idx : end_idx + 1]
+ array_data = json.loads(array_str)
+ if isinstance(array_data, list):
+ combined_data.extend(array_data)
+ except json.JSONDecodeError:
+ pass
+
+ # Move past this array to find the next one
+ text = text[end_idx + 1 :]
+
+ if combined_data:
+ lines_data = combined_data
+ except Exception:
+ pass
+
+ # If that fails, try to interpret the response as a Python literal (handles single-quoted lists/dicts)
+ if lines_data is None:
+ try:
+ python_data = ast.literal_eval(extracted_text)
+ if isinstance(python_data, list):
+ lines_data = python_data
+ except Exception:
+ pass
+
+ # Final attempt: try to parse as-is
+ if lines_data is None:
+ try:
+ lines_data = json.loads(extracted_text)
+ except json.JSONDecodeError:
+ pass
+
+ # If we still couldn't parse JSON, return empty results
+ if lines_data is None:
+ print("Inference-server page OCR error: Could not parse JSON response")
+ print(
+ f"Response text: {extracted_text[:500]}"
+ ) # Print first 500 chars for debugging
+ return _empty_inference_server_page_result(final_model_name)
+
+ if isinstance(lines_data, dict):
+ lines_data = [lines_data]
+ elif not isinstance(lines_data, list):
+ print(
+ f"Inference-server page OCR error: Expected list, got {type(lines_data)}"
+ )
+ return _empty_inference_server_page_result(final_model_name)
+
+ if SAVE_VLM_INPUT_IMAGES:
+ plot_text_bounding_boxes(
+ processed_image,
+ extracted_text,
+ image_name=image_name,
+ image_folder="inference_server_visualisations",
+ output_folder=output_folder,
+ task_type=task_type,
+ )
+
+ # Store a copy of the processed image for debug visualization (before rescaling)
+ # IMPORTANT: This must be the EXACT same image that was sent to the API
+ processed_image_for_debug = (
+ processed_image.copy() if SAVE_VLM_INPUT_IMAGES else None
+ )
+
+ # Collect all valid bounding boxes before rescaling for debug visualization
+ pre_scaled_boxes = []
+
+ # Convert API results to expected format
+ result = {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ }
+
+ for line_item in lines_data:
+ parsed = _parse_vlm_line_item_to_geometry(
+ line_item,
+ _inf_server_implied,
+ "Inference-server page OCR",
+ )
+ if parsed is None:
+ continue
+ text, bbox_xyxy, confidence = parsed
+ x1, y1, x2, y2 = bbox_xyxy
+
+ # If coordinates are normalized (0 to normalised_coords_range), rescale directly to processed image dimensions
+ if normalised_coords_range is not None and normalised_coords_range > 0:
+ # Formula: (coord / normalised_coords_range) * image_dimension (e.g. 999 from full-page VLM prompt)
+ x1 = (x1 / float(normalised_coords_range)) * processed_width
+ y1 = (y1 / float(normalised_coords_range)) * processed_height
+ x2 = (x2 / float(normalised_coords_range)) * processed_width
+ y2 = (y2 / float(normalised_coords_range)) * processed_height
+
+ # Store bounding box after normalization (if applied) but before rescaling to original image space
+ if processed_image_for_debug is not None:
+ pre_scaled_boxes.append({"bbox": (x1, y1, x2, y2), "text": text})
+
+ # Step 3: Scale coordinates back to original image space if image was resized
+ if scale_x != 1.0 or scale_y != 1.0:
+ x1 = x1 * scale_x
+ y1 = y1 * scale_y
+ x2 = x2 * scale_x
+ y2 = y2 * scale_y
+
+ # Convert from (x1, y1, x2, y2) to (left, top, width, height)
+ left = int(round(x1))
+ top = int(round(y1))
+ width = int(round(x2 - x1))
+ height = int(round(y2 - y1))
+
+ # Ensure confidence is in valid range (0-100). VLM may return 0-1; scale to 0-100.
+ try:
+ confidence = float(confidence)
+ if 0 <= confidence <= 1:
+ confidence = confidence * 100
+ confidence = max(0, min(100, confidence)) # Clamp to 0-100
+ except (ValueError, TypeError):
+ confidence = 50 # Default if invalid
+
+ result["text"].append(
+ clean_unicode_text(text, preserve_international_scripts=True)
+ )
+ result["left"].append(left)
+ result["top"].append(top)
+ result["width"].append(width)
+ result["height"].append(height)
+ result["conf"].append(int(round(confidence)))
+ result["model"].append("Inference Server")
+
+ # Get model name for tracking
+ vlm_model_name = final_model_name or "Inference Server"
+
+ return result, vlm_input_tokens, vlm_output_tokens, vlm_model_name
+
+ except Exception as e:
+ print(f"Inference-server page OCR error: {e}")
+ import traceback
+
+ print(f"Inference-server page OCR error traceback: {traceback.format_exc()}")
+ # Determine model name for error case
+ error_model_name = (
+ model_name
+ or DEFAULT_INFERENCE_SERVER_VLM_MODEL
+ or INFERENCE_SERVER_MODEL_NAME
+ or "Inference Server"
+ )
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ error_model_name,
+ )
+
+
+def _parse_vlm_page_ocr_response(
+ extracted_text: str,
+ processed_image: Image.Image,
+ processed_width: int,
+ processed_height: int,
+ scale_x: float,
+ scale_y: float,
+ normalised_coords_range: Optional[int],
+ model_name: str = "Cloud VLM",
+ implied_label: Optional[str] = None,
+) -> Dict[str, List]:
+ """
+ Helper function to parse VLM page OCR response and convert to expected format.
+ Shared by all cloud VLM page OCR functions.
+
+ Args:
+ extracted_text: Raw text response from VLM
+ processed_image: The processed image that was sent to the VLM
+ processed_width: Width of processed image
+ processed_height: Height of processed image
+ scale_x: Scale factor for x coordinates (original/processed)
+ scale_y: Scale factor for y coordinates (original/processed)
+ normalised_coords_range: If set, bounding boxes are in normalized coordinates (0 to this value)
+ model_name: Name of the model for the 'model' field in results
+        implied_label: When set (e.g. "[FACE]" for the face pass), used to repair malformed JSON
+            where the model omits "text", and to fill missing text on dict entries that only have bbox.
+
+ Returns:
+ Dictionary with 'text', 'left', 'top', 'width', 'height', 'conf', 'model' keys
+ """
+ # Use actual image dimensions to ensure consistency (in case image was modified)
+ actual_width, actual_height = processed_image.size
+ if actual_width != processed_width or actual_height != processed_height:
+ print(
+ f"{model_name} page OCR warning: Image dimensions mismatch. "
+ f"Expected {processed_width}x{processed_height}, got {actual_width}x{actual_height}. "
+ f"Using actual dimensions."
+ )
+ processed_width = actual_width
+ processed_height = actual_height
+
+ extracted_text = _preprocess_vlm_ocr_json_string(
+ extracted_text, implied_label=implied_label
+ )
+
+ lines_data = None
+
+ # Try various JSON parsing strategies (same as _vlm_page_ocr_predict)
+ try:
+ lines_data = json.loads(extracted_text)
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None:
+ json_match = re.search(r"```(?:json)?\s*(\[.*?\])", extracted_text, re.DOTALL)
+ if json_match:
+ try:
+ lines_data = json.loads(json_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None:
+ start_idx = extracted_text.find("[")
+ if start_idx >= 0:
+ bracket_count = 0
+ end_idx = start_idx
+ for i in range(start_idx, len(extracted_text)):
+ if extracted_text[i] == "[":
+ bracket_count += 1
+ elif extracted_text[i] == "]":
+ bracket_count -= 1
+ if bracket_count == 0:
+ end_idx = i
+ break
+ if end_idx > start_idx:
+ try:
+ lines_data = json.loads(extracted_text[start_idx : end_idx + 1])
+ except json.JSONDecodeError:
+ pass
+
+ if lines_data is None:
+ try:
+ python_data = ast.literal_eval(extracted_text)
+ if isinstance(python_data, list):
+ lines_data = python_data
+ elif isinstance(python_data, dict):
+ lines_data = python_data
+ except Exception:
+ pass
+
+ if lines_data is None:
+ print(f"{model_name} page OCR error: Could not parse JSON response")
+ print(f"Response text: {extracted_text[:500]}")
+ return {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ }
+
+ if isinstance(lines_data, dict):
+ lines_data = [lines_data]
+ elif not isinstance(lines_data, list):
+ print(f"{model_name} page OCR error: Expected list, got {type(lines_data)}")
+ return {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ }
+
+ # Convert VLM results to expected format
+ result = {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ }
+
+ for line_item in lines_data:
+ parsed = _parse_vlm_line_item_to_geometry(
+ line_item,
+ implied_label,
+ f"{model_name} page OCR",
+ )
+ if parsed is None:
+ continue
+ text, bbox_xyxy, confidence = parsed
+ x1, y1, x2, y2 = bbox_xyxy
+
+ if normalised_coords_range is not None and normalised_coords_range > 0:
+ x1 = (x1 / float(normalised_coords_range)) * processed_width
+ y1 = (y1 / float(normalised_coords_range)) * processed_height
+ x2 = (x2 / float(normalised_coords_range)) * processed_width
+ y2 = (y2 / float(normalised_coords_range)) * processed_height
+
+ if scale_x != 1.0 or scale_y != 1.0:
+ x1 = x1 * scale_x
+ y1 = y1 * scale_y
+ x2 = x2 * scale_x
+ y2 = y2 * scale_y
+
+ left = int(round(x1))
+ top = int(round(y1))
+ width = int(round(x2 - x1))
+ height = int(round(y2 - y1))
+
+ try:
+ confidence = float(confidence)
+ if 0 <= confidence <= 1:
+ confidence = confidence * 100
+ confidence = max(0, min(100, confidence))
+ except (ValueError, TypeError):
+ confidence = 100
+
+ result["text"].append(
+ clean_unicode_text(text, preserve_international_scripts=True)
+ )
+ result["left"].append(left)
+ result["top"].append(top)
+ result["width"].append(width)
+ result["height"].append(height)
+ result["conf"].append(int(round(confidence)))
+ result["model"].append(model_name)
+
+ return result
+
+
+def _bedrock_page_ocr_predict(
+ image: Image.Image,
+ image_name: str = "bedrock_page_ocr_input_image.png",
+ normalised_coords_range: Optional[int] = None,
+ output_folder: str = OUTPUT_FOLDER,
+ detect_people_only: bool = False,
+ detect_signatures_only: bool = False,
+ progress: Optional[gr.Progress] = gr.Progress(),
+ model_choice: str = None,
+ bedrock_runtime=None,
+ page_index_0: Optional[int] = None,
+) -> Tuple[Dict[str, List], int, int, str]:
+ """
+ Bedrock page-level OCR prediction that returns structured line-level results with bounding boxes.
+
+ Args:
+ image: PIL Image to process (full page)
+ image_name: Name of the image for debugging
+ normalised_coords_range: If set, bounding boxes are assumed to be in normalized coordinates
+ from 0 to this value (e.g., 999 as used in the full-page VLM prompt). Coordinates will be rescaled to match the processed image size. If None, coordinates are assumed to be in absolute pixel coordinates.
+ output_folder: The folder where output images will be saved
+ detect_people_only: If True, only detect people in images
+ detect_signatures_only: If True, only detect signatures in images
+ progress: Gradio progress tracker
+ model_choice: Bedrock model ID
+        bedrock_runtime: boto3 Bedrock runtime client
+        page_index_0: Zero-based page index, used when saving the prompt/response
+
+    Returns:
+        Tuple of (results dict with 'text', 'left', 'top', 'width', 'height', 'conf', 'model'
+        keys, input token count, output token count, model name used)
+ """
+ try:
+ if image is None:
+ print("Bedrock page OCR error: Image is None")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Bedrock",
+            )
+
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"Bedrock page OCR error: Image is too small ({width}x{height} pixels)."
+ )
+                return (
+                    {
+                        "text": [],
+                        "left": [],
+                        "top": [],
+                        "width": [],
+                        "height": [],
+                        "conf": [],
+                        "model": [],
+                    },
+                    0,
+                    0,
+                    model_choice or "Bedrock",
+                )
+ except Exception as size_error:
+ print(f"Bedrock page OCR error: Could not get image size: {size_error}")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Bedrock",
+            )
+
+ try:
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ except Exception as convert_error:
+ print(
+ f"Bedrock page OCR error: Could not convert image to RGB: {convert_error}"
+ )
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Bedrock",
+            )
+
+ # Same preparation as other full-page VLMs: min/max pixels + DPI bounds; Bedrock
+ # gets a higher max pixel budget via ocr_method. scale_x/scale_y map boxes to original.
+ scale_x = 1.0
+ scale_y = 1.0
+ try:
+ original_width, original_height = image.size
+ processed_image = _prepare_image_for_vlm(
+ image,
+ ocr_method="AWS Bedrock VLM page OCR",
+ hybrid_vlm=False,
+ )
+ processed_width, processed_height = processed_image.size
+ scale_x = (
+ float(original_width) / float(processed_width)
+ if processed_width > 0
+ else 1.0
+ )
+ scale_y = (
+ float(original_height) / float(processed_height)
+ if processed_height > 0
+ else 1.0
+ )
+ except Exception as prep_error:
+ print(f"Bedrock page OCR error: Could not prepare image: {prep_error}")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Bedrock",
+            )
+
+ # Create prompt
+ if detect_people_only:
+ # progress(0.5, "Detecting faces on page...")
+ # print("Detecting faces on page...")
+ prompt = full_page_ocr_people_vlm_prompt
+ elif detect_signatures_only:
+ # progress(0.5, "Detecting signatures on page...")
+ # print("Detecting signatures on page...")
+ prompt = full_page_ocr_signature_vlm_prompt
+ else:
+ prompt = full_page_ocr_vlm_prompt
+
+ # Save input image for debugging if environment variable is set
+ if SAVE_VLM_INPUT_IMAGES:
+ try:
+ vlm_debug_dir = os.path.join(
+ output_folder,
+ "bedrock_visualisations/vlm_input_images",
+ )
+ os.makedirs(vlm_debug_dir, exist_ok=True)
+ # Increment the number at the end of image_name before .png
+ # This converts zero-indexed input to one-indexed output
+ incremented_image_name = image_name
+ if image_name.endswith(".png"):
+ # Find the number pattern at the end before .png
+ # Matches patterns like: _0.png, _00.png, 0.png, 00.png, etc.
+ pattern = r"(\d+)(\.png)$"
+ match = re.search(pattern, image_name)
+ if match:
+ number_str = match.group(1)
+ number = int(number_str)
+ incremented_number = number + 1
+ # Preserve the same number of digits (padding with zeros if needed)
+ incremented_str = str(incremented_number).zfill(len(number_str))
+ incremented_image_name = re.sub(
+ pattern, lambda m: incremented_str + m.group(2), image_name
+ )
+ image_name_safe = safe_sanitize_text(incremented_image_name)
+
+ # Extract page number from image_name if present (e.g., "file_1.png" -> "1")
+ # Look for patterns like "_1.png", "_01.png", "_page_1.png", etc.
+ page_number = None
+ page_patterns = [
+ r"_page_(\d+)\.png$", # _page_1.png
+ r"_(\d+)\.png$", # _1.png, _01.png
+ r"page_(\d+)\.png$", # page_1.png
+ ]
+ for pattern in page_patterns:
+ match = re.search(pattern, incremented_image_name, re.IGNORECASE)
+ if match:
+ page_number = match.group(1)
+ break
+
+ # Use longer name limit to preserve page numbers, but still truncate if very long
+ # Remove .png extension before truncating to preserve more of the name
+ image_name_no_ext = image_name_safe.replace(".png", "").replace(
+ ".PNG", ""
+ )
+ if len(image_name_no_ext) > 100:
+ image_name_shortened = image_name_no_ext[:100]
+ else:
+ image_name_shortened = image_name_no_ext
+
+ # Construct filename with page number if found
+ if page_number:
+ filename = (
+ f"{image_name_shortened}_page_{page_number}_bedrock_input.png"
+ )
+ else:
+ filename = f"{image_name_shortened}_bedrock_page_input_image.png"
+
+ filepath = os.path.join(vlm_debug_dir, filename)
+ print(f"Saving Bedrock VLM input image to: {filename}")
+ _save_image_with_config_dpi(processed_image, filepath)
+ # print(f"Saved Bedrock VLM input image to: {filepath}")
+ except Exception as save_error:
+ print(f"Warning: Could not save Bedrock VLM input image: {save_error}")
+
+ # Call Bedrock API
+ (
+ extracted_text,
+ vlm_input_tokens,
+ vlm_output_tokens,
+ vlm_sent_w,
+ vlm_sent_h,
+ ) = _call_bedrock_vlm_api(
+ image=processed_image,
+ prompt=prompt,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ max_new_tokens=model_default_max_new_tokens,
+ temperature=model_default_temperature,
+ top_p=model_default_top_p,
+ )
+
+ # Save prompt and response to file (including when response is empty, e.g. no faces/signatures)
+ if extracted_text is not None and output_folder:
+ try:
+ # Determine task suffix based on detection type
+ task_suffix = None
+ if detect_people_only:
+ task_suffix = "face"
+ elif detect_signatures_only:
+ task_suffix = "sig"
+
+ response_str = (
+ extracted_text
+ if isinstance(extracted_text, str)
+ else str(extracted_text or "")
+ )
+ saved_file = save_vlm_prompt_response(
+ prompt=prompt,
+ response_text=response_str,
+ output_folder=output_folder,
+ model_choice=model_choice or "unknown",
+ image_name=image_name,
+ page_number=page_index_0,
+ temperature=model_default_temperature,
+ max_new_tokens=model_default_max_new_tokens,
+ top_p=model_default_top_p,
+ model_type="Bedrock",
+ task_suffix=task_suffix,
+ input_tokens=vlm_input_tokens,
+ output_tokens=vlm_output_tokens,
+ image_width=vlm_sent_w,
+ image_height=vlm_sent_h,
+ )
+ print(f"Saved Bedrock VLM prompt/response to: {saved_file}")
+ except Exception as save_error:
+ print(
+ f"Warning: Could not save Bedrock VLM prompt/response: {save_error}"
+ )
+
+ if extracted_text is None or not isinstance(extracted_text, str):
+ print("Bedrock page OCR warning: No valid response")
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ model_choice or "Bedrock",
+ )
+
+ _bedrock_implied_label = None
+ if detect_people_only:
+ _bedrock_implied_label = "[FACE]"
+ elif detect_signatures_only:
+ _bedrock_implied_label = "[SIGNATURE]"
+
+ # Plot bounding boxes from VLM response if enabled
+ if SAVE_VLM_INPUT_IMAGES:
+ try:
+ # Determine task type based on prompt
+ task_type = "ocr"
+ if detect_people_only:
+ task_type = "face"
+ elif detect_signatures_only:
+ task_type = "signature"
+
+ _viz_json = _preprocess_vlm_ocr_json_string(
+ extracted_text, implied_label=_bedrock_implied_label
+ )
+ plot_text_bounding_boxes(
+ processed_image,
+ _viz_json,
+ image_name=image_name,
+ image_folder="bedrock_visualisations",
+ output_folder=output_folder,
+ task_type=task_type,
+ )
+ except Exception as plot_error:
+ print(
+ f"Warning: Could not plot Bedrock VLM bounding boxes: {plot_error}"
+ )
+
+ # Parse response using shared helper
+ result = _parse_vlm_page_ocr_response(
+ extracted_text=extracted_text,
+ processed_image=processed_image,
+ processed_width=processed_width,
+ processed_height=processed_height,
+ scale_x=scale_x,
+ scale_y=scale_y,
+ normalised_coords_range=normalised_coords_range,
+ model_name="Bedrock",
+ implied_label=_bedrock_implied_label,
+ )
+
+ # Get model name for tracking
+ vlm_model_name = model_choice or "Bedrock"
+
+ return result, vlm_input_tokens, vlm_output_tokens, vlm_model_name
+
+ except Exception as e:
+ print(f"Bedrock page OCR error: {e}")
+ import traceback
+
+ print(f"Bedrock page OCR error traceback: {traceback.format_exc()}")
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ model_choice or "Bedrock",
+ )
+
+
+def _gemini_page_ocr_predict(
+ image: Image.Image,
+ image_name: str = "gemini_page_ocr_input_image.png",
+ normalised_coords_range: Optional[int] = None,
+ output_folder: str = OUTPUT_FOLDER,
+ detect_people_only: bool = False,
+ detect_signatures_only: bool = False,
+ progress: Optional[gr.Progress] = gr.Progress(),
+ model_choice: str = None,
+ client=None,
+ config=None,
+ page_index_0: Optional[int] = None,
+) -> Tuple[Dict[str, List], int, int, str]:
+ """
+ Gemini page-level OCR prediction that returns structured line-level results with bounding boxes.
+
+ Args:
+ image: PIL Image to process (full page)
+ image_name: Name of the image for debugging
+ normalised_coords_range: If set, bounding boxes are assumed to be in normalized coordinates
+ output_folder: The folder where output images will be saved
+ detect_people_only: If True, only detect people in images
+ detect_signatures_only: If True, only detect signatures in images
+ progress: Gradio progress tracker
+ model_choice: Gemini model name
+ client: Gemini ai.Client instance
+        config: Gemini types.GenerateContentConfig instance
+        page_index_0: Zero-based page index, used when saving the prompt/response
+
+    Returns:
+        Tuple of (results dict with 'text', 'left', 'top', 'width', 'height', 'conf', 'model'
+        keys, input token count, output token count, model name used)
+ """
+ try:
+ if image is None:
+ print("Gemini page OCR error: Image is None")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Gemini",
+            )
+
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"Gemini page OCR error: Image is too small ({width}x{height} pixels)."
+ )
+                return (
+                    {
+                        "text": [],
+                        "left": [],
+                        "top": [],
+                        "width": [],
+                        "height": [],
+                        "conf": [],
+                        "model": [],
+                    },
+                    0,
+                    0,
+                    model_choice or "Gemini",
+                )
+ except Exception as size_error:
+ print(f"Gemini page OCR error: Could not get image size: {size_error}")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Gemini",
+            )
+
+ try:
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ except Exception as convert_error:
+ print(
+ f"Gemini page OCR error: Could not convert image to RGB: {convert_error}"
+ )
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Gemini",
+            )
+
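+        # Prepare (resize/limit) the page image for the VLM; scale_x/scale_y map the
+        # returned bounding boxes back to the original image dimensions.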
+ scale_x = 1.0
+ scale_y = 1.0
+ try:
+ original_width, original_height = image.size
+ processed_image = _prepare_image_for_vlm(image)
+ processed_width, processed_height = processed_image.size
+
+ scale_x = (
+ float(original_width) / float(processed_width)
+ if processed_width > 0
+ else 1.0
+ )
+ scale_y = (
+ float(original_height) / float(processed_height)
+ if processed_height > 0
+ else 1.0
+ )
+ except Exception as prep_error:
+ print(f"Gemini page OCR error: Could not prepare image: {prep_error}")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Gemini",
+            )
+
+ # Create prompt
+ if detect_people_only:
+ # progress(0.5, "Detecting faces on page...")
+ # print("Detecting faces on page...")
+ prompt = full_page_ocr_people_vlm_prompt
+ elif detect_signatures_only:
+ # progress(0.5, "Detecting signatures on page...")
+ # print("Detecting signatures on page...")
+ prompt = full_page_ocr_signature_vlm_prompt
+ else:
+ prompt = full_page_ocr_vlm_prompt
+
+ # Call Gemini API
+ extracted_text, vlm_input_tokens, vlm_output_tokens = _call_gemini_vlm_api(
+ image=processed_image,
+ prompt=prompt,
+ client=client,
+ config=config,
+ model_choice=model_choice,
+ max_new_tokens=model_default_max_new_tokens,
+ temperature=model_default_temperature,
+ )
+
+ # Save prompt and response to file (including when response is empty, e.g. no faces/signatures)
+ if extracted_text is not None and output_folder:
+ try:
+ # Determine task suffix based on detection type
+ task_suffix = None
+ if detect_people_only:
+ task_suffix = "face"
+ elif detect_signatures_only:
+ task_suffix = "sig"
+
+ response_str = (
+ extracted_text
+ if isinstance(extracted_text, str)
+ else str(extracted_text or "")
+ )
+ saved_file = save_vlm_prompt_response(
+ prompt=prompt,
+ response_text=response_str,
+ output_folder=output_folder,
+ model_choice=model_choice or "unknown",
+ image_name=image_name,
+ page_number=page_index_0,
+ temperature=model_default_temperature,
+ max_new_tokens=model_default_max_new_tokens,
+ model_type="Gemini",
+ task_suffix=task_suffix,
+ input_tokens=vlm_input_tokens,
+ output_tokens=vlm_output_tokens,
+ image_width=processed_image.size[0],
+ image_height=processed_image.size[1],
+ )
+ print(f"Saved Gemini VLM prompt/response to: {saved_file}")
+ except Exception as save_error:
+ print(
+ f"Warning: Could not save Gemini VLM prompt/response: {save_error}"
+ )
+
+ if extracted_text is None or not isinstance(extracted_text, str):
+ print("Gemini page OCR warning: No valid response")
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ model_choice or "Gemini",
+ )
+
+ _gem_implied = None
+ if detect_people_only:
+ _gem_implied = "[FACE]"
+ elif detect_signatures_only:
+ _gem_implied = "[SIGNATURE]"
+
+ # Parse response using shared helper
+ result = _parse_vlm_page_ocr_response(
+ extracted_text=extracted_text,
+ processed_image=processed_image,
+ processed_width=processed_width,
+ processed_height=processed_height,
+ scale_x=scale_x,
+ scale_y=scale_y,
+ normalised_coords_range=normalised_coords_range,
+ model_name="Gemini",
+ implied_label=_gem_implied,
+ )
+
+ # Get model name for tracking
+ vlm_model_name = model_choice or "Gemini"
+
+ return result, vlm_input_tokens, vlm_output_tokens, vlm_model_name
+
+ except Exception as e:
+ print(f"Gemini page OCR error: {e}")
+ import traceback
+
+ print(f"Gemini page OCR error traceback: {traceback.format_exc()}")
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ model_choice or "Gemini",
+ )
+
+
+def _azure_openai_page_ocr_predict(
+ image: Image.Image,
+ image_name: str = "azure_openai_page_ocr_input_image.png",
+ normalised_coords_range: Optional[int] = None,
+ output_folder: str = OUTPUT_FOLDER,
+ detect_people_only: bool = False,
+ detect_signatures_only: bool = False,
+ progress: Optional[gr.Progress] = gr.Progress(),
+ model_choice: str = None,
+ client=None,
+ page_index_0: Optional[int] = None,
+) -> Tuple[Dict[str, List], int, int, str]:
+ """
+ Azure/OpenAI page-level OCR prediction that returns structured line-level results with bounding boxes.
+
+ Args:
+ image: PIL Image to process (full page)
+ image_name: Name of the image for debugging
+ normalised_coords_range: If set, bounding boxes are assumed to be in normalized coordinates
+ output_folder: The folder where output images will be saved
+ detect_people_only: If True, only detect people in images
+ detect_signatures_only: If True, only detect signatures in images
+ progress: Gradio progress tracker
+ model_choice: Model name (e.g., "gpt-4o", "gpt-4-vision-preview")
+        client: OpenAI client instance
+        page_index_0: Zero-based page index, used when saving the prompt/response
+
+    Returns:
+        Tuple of (results dict with 'text', 'left', 'top', 'width', 'height', 'conf', 'model'
+        keys, input token count, output token count, model name used)
+ """
+ try:
+ if image is None:
+ print("Azure/OpenAI page OCR error: Image is None")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Azure/OpenAI",
+            )
+
+ try:
+ width, height = image.size
+ if width < 10 or height < 10:
+ print(
+ f"Azure/OpenAI page OCR error: Image is too small ({width}x{height} pixels)."
+ )
+                return (
+                    {
+                        "text": [],
+                        "left": [],
+                        "top": [],
+                        "width": [],
+                        "height": [],
+                        "conf": [],
+                        "model": [],
+                    },
+                    0,
+                    0,
+                    model_choice or "Azure/OpenAI",
+                )
+ except Exception as size_error:
+ print(
+ f"Azure/OpenAI page OCR error: Could not get image size: {size_error}"
+ )
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Azure/OpenAI",
+            )
+
+ try:
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ except Exception as convert_error:
+ print(
+ f"Azure/OpenAI page OCR error: Could not convert image to RGB: {convert_error}"
+ )
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Azure/OpenAI",
+            )
+
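+        # Prepare (resize/limit) the page image for the VLM; scale_x/scale_y map the
+        # returned bounding boxes back to the original image dimensions.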
+ scale_x = 1.0
+ scale_y = 1.0
+ try:
+ original_width, original_height = image.size
+ processed_image = _prepare_image_for_vlm(image)
+ processed_width, processed_height = processed_image.size
+
+ scale_x = (
+ float(original_width) / float(processed_width)
+ if processed_width > 0
+ else 1.0
+ )
+ scale_y = (
+ float(original_height) / float(processed_height)
+ if processed_height > 0
+ else 1.0
+ )
+ except Exception as prep_error:
+ print(f"Azure/OpenAI page OCR error: Could not prepare image: {prep_error}")
+            return (
+                {
+                    "text": [],
+                    "left": [],
+                    "top": [],
+                    "width": [],
+                    "height": [],
+                    "conf": [],
+                    "model": [],
+                },
+                0,
+                0,
+                model_choice or "Azure/OpenAI",
+            )
+
+ # Create prompt
+ if detect_people_only:
+ # progress(0.5, "Detecting faces on page...")
+ # print("Detecting faces on page...")
+ prompt = full_page_ocr_people_vlm_prompt
+ elif detect_signatures_only:
+ # progress(0.5, "Detecting signatures on page...")
+ # print("Detecting signatures on page...")
+ prompt = full_page_ocr_signature_vlm_prompt
+ else:
+ prompt = full_page_ocr_vlm_prompt
+
+ # Call Azure/OpenAI API
+ extracted_text, vlm_input_tokens, vlm_output_tokens = (
+ _call_azure_openai_vlm_api(
+ image=processed_image,
+ prompt=prompt,
+ client=client,
+ model_choice=model_choice,
+ max_new_tokens=model_default_max_new_tokens,
+ temperature=model_default_temperature,
+ )
+ )
+
+ # Save prompt and response to file (including when response is empty, e.g. no faces/signatures)
+ if extracted_text is not None and output_folder:
+ try:
+ # Determine task suffix based on detection type
+ task_suffix = None
+ if detect_people_only:
+ task_suffix = "face"
+ elif detect_signatures_only:
+ task_suffix = "sig"
+
+ response_str = (
+ extracted_text
+ if isinstance(extracted_text, str)
+ else str(extracted_text or "")
+ )
+ saved_file = save_vlm_prompt_response(
+ prompt=prompt,
+ response_text=response_str,
+ output_folder=output_folder,
+ model_choice=model_choice or "unknown",
+ image_name=image_name,
+ page_number=page_index_0,
+ temperature=model_default_temperature,
+ max_new_tokens=model_default_max_new_tokens,
+ model_type="Azure/OpenAI",
+ task_suffix=task_suffix,
+ input_tokens=vlm_input_tokens,
+ output_tokens=vlm_output_tokens,
+ image_width=processed_image.size[0],
+ image_height=processed_image.size[1],
+ )
+ print(f"Saved Azure/OpenAI VLM prompt/response to: {saved_file}")
+ except Exception as save_error:
+ print(
+ f"Warning: Could not save Azure/OpenAI VLM prompt/response: {save_error}"
+ )
+
+ if extracted_text is None or not isinstance(extracted_text, str):
+ print("Azure/OpenAI page OCR warning: No valid response")
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ model_choice or "Azure/OpenAI",
+ )
+
+ _azure_implied = None
+ if detect_people_only:
+ _azure_implied = "[FACE]"
+ elif detect_signatures_only:
+ _azure_implied = "[SIGNATURE]"
+
+ # Parse response using shared helper
+ result = _parse_vlm_page_ocr_response(
+ extracted_text=extracted_text,
+ processed_image=processed_image,
+ processed_width=processed_width,
+ processed_height=processed_height,
+ scale_x=scale_x,
+ scale_y=scale_y,
+ normalised_coords_range=normalised_coords_range,
+ model_name="Azure/OpenAI",
+ implied_label=_azure_implied,
+ )
+
+ # Get model name for tracking
+ vlm_model_name = model_choice or "Azure/OpenAI"
+
+ return result, vlm_input_tokens, vlm_output_tokens, vlm_model_name
+
+ except Exception as e:
+ print(f"Azure/OpenAI page OCR error: {e}")
+ import traceback
+
+ print(f"Azure/OpenAI page OCR error traceback: {traceback.format_exc()}")
+ return (
+ {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ },
+ 0,
+ 0,
+ model_choice or "Azure/OpenAI",
+ )
+
+
+class CustomImageAnalyzerEngine:
+ def __init__(
+ self,
+ analyzer_engine: Optional[AnalyzerEngine] = None,
+ ocr_engine: str = "tesseract",
+ tesseract_config: Optional[str] = None,
+ paddle_kwargs: Optional[Dict[str, Any]] = None,
+ image_preprocessor: Optional[ImagePreprocessor] = None,
+ language: Optional[str] = DEFAULT_LANGUAGE,
+ output_folder: str = OUTPUT_FOLDER,
+ save_page_ocr_visualisations: bool = SAVE_PAGE_OCR_VISUALISATIONS,
+ ):
+ """
+ Initializes the CustomImageAnalyzerEngine.
+
+ :param ocr_engine: The OCR engine to use ("tesseract", "paddle", "vlm", "hybrid-paddle", "hybrid-vlm", "hybrid-paddle-vlm", "hybrid-paddle-inference-server", or "inference-server").
+ :param analyzer_engine: The Presidio AnalyzerEngine instance.
+ :param tesseract_config: Configuration string for Tesseract. If None, uses TESSERACT_SEGMENTATION_LEVEL config.
+ :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
+ :param image_preprocessor: Optional image preprocessor.
+ :param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.
+        :param output_folder: The folder to save the output images to.
+        :param save_page_ocr_visualisations: Whether to save page-level OCR visualisation images.
+        """
+ if ocr_engine not in LOCAL_OCR_MODEL_OPTIONS:
+ raise ValueError(
+ f"ocr_engine must be one of the following: {LOCAL_OCR_MODEL_OPTIONS}"
+ )
+
+ self.ocr_engine = ocr_engine
+
+ # Language setup
+ self.language = language or DEFAULT_LANGUAGE or "en"
+ self.tesseract_lang = _tesseract_lang_code(self.language)
+ self.paddle_lang = _paddle_lang_code(self.language)
+
+ # Security: Validate and normalize output_folder at construction time
+ # This ensures the object is always in a secure state and prevents
+ # any future code from accidentally using an untrusted directory
+ normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
+ if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
+ raise ValueError(
+ f"Unsafe output folder path: {output_folder}. Must be contained within {OUTPUT_FOLDER}"
+ )
+ self.output_folder = normalized_output_folder
+ self.save_page_ocr_visualisations = bool(save_page_ocr_visualisations)
+
+ if (
+ self.ocr_engine == "paddle"
+ or self.ocr_engine == "hybrid-paddle"
+ or self.ocr_engine == "hybrid-paddle-vlm"
+ or self.ocr_engine == "hybrid-paddle-inference-server"
+ ):
+ # Set PaddleOCR environment variables BEFORE importing PaddleOCR
+ # This ensures fonts are configured before the package loads
+
+ # Set PaddleOCR model directory environment variable (only if specified).
+ if PADDLE_MODEL_PATH and PADDLE_MODEL_PATH.strip():
+ os.environ["PADDLEOCR_MODEL_DIR"] = PADDLE_MODEL_PATH
+ print(f"Setting PaddleOCR model path to: {PADDLE_MODEL_PATH}")
+ else:
+ print("Using default PaddleOCR model storage location")
+
+ # Set PaddleOCR font path to use system fonts instead of downloading simfang.ttf/PingFang-SC-Regular.ttf
+ # This MUST be set before importing PaddleOCR to prevent font downloads
+ if (
+ PADDLE_FONT_PATH
+ and PADDLE_FONT_PATH.strip()
+ and os.path.exists(PADDLE_FONT_PATH)
+ ):
+ os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = PADDLE_FONT_PATH
+ print(
+ f"Setting PaddleOCR font path to configured font: {PADDLE_FONT_PATH}"
+ )
+ else:
+ system_font_path = get_system_font_path()
+ if system_font_path:
+ os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = system_font_path
+ print(
+ f"Setting PaddleOCR font path to system font: {system_font_path}"
+ )
+ else:
+ print(
+ "Warning: No suitable system font found. PaddleOCR may download default fonts."
+ )
+
+ try:
+ from paddleocr import PaddleOCR
+ except Exception as e:
+ raise ImportError(
+ f"Error importing PaddleOCR: {e}. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
+ )
+
+ # Default paddle configuration if none provided
+ if paddle_kwargs is None:
+ paddle_kwargs = {
+ "det_db_unclip_ratio": PADDLE_DET_DB_UNCLIP_RATIO,
+ "use_textline_orientation": PADDLE_USE_TEXTLINE_ORIENTATION,
+ "use_doc_orientation_classify": False,
+ "use_doc_unwarping": False,
+ "lang": self.paddle_lang,
+ }
+ else:
+ # Enforce language if not explicitly provided
+ paddle_kwargs.setdefault("lang", self.paddle_lang)
+
+ try:
+ self.paddle_ocr = PaddleOCR(**paddle_kwargs)
+ except Exception as e:
+ # Handle DLL loading errors (common on Windows with GPU version)
+ if (
+ "WinError 127" in str(e)
+ or "could not be found" in str(e).lower()
+ or "dll" in str(e).lower()
+ ):
+ print(
+ f"Warning: GPU initialization failed (likely missing CUDA/cuDNN dependencies): {e}"
+ )
+ print("PaddleOCR will not be available. To fix GPU issues:")
+ print("1. Install Visual C++ Redistributables (latest version)")
+ print("2. Ensure CUDA runtime libraries are in your PATH")
+ print(
+ "3. Or reinstall paddlepaddle CPU version: pip install paddlepaddle"
+ )
+ raise ImportError(
+ f"Error initializing PaddleOCR: {e}. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
+ )
+ else:
+ raise e
+
+ elif self.ocr_engine == "hybrid-vlm":
+ # VLM-based hybrid OCR - no additional initialization needed
+ # VLM weights load at import if LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True, else on first VLM call
+ print(
+ f"Initializing hybrid VLM OCR with model: {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}"
+ )
+ self.paddle_ocr = None # Not using PaddleOCR
+
+ elif self.ocr_engine == "vlm":
+ # VLM page-level OCR - no additional initialization needed
+ # VLM weights load at import if LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True, else on first VLM call
+ print(
+ f"Initializing VLM OCR with model: {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}"
+ )
+ self.paddle_ocr = None # Not using PaddleOCR
+
+ if self.ocr_engine == "hybrid-paddle-vlm":
+ # Hybrid PaddleOCR + VLM - requires both PaddleOCR and VLM
+ # VLM weights load at import if LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True, else on first VLM call
+ print(
+ f"Initializing hybrid PaddleOCR + VLM OCR with model: {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}"
+ )
+
+ if self.ocr_engine == "hybrid-paddle-inference-server":
+ # Hybrid PaddleOCR + Inference-server - requires both PaddleOCR and inference-server API
+ print("Initializing hybrid PaddleOCR + Inference-server OCR")
+
+ if not analyzer_engine:
+ analyzer_engine = AnalyzerEngine()
+ self.analyzer_engine = analyzer_engine
+
+ # Set Tesseract configuration based on segmentation level
+ if tesseract_config:
+ self.tesseract_config = tesseract_config
+ else:
+            # The _get_tesseract_psm helper does not work correctly, so the configured
+            # TESSERACT_SEGMENTATION_LEVEL value (PSM 11 by default) is used directly as the PSM value
+            psm_value = TESSERACT_SEGMENTATION_LEVEL  # _get_tesseract_psm(TESSERACT_SEGMENTATION_LEVEL)
+ self.tesseract_config = f"--oem 3 --psm {psm_value}"
+ # print(
+ # f"Tesseract configured for {TESSERACT_SEGMENTATION_LEVEL}-level segmentation (PSM {psm_value})"
+ # )
+
+ if not image_preprocessor:
+ image_preprocessor = ContrastSegmentedImageEnhancer()
+ self.image_preprocessor = image_preprocessor
+
+ def _sanitize_filename(
+ self, text: str, max_length: int = 20, fallback_prefix: str = "unknown_text"
+ ) -> str:
+ """
+ Sanitizes text for use in filenames by removing invalid characters and limiting length.
+
+ :param text: The text to sanitize
+ :param max_length: Maximum length of the sanitized text
+ :param fallback_prefix: Prefix to use if sanitization fails
+ :return: Sanitized text safe for filenames
+ """
+
+ # Remove or replace invalid filename characters
+ # Windows: < > : " | ? * \ /
+ # Unix: / (forward slash)
+
+ sanitized = safe_sanitize_text(text)
+
+ # Remove leading/trailing underscores and spaces
+ sanitized = sanitized.strip("_ ")
+
+ # If empty after sanitization, use a default value
+ if not sanitized:
+ sanitized = fallback_prefix
+
+ # Limit to max_length characters
+ if len(sanitized) > max_length:
+ sanitized = sanitized[:max_length]
+ # Ensure we don't end with an underscore if we cut in the middle
+ sanitized = sanitized.rstrip("_")
+
+ # Final check: if still empty or too short, use fallback
+ if not sanitized or len(sanitized) < 3:
+ sanitized = fallback_prefix
+
+ return sanitized
+
+ def _create_safe_filename_with_confidence(
+ self,
+ original_text: str,
+ new_text: str,
+ conf: int,
+ new_conf: int,
+ ocr_type: str = "OCR",
+ ) -> str:
+ """
+ Creates a safe filename using confidence values when text sanitization fails.
+
+ Args:
+ original_text: Original text from Tesseract
+ new_text: New text from VLM/PaddleOCR
+ conf: Original confidence score
+ new_conf: New confidence score
+ ocr_type: Type of OCR used (VLM, Paddle, etc.)
+
+ Returns:
+ Safe filename string
+ """
+ # Try to sanitize both texts
+ safe_original = self._sanitize_filename(
+ original_text, max_length=15, fallback_prefix=f"orig_conf_{conf}"
+ )
+ safe_new = self._sanitize_filename(
+ new_text, max_length=15, fallback_prefix=f"new_conf_{new_conf}"
+ )
+
+        # If both sanitizations fell back to their confidence-based defaults
+        # (the fallback prefixes passed above), create a purely confidence-based name
+        if safe_original.startswith("orig_conf_") and safe_new.startswith(
+            "new_conf_"
+        ):
+ return f"{ocr_type}_conf_{conf}_to_conf_{new_conf}"
+
+ return f"{safe_original}_conf_{conf}_to_{safe_new}_conf_{new_conf}"
+
+ def _is_line_level_data(self, ocr_data: Dict[str, List]) -> bool:
+ """
+ Determines if OCR data contains line-level results (multiple words per bounding box).
+
+ Args:
+ ocr_data: Dictionary with OCR data
+
+ Returns:
+ True if data appears to be line-level, False otherwise
+ """
+ if not ocr_data or not ocr_data.get("text"):
+ return False
+
+ # Check if any text entries contain multiple words
+ for text in ocr_data["text"]:
+ if text.strip() and len(text.split()) > 1:
+ return True
+
+ return False
+
+ def _convert_paddle_to_tesseract_format(
+ self,
+ paddle_results: List[Any],
+ input_image_width: int = None,
+ input_image_height: int = None,
+ image_name: str = None,
+ image: Image.Image = None,
+ ) -> Dict[str, List]:
+ """Converts PaddleOCR result format to Tesseract's dictionary format using relative coordinates.
+
+ This function uses a safer approach: converts PaddleOCR coordinates to relative (0-1) coordinates
+ based on whatever coordinate space PaddleOCR uses, then scales them to the input image dimensions.
+ This avoids issues with PaddleOCR's internal image resizing.
+
+ Args:
+ paddle_results: List of PaddleOCR result dictionaries
+ input_image_width: Width of the input image passed to PaddleOCR (target dimensions for scaling)
+ input_image_height: Height of the input image passed to PaddleOCR (target dimensions for scaling)
+ image_name: Name of the image
+ image: Image object
+ """
+
+ output = {
+ "text": list(),
+ "left": list(),
+ "top": list(),
+ "width": list(),
+ "height": list(),
+ "conf": list(),
+ "model": list(),
+ }
+
+ # paddle_results is now a list of dictionaries with detailed information
+ if not paddle_results:
+ return output
+
+ # Validate that we have target dimensions
+ if input_image_width is None or input_image_height is None:
+ print(
+ "Warning: Input image dimensions not provided. PaddleOCR coordinates may be incorrectly scaled."
+ )
+ # Fallback: we'll try to detect from coordinates, but this is less reliable
+ use_relative_coords = False
+ else:
+ use_relative_coords = True
+
+ for page_result in paddle_results:
+ # Extract text recognition results from the new format
+ rec_texts = page_result.get("rec_texts", list())
+ rec_scores = page_result.get("rec_scores", list())
+ rec_polys = page_result.get("rec_polys", list())
+ rec_models = page_result.get("rec_models", list())
+
+ # PaddleOCR may return image dimensions in the result - check for them
+ # Some versions of PaddleOCR include this information
+ result_image_width = page_result.get("image_width")
+ result_image_height = page_result.get("image_height")
+
+ # PaddleOCR typically returns coordinates in the input image space
+ # However, it may internally resize images, so we need to check if coordinates
+ # are in a different space by comparing with explicit metadata or detecting from coordinates
+
+ # First pass: determine PaddleOCR's coordinate space by finding max coordinates
+ # This tells us what coordinate space PaddleOCR is actually using
+ max_x_coord = 0
+ max_y_coord = 0
+
+ for bounding_box in rec_polys:
+ if hasattr(bounding_box, "tolist"):
+ box = bounding_box.tolist()
+ else:
+ box = bounding_box
+
+ if box and len(box) > 0:
+ x_coords = [p[0] for p in box]
+ y_coords = [p[1] for p in box]
+ max_x_coord = max(max_x_coord, max(x_coords) if x_coords else 0)
+ max_y_coord = max(max_y_coord, max(y_coords) if y_coords else 0)
+
+ # Determine PaddleOCR's coordinate space dimensions
+ # Priority: explicit result metadata > input dimensions (standard PaddleOCR behavior)
+ # Note: PaddleOCR typically returns coordinates in the input image space.
+ # We only use a different coordinate space if PaddleOCR provides explicit metadata.
+ # Using max coordinates to detect coordinate space is unreliable because:
+ # 1. Text might not extend to image edges
+ # 2. There might be padding
+ # 3. Max coordinates don't necessarily equal image dimensions
+ if result_image_width is not None and result_image_height is not None:
+ # Use explicit metadata from PaddleOCR if available (most reliable)
+ paddle_coord_width = result_image_width
+ paddle_coord_height = result_image_height
+ # Only use relative conversion if coordinate space differs from input
+ if (
+ paddle_coord_width != input_image_width
+ or paddle_coord_height != input_image_height
+ ):
+ # print(
+ # f"PaddleOCR metadata indicates coordinate space ({paddle_coord_width}x{paddle_coord_height}) "
+ # f"differs from input ({input_image_width}x{input_image_height}). "
+ # f"Using metadata for coordinate conversion."
+ # )
+ pass
+ elif input_image_width is not None and input_image_height is not None:
+ # Default: assume coordinates are in input image space (standard PaddleOCR behavior)
+ # This is the most common case and avoids incorrect scaling
+ paddle_coord_width = input_image_width
+ paddle_coord_height = input_image_height
+ else:
+ # Fallback: use max coordinates if we have no other information
+ paddle_coord_width = max_x_coord if max_x_coord > 0 else 1
+ paddle_coord_height = max_y_coord if max_y_coord > 0 else 1
+ use_relative_coords = False
+ print(
+ f"Warning: No input dimensions provided. Using detected coordinate space ({paddle_coord_width}x{paddle_coord_height}) from max coordinates."
+ )
+
+ # Validate coordinate space dimensions
+ if paddle_coord_width is None or paddle_coord_height is None:
+ paddle_coord_width = input_image_width or 1
+ paddle_coord_height = input_image_height or 1
+ use_relative_coords = False
+
+ if paddle_coord_width <= 0 or paddle_coord_height <= 0:
+ print(
+ f"Warning: Invalid PaddleOCR coordinate space dimensions ({paddle_coord_width}x{paddle_coord_height}). Using input dimensions."
+ )
+ paddle_coord_width = input_image_width or 1
+ paddle_coord_height = input_image_height or 1
+ use_relative_coords = False
+
+ # If coordinate space matches input dimensions, coordinates are already in the correct space
+ # Only use relative coordinate conversion if coordinate space differs from input
+ if (
+ paddle_coord_width == input_image_width
+ and paddle_coord_height == input_image_height
+ and input_image_width is not None
+ and input_image_height is not None
+ ):
+ # Coordinates are already in input space, no conversion needed
+ use_relative_coords = False
+ # print(
+ # f"PaddleOCR coordinates are in input image space ({input_image_width}x{input_image_height}). "
+ # f"Using coordinates directly without conversion."
+ # )
+
+ # Second pass: convert coordinates using relative coordinate approach
+ # Use default "Paddle" if rec_models is not available or doesn't match length
+ if len(rec_models) != len(rec_texts):
+ # print(
+ # f"Warning: rec_models length ({len(rec_models)}) doesn't match rec_texts length ({len(rec_texts)}). Using default 'Paddle' for all."
+ # )
+ rec_models = ["Paddle"] * len(rec_texts)
+ # Update page_result to keep it consistent
+ page_result["rec_models"] = rec_models
+ else:
+ # Ensure we're using the rec_models from page_result (which may have been modified)
+ rec_models = page_result.get("rec_models", rec_models)
+
+ # Debug: Print model distribution
+ vlm_count = sum(1 for m in rec_models if m == "VLM")
+ if vlm_count > 0:
+ print(
+ f"Found {vlm_count} VLM-labeled lines out of {len(rec_models)} total lines in page_result"
+ )
+
+ for line_text, line_confidence, bounding_box, line_model in zip(
+ rec_texts, rec_scores, rec_polys, rec_models
+ ):
+ # bounding_box is now a numpy array with shape (4, 2)
+ # Convert to list of coordinates if it's a numpy array
+ if hasattr(bounding_box, "tolist"):
+ box = bounding_box.tolist()
+ else:
+ box = bounding_box
+
+ if not box or len(box) == 0:
+ continue
+
+ # box is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+ x_coords = [p[0] for p in box]
+ y_coords = [p[1] for p in box]
+
+ # Extract bounding box coordinates in PaddleOCR's coordinate space
+ line_left_paddle = float(min(x_coords))
+ line_top_paddle = float(min(y_coords))
+ line_right_paddle = float(max(x_coords))
+ line_bottom_paddle = float(max(y_coords))
+ line_width_paddle = line_right_paddle - line_left_paddle
+ line_height_paddle = line_bottom_paddle - line_top_paddle
+
+ # Convert to relative coordinates (0-1) based on PaddleOCR's coordinate space
+ # Then scale to input image dimensions
+ if (
+ use_relative_coords
+ and paddle_coord_width > 0
+ and paddle_coord_height > 0
+ ):
+ # Normalize to relative coordinates [0-1]
+ rel_left = line_left_paddle / paddle_coord_width
+ rel_top = line_top_paddle / paddle_coord_height
+ rel_width = line_width_paddle / paddle_coord_width
+ rel_height = line_height_paddle / paddle_coord_height
+
+ # Scale to input image dimensions
+ line_left = rel_left * input_image_width
+ line_top = rel_top * input_image_height
+ line_width = rel_width * input_image_width
+ line_height = rel_height * input_image_height
+ else:
+ # Fallback: use coordinates directly (may cause issues if coordinate spaces don't match)
+ line_left = line_left_paddle
+ line_top = line_top_paddle
+ line_width = line_width_paddle
+ line_height = line_height_paddle
+ # if input_image_width and input_image_height:
+ # print(f"Warning: Using PaddleOCR coordinates directly. This may cause scaling issues.")
+
+ # Ensure coordinates are within valid bounds
+ if input_image_width and input_image_height:
+ line_left = max(0, min(line_left, input_image_width))
+ line_top = max(0, min(line_top, input_image_height))
+ line_width = max(0, min(line_width, input_image_width - line_left))
+ line_height = max(
+ 0, min(line_height, input_image_height - line_top)
+ )
+
+ # Add line-level data
+ output["text"].append(line_text)
+ output["left"].append(round(line_left, 2))
+ output["top"].append(round(line_top, 2))
+ output["width"].append(round(line_width, 2))
+ output["height"].append(round(line_height, 2))
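+            # PaddleOCR confidence scores are 0-1 floats; scale to Tesseract-style 0-100 integers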
+ output["conf"].append(int(line_confidence * 100))
+ output["model"].append(line_model if line_model else "Paddle")
+
+ return output
+
+ @staticmethod
+ def _process_one_line_to_words(
+ task: Tuple,
+ output_folder: str,
+ image_name: Optional[str],
+ thread_local_segmenter: Optional[threading.local] = None,
+ ) -> Tuple[int, Dict[str, List]]:
+ """
+ Process a single line to word-level bounding boxes. Used by
+ _convert_line_to_word_level for parallel execution.
+
+ Args:
+ task: (line_index, line_image, line_text, line_conf, line_model,
+ line_left, line_top, line_width, line_height)
+ output_folder: Passed to AdaptiveSegmenter
+ image_name: Passed to segmenter.segment()
+ Returns:
+ (line_index, word_dict) with word_dict having keys text, left, top,
+ width, height, conf, model (all lists).
+ """
+ (
+ i,
+ line_image,
+ line_text,
+ line_conf,
+ line_model,
+ line_left,
+ line_top,
+ line_width,
+ line_height,
+ ) = task
+ word_dict = {
+ "text": [],
+ "left": [],
+ "top": [],
+ "width": [],
+ "height": [],
+ "conf": [],
+ "model": [],
+ }
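+        # Reuse one AdaptiveSegmenter per worker thread (cached on the threading.local
+        # object) so parallel line processing does not rebuild the segmenter for every line.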
+ if thread_local_segmenter is not None:
+ segmenter = getattr(thread_local_segmenter, "segmenter", None)
+ if segmenter is None:
+ segmenter = AdaptiveSegmenter(output_folder=output_folder)
+ thread_local_segmenter.segmenter = segmenter
+ else:
+ segmenter = AdaptiveSegmenter(output_folder=output_folder)
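+        # The segmenter works on the cropped line image, so coordinates start at (0, 0);
+        # the line's page offset (line_left, line_top) is added back to each word box below.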
+ single_line_data = {
+ "text": [line_text],
+ "left": [0],
+ "top": [0],
+ "width": [line_width],
+ "height": [line_height],
+ "conf": [line_conf],
+ "line": [i],
+ }
+ word_output, _ = segmenter.segment(
+ single_line_data, line_image, image_name=image_name
+ )
+ if not word_output or not word_output.get("text"):
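+            # Fallback: the segmenter found no words, so estimate word boxes by splitting
+            # the line text on whitespace and sharing out the line width in proportion to
+            # character counts (a space is assumed to be half as wide as a character).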
+ words = line_text.split()
+ if words:
+ num_chars = len("".join(words))
+ num_spaces = len(words) - 1
+ if num_chars > 0:
+ char_space_ratio = 2.0
+ estimated_space_width = (
+ line_width / (num_chars * char_space_ratio + num_spaces)
+ if (num_chars * char_space_ratio + num_spaces) > 0
+ else line_width / num_chars
+ )
+ avg_char_width = estimated_space_width * char_space_ratio
+ current_left = 0
+ for word in words:
+ word_width = len(word) * avg_char_width
+ clamped_left = max(0, min(current_left, line_width))
+ clamped_width = max(
+ 0, min(word_width, line_width - clamped_left)
+ )
+ word_dict["text"].append(word)
+ word_dict["left"].append(line_left + clamped_left)
+ word_dict["top"].append(line_top)
+ word_dict["width"].append(clamped_width)
+ word_dict["height"].append(line_height)
+ word_dict["conf"].append(line_conf)
+ word_dict["model"].append(line_model)
+ current_left += word_width + estimated_space_width
+ return (i, word_dict)
+ for j in range(len(word_output["text"])):
+ word_dict["text"].append(word_output["text"][j])
+ word_dict["left"].append(line_left + word_output["left"][j])
+ word_dict["top"].append(line_top + word_output["top"][j])
+ word_dict["width"].append(word_output["width"][j])
+ word_dict["height"].append(word_output["height"][j])
+ word_dict["conf"].append(word_output["conf"][j])
+ word_dict["model"].append(line_model)
+ return (i, word_dict)
+
+ def _convert_line_to_word_level(
+ self,
+ line_data: Dict[str, List],
+ image_width: int,
+ image_height: int,
+ image: Image.Image,
+ image_name: str = None,
+ ) -> Dict[str, List]:
+ """
+ Converts line-level OCR results to word-level using AdaptiveSegmenter.segment().
+ This method processes each line individually using the adaptive segmentation algorithm.
+ Lines are processed in parallel with ThreadPoolExecutor when there is more than one.
+
+ Args:
+ line_data: Dictionary with keys "text", "left", "top", "width", "height", "conf" (all lists)
+ image_width: Width of the full image
+ image_height: Height of the full image
+ image: PIL Image object of the full image
+ image_name: Name of the image
+ Returns:
+ Dictionary with same keys as input, containing word-level bounding boxes
+ """
+ output = {
+ "text": list(),
+ "left": list(),
+ "top": list(),
+ "width": list(),
+ "height": list(),
+ "conf": list(),
+ "model": list(),
+ }
+
+ if not line_data or not line_data.get("text"):
+ return output
+
+ # Validate that image is not None before processing
+ if image is None:
+ print(
+ "Warning: Image is None in _convert_line_to_word_level. Returning empty output."
+ )
+ return output
+
+ # Convert PIL Image to numpy array (BGR format for OpenCV)
+ if hasattr(image, "size"): # PIL Image
+ image_np = np.array(image)
+ if len(image_np.shape) == 3:
+ # Convert RGB to BGR for OpenCV
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+ elif len(image_np.shape) == 2:
+ # Grayscale - convert to BGR
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR)
+ else:
+ # Already numpy array
+ image_np = image.copy()
+ if len(image_np.shape) == 2:
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR)
+
+ # Validate that image_np dimensions match the expected image_width and image_height
+ # PIL Image.size returns (width, height), but numpy array shape is (height, width, channels)
+ actual_height, actual_width = image_np.shape[:2]
+ if actual_width != image_width or actual_height != image_height:
+ print(
+ f"Warning: Image dimension mismatch! Expected {image_width}x{image_height}, but got {actual_width}x{actual_height}"
+ )
+ image_width = actual_width
+ image_height = actual_height
+
+ # Build list of tasks: one per valid line (crop and validate on main thread)
+ tasks = []
+ for i in range(len(line_data["text"])):
+ line_text = line_data["text"][i]
+ line_conf = line_data["conf"][i]
+ if "model" in line_data and len(line_data["model"]) > i:
+ line_model = line_data["model"][i]
+ else:
+ line_model = "Paddle"
+
+ f_left = float(line_data["left"][i])
+ f_top = float(line_data["top"][i])
+ f_width = float(line_data["width"][i])
+ f_height = float(line_data["height"][i])
+ is_normalized = (
+ f_left <= 1.0 and f_top <= 1.0 and f_width <= 1.0 and f_height <= 1.0
+ )
+ if is_normalized:
+ line_left = float(round(f_left * image_width))
+ line_top = float(round(f_top * image_height))
+ line_width = float(round(f_width * image_width))
+ line_height = float(round(f_height * image_height))
+ else:
+ line_left = float(round(f_left))
+ line_top = float(round(f_top))
+ line_width = float(round(f_width))
+ line_height = float(round(f_height))
+
+ if not line_text.strip():
+ continue
+
+ line_left = int(max(0, min(line_left, image_width - 1)))
+ line_top = int(max(0, min(line_top, image_height - 1)))
+ line_width = int(max(1, min(line_width, image_width - line_left)))
+ line_height = int(max(1, min(line_height, image_height - line_top)))
+
+ if line_left >= image_width or line_top >= image_height:
+ continue
+ if line_left + line_width > image_width:
+ line_width = image_width - line_left
+ if line_top + line_height > image_height:
+ line_height = image_height - line_top
+ if line_width <= 0 or line_height <= 0:
+ continue
+
+ try:
+ line_image = image_np[
+ line_top : line_top + line_height,
+ line_left : line_left + line_width,
+ ].copy()
+ except IndexError:
+ continue
+ if line_image.size == 0 or len(line_image.shape) < 2:
+ continue
+
+ tasks.append(
+ (
+ i,
+ line_image,
+ line_text,
+ line_conf,
+ line_model,
+ line_left,
+ line_top,
+ line_width,
+ line_height,
+ )
+ )
+
+ if not tasks:
+ return output
+
+ # Process lines in parallel. Dedicated worker cap is safer for this CPU-heavy path.
+ max_workers = min(LINE_TO_WORD_SEGMENT_MAX_WORKERS, len(tasks))
+ process_one = partial(
+ CustomImageAnalyzerEngine._process_one_line_to_words,
+ output_folder=self.output_folder,
+ image_name=image_name,
+ thread_local_segmenter=threading.local(),
+ )
+ if max_workers <= 1:
+ results = [process_one(task) for task in tasks]
+ else:
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ results = list(executor.map(process_one, tasks))
+
+ # Merge results in line order to preserve document order
+ for _i, word_dict in sorted(results, key=lambda x: x[0]):
+ for key in output:
+ output[key].extend(word_dict[key])
+
+ return output
+
+ def _visualize_tesseract_bounding_boxes(
+ self,
+ image: Image.Image,
+ ocr_data: Dict[str, List],
+ image_name: str = None,
+ visualisation_folder: str = "tesseract_visualisations",
+ ) -> None:
+ """
+ Visualizes Tesseract OCR bounding boxes with confidence-based colors and a legend.
+
+ Args:
+ image: The PIL Image object
+ ocr_data: Tesseract OCR data dictionary
+            image_name: Optional name for the saved image file
+            visualisation_folder: Name of the subfolder (under the output folder) where the visualisation is saved
+ """
+ if not ocr_data or not ocr_data.get("text"):
+ return
+
+ # Convert PIL image to OpenCV format
+ image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+ # Get image dimensions
+ height, width = image_cv.shape[:2]
+
+ # Define confidence ranges and colors
+ confidence_ranges = [
+ (80, 100, (0, 255, 0), "High (80-100%)"), # Green
+ (50, 79, (0, 165, 255), "Medium (50-79%)"), # Orange
+ (0, 49, (0, 0, 255), "Low (0-49%)"), # Red
+ ]
+
+ # Process each detected text element
+ for i in range(len(ocr_data["text"])):
+ text = ocr_data["text"][i]
+ conf = int(ocr_data["conf"][i])
+
+ # Skip empty text or invalid confidence
+ if not text.strip() or conf == -1:
+ continue
+
+ left = ocr_data["left"][i]
+ top = ocr_data["top"][i]
+ width_box = ocr_data["width"][i]
+ height_box = ocr_data["height"][i]
+
+ # Calculate bounding box coordinates
+ x1 = int(left)
+ y1 = int(top)
+ x2 = int(left + width_box)
+ y2 = int(top + height_box)
+
+ # Ensure coordinates are within image bounds
+ x1 = max(0, min(x1, width))
+ y1 = max(0, min(y1, height))
+ x2 = max(0, min(x2, width))
+ y2 = max(0, min(y2, height))
+
+ # Skip if bounding box is invalid
+ if x2 <= x1 or y2 <= y1:
+ continue
+
+ # Determine color based on confidence score
+ color = (0, 0, 255) # Default to red
+ for min_conf, max_conf, conf_color, _ in confidence_ranges:
+ if min_conf <= conf <= max_conf:
+ color = conf_color
+ break
+
+ # Draw bounding box
+ cv2.rectangle(image_cv, (x1, y1), (x2, y2), color, 1)
+
+ # Add legend
+ self._add_confidence_legend(image_cv, confidence_ranges)
+
+ # Save the visualization
+ tesseract_viz_folder = os.path.join(self.output_folder, visualisation_folder)
+
+ # Double-check the constructed path is safe
+ if not validate_folder_containment(tesseract_viz_folder, OUTPUT_FOLDER):
+ raise ValueError(
+ f"Unsafe tesseract visualisations folder path: {tesseract_viz_folder}"
+ )
+
+ os.makedirs(tesseract_viz_folder, exist_ok=True)
+
+ # Generate filename
+ if image_name:
+ # Remove file extension if present
+ base_name = os.path.splitext(image_name)[0]
+ filename = f"{base_name}_{visualisation_folder}.jpg"
+ else:
+ timestamp = int(time.time())
+ filename = f"{visualisation_folder}_{timestamp}.jpg"
+
+ output_path = os.path.join(tesseract_viz_folder, filename)
+
+ # Save the image
+ max_filesize = 500 * 1024 # 500kb in bytes
+ quality = 95 # Start high, OpenCV JPEG quality range is 0-100
+
+ # Try lowering JPEG quality until file is below size limit
+ is_saved = False
+ while quality >= 10:
+ cv2.imwrite(output_path, image_cv, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
+ if (
+ os.path.exists(output_path)
+ and os.path.getsize(output_path) <= max_filesize
+ ):
+ is_saved = True
+ break
+ quality -= 5
+
+ if not is_saved:
+            # Fall back to the lowest quality if the file could not be brought under 500kb
+ cv2.imwrite(output_path, image_cv, [int(cv2.IMWRITE_JPEG_QUALITY), 10])
+ # Optionally log warning here that file could not be compressed below 500kb
+
+ print(f"Tesseract visualization saved to: {output_path}")
+
+ def _add_confidence_legend(
+ self, image_cv: np.ndarray, confidence_ranges: List[Tuple]
+ ) -> None:
+ """
+ Adds a confidence legend to the visualization image.
+
+ Args:
+ image_cv: OpenCV image array
+ confidence_ranges: List of tuples containing (min_conf, max_conf, color, label)
+ """
+ height, width = image_cv.shape[:2]
+
+ # Legend parameters
+ legend_width = 200
+ legend_height = 100
+ legend_x = width - legend_width - 20
+ legend_y = 20
+
+ # Draw legend background
+ cv2.rectangle(
+ image_cv,
+ (legend_x, legend_y),
+ (legend_x + legend_width, legend_y + legend_height),
+ (255, 255, 255), # White background
+ -1,
+ )
+ cv2.rectangle(
+ image_cv,
+ (legend_x, legend_y),
+ (legend_x + legend_width, legend_y + legend_height),
+ (0, 0, 0), # Black border
+ 2,
+ )
+
+ # Add title
+ title_text = "Confidence Levels"
+ font_scale = 0.6
+ font_thickness = 2
+ (title_width, title_height), _ = cv2.getTextSize(
+ title_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness
+ )
+ title_x = legend_x + (legend_width - title_width) // 2
+ title_y = legend_y + title_height + 10
+ cv2.putText(
+ image_cv,
+ title_text,
+ (title_x, title_y),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ font_scale,
+ (0, 0, 0), # Black text
+ font_thickness,
+ )
+
+ # Add confidence range items
+ item_spacing = 25
+ start_y = title_y + 25
+
+ for i, (min_conf, max_conf, color, label) in enumerate(confidence_ranges):
+ item_y = start_y + i * item_spacing
+
+ # Draw color box
+ box_size = 15
+ box_x = legend_x + 10
+ box_y = item_y - box_size
+ cv2.rectangle(
+ image_cv,
+ (box_x, box_y),
+ (box_x + box_size, box_y + box_size),
+ color,
+ -1,
+ )
+ cv2.rectangle(
+ image_cv,
+ (box_x, box_y),
+ (box_x + box_size, box_y + box_size),
+ (0, 0, 0), # Black border
+ 1,
+ )
+
+ # Add label text
+ label_x = box_x + box_size + 10
+ label_y = item_y - 5
+ cv2.putText(
+ image_cv,
+ label,
+ (label_x, label_y),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.5,
+ (0, 0, 0), # Black text
+ 1,
+ )
+
+    def _calculate_line_bbox(self, group):
+        """Calculate a line-level bounding box, joined text and average confidence for a group of word rows."""
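+        # Usage sketch (illustrative): applied per Tesseract line group, e.g.
+        #   ocr_df.groupby(["block_num", "par_num", "line_num"]).apply(self._calculate_line_bbox)
+        # collapses each line's word rows into a single row with a merged bbox,
+        # joined text and the mean word confidence.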
+ # Get the leftmost and rightmost positions
+ left = group["left"].min()
+ top = group["top"].min()
+ right = (group["left"] + group["width"]).max()
+ bottom = (group["top"] + group["height"]).max()
+
+ # Calculate width and height
+ width = right - left
+ height = bottom - top
+
+ # Calculate average confidence
+ avg_conf = round(group["conf"].mean(), 0)
+
+ return pd.Series(
+ {
+ "text": " ".join(group["text"].astype(str).tolist()),
+ "left": left,
+ "top": top,
+ "width": width,
+ "height": height,
+ "conf": avg_conf,
+ }
+ )
+
+ def _perform_hybrid_ocr(
+ self,
+ image: Image.Image,
+ confidence_threshold: int = HYBRID_OCR_CONFIDENCE_THRESHOLD,
+ padding: int = HYBRID_OCR_PADDING,
+ ocr: Optional[Any] = None,
+ image_name: str = "unknown_image_name",
+ ) -> Dict[str, list]:
+ """
+ Performs hybrid OCR on an image using Tesseract for initial OCR and PaddleOCR/VLM to enhance
+ results for low-confidence or uncertain words.
+
+ Args:
+ image (Image.Image): The input image (PIL format) to be processed.
+ confidence_threshold (int, optional): Tesseract confidence threshold below which words are
+ re-analyzed with secondary OCR (PaddleOCR/VLM). Defaults to HYBRID_OCR_CONFIDENCE_THRESHOLD.
+ padding (int, optional): Pixel padding (in all directions) to add around each word box when
+ cropping for secondary OCR. Defaults to HYBRID_OCR_PADDING.
+ ocr (Optional[Any], optional): An instance of the PaddleOCR or VLM engine. If None, will use the
+ instance's `paddle_ocr` attribute if available. Only necessary for PaddleOCR-based pipelines.
+ image_name (str, optional): Optional name of the image, useful for debugging and visualization.
+
+ Returns:
+ Dict[str, list]: OCR results in the dictionary format of pytesseract.image_to_data (keys:
+ 'text', 'left', 'top', 'width', 'height', 'conf', 'model', ...).
+ """
+ # Determine if we're using VLM or PaddleOCR
+ use_vlm = self.ocr_engine == "hybrid-vlm"
+
+ if not use_vlm:
+ if ocr is None:
+ if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
+ ocr = self.paddle_ocr
+ else:
+ raise ValueError(
+ "No OCR object provided and 'paddle_ocr' is not initialized."
+ )
+
+ # print("Starting hybrid OCR process...")
+
+ # 1. Get initial word-level results from Tesseract
+ tesseract_data = pytesseract.image_to_data(
+ image,
+ output_type=pytesseract.Output.DICT,
+ config=self.tesseract_config,
+ lang=self.tesseract_lang,
+ )
+
+ if TESSERACT_WORD_LEVEL_OCR is False:
+ ocr_df = pd.DataFrame(tesseract_data)
+
+ # Filter out invalid entries (confidence == -1)
+ ocr_df = ocr_df[ocr_df.conf != -1]
+
+ # Group by line and aggregate text
+ line_groups = ocr_df.groupby(["block_num", "par_num", "line_num"])
+
+ ocr_data = line_groups.apply(self._calculate_line_bbox).reset_index()
+
+ # Overwrite tesseract_data with the aggregated data
+ tesseract_data = {
+ "text": ocr_data["text"].tolist(),
+ "left": ocr_data["left"].astype(int).tolist(),
+ "top": ocr_data["top"].astype(int).tolist(),
+ "width": ocr_data["width"].astype(int).tolist(),
+ "height": ocr_data["height"].astype(int).tolist(),
+ "conf": ocr_data["conf"].tolist(),
+ "model": ["Tesseract"] * len(ocr_data), # Add model field
+ }
+
+ final_data = {
+ "text": list(),
+ "left": list(),
+ "top": list(),
+ "width": list(),
+ "height": list(),
+ "conf": list(),
+ "model": list(), # Track which model was used for each word
+ }
+
+ num_words = len(tesseract_data["text"])
+
+ # This handles the "no text on page" case. If num_words is 0, the loop is skipped
+ # and an empty dictionary with empty lists is returned, which is the correct behavior.
+ for i in range(num_words):
+ text = tesseract_data["text"][i]
+ conf = int(tesseract_data["conf"][i])
+
+ # Skip empty text boxes or non-word elements (like page/block markers)
+ if not text.strip() or conf == -1:
+ continue
+
+ left = tesseract_data["left"][i]
+ top = tesseract_data["top"][i]
+ width = tesseract_data["width"][i]
+ height = tesseract_data["height"][i]
+ # line_number = tesseract_data['abs_line_id'][i]
+
+ # Initialize model as Tesseract (default)
+ model_used = "Tesseract"
+
+ # If confidence is low, use PaddleOCR for a second opinion
+ if conf <= confidence_threshold:
+ img_width, img_height = image.size
+ crop_left = max(0, left - padding)
+ crop_top = max(0, top - padding)
+ crop_right = min(img_width, left + width + padding)
+ crop_bottom = min(img_height, top + height + padding)
+
+ # Ensure crop dimensions are valid
+ if crop_right <= crop_left or crop_bottom <= crop_top:
+ continue # Skip invalid crops
+
+ cropped_image = image.crop(
+ (crop_left, crop_top, crop_right, crop_bottom)
+ )
+ if use_vlm:
+ # Use VLM for OCR
+ vlm_result = _vlm_ocr_predict(cropped_image)
+ rec_texts = vlm_result.get("rec_texts", [])
+ rec_scores = vlm_result.get("rec_scores", [])
+ else:
+ # Use PaddleOCR
+ cropped_image_np = np.array(cropped_image)
+
+ if len(cropped_image_np.shape) == 2:
+ cropped_image_np = np.stack([cropped_image_np] * 3, axis=-1)
+
+ paddle_results = ocr.predict(cropped_image_np)
+
+ if paddle_results and paddle_results[0]:
+ rec_texts = paddle_results[0].get("rec_texts", [])
+ rec_scores = paddle_results[0].get("rec_scores", [])
+ else:
+ rec_texts = []
+ rec_scores = []
+
+ if rec_texts and rec_scores:
+ new_text = " ".join(rec_texts)
+ new_conf = int(round(np.median(rec_scores) * 100, 0))
+
+ # Only replace if Paddle's/VLM's confidence is better
+ if new_conf >= conf:
+ ocr_type = "VLM" if use_vlm else "Paddle"
+ message_output = f" Re-OCR'd word: '{text}' (conf: {conf}) -> '{new_text}' (conf: {new_conf:.0f}) [{ocr_type}]"
+ print(message_output)
+
+ if REPORT_VLM_OUTPUTS_TO_GUI:
+ try:
+ gr.Info(message_output, duration=2)
+ except Exception:
+ # gr.Info may not be available in worker process, ignore
+ pass
+
+                            # Build a safe filename for exporting example image comparisons (only used when SAVE_EXAMPLE_HYBRID_IMAGES is enabled)
+ safe_filename = self._create_safe_filename_with_confidence(
+ text, new_text, conf, new_conf, ocr_type
+ )
+
+ if SAVE_EXAMPLE_HYBRID_IMAGES:
+ # Normalize and validate image_name to prevent path traversal attacks
+ normalized_image_name = os.path.normpath(
+ image_name + "_" + ocr_type
+ )
+ # Ensure the image name doesn't contain path traversal characters
+ if (
+ ".." in normalized_image_name
+ or "/" in normalized_image_name
+ or "\\" in normalized_image_name
+ ):
+ normalized_image_name = (
+ "safe_image" # Fallback to safe default
+ )
+
+ hybrid_ocr_examples_folder = (
+ self.output_folder
+ + f"/hybrid_ocr_examples/{normalized_image_name}"
+ )
+ # Validate the constructed path is safe before creating directories
+ if not validate_folder_containment(
+ hybrid_ocr_examples_folder, OUTPUT_FOLDER
+ ):
+ raise ValueError(
+ f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
+ )
+
+ if not os.path.exists(hybrid_ocr_examples_folder):
+ os.makedirs(hybrid_ocr_examples_folder)
+ output_image_path = (
+ hybrid_ocr_examples_folder + f"/{safe_filename}.png"
+ )
+ print(f"Saving example image to {output_image_path}")
+ _save_image_with_config_dpi(
+ cropped_image, output_image_path
+ )
+
+ text = new_text
+ conf = new_conf
+ model_used = ocr_type # Update model to VLM or Paddle
+
+ else:
+ ocr_type = "VLM" if use_vlm else "Paddle"
+ print(
+ f" '{text}' (conf: {conf}) -> {ocr_type} result '{new_text}' (conf: {new_conf:.0f}) was not better. Keeping original."
+ )
+ else:
+ # OCR ran but found nothing, discard original word
+ ocr_type = "VLM" if use_vlm else "Paddle"
+ print(
+ f" '{text}' (conf: {conf}) -> No text found by {ocr_type}. Discarding."
+ )
+ text = ""
+
+ # Append the final result (either original, replaced, or skipped if empty)
+ if text.strip():
+ final_data["text"].append(
+ clean_unicode_text(text, preserve_international_scripts=True)
+ )
+ final_data["left"].append(left)
+ final_data["top"].append(top)
+ final_data["width"].append(width)
+ final_data["height"].append(height)
+ final_data["conf"].append(int(conf))
+ final_data["model"].append(model_used)
+ # final_data['line_number'].append(int(line_number))
+
+ return final_data
+
+ def _perform_hybrid_paddle_vlm_ocr(
+ self,
+ image: Image.Image,
+ ocr: Optional[Any] = None,
+ paddle_results: List[Any] = None,
+ confidence_threshold: int = HYBRID_OCR_CONFIDENCE_THRESHOLD,
+ padding: int = HYBRID_OCR_PADDING,
+ image_name: str = "unknown_image_name",
+ input_image_width: int = None,
+ input_image_height: int = None,
+ ) -> List[Any]:
+ """
+ Performs OCR using PaddleOCR at line level, then VLM for low-confidence lines.
+ Returns modified paddle_results in the same format as PaddleOCR output.
+
+ Args:
+ image: PIL Image to process
+ ocr: PaddleOCR instance (optional, uses self.paddle_ocr if not provided)
+ paddle_results: PaddleOCR results in original format (List of dicts with rec_texts, rec_scores, rec_polys)
+ confidence_threshold: Confidence threshold below which VLM is used
+ padding: Padding to add around line crops
+ image_name: Name of the image for logging/debugging
+ input_image_width: Original image width (before preprocessing)
+ input_image_height: Original image height (before preprocessing)
+
+ Returns:
+ Modified paddle_results with VLM replacements for low-confidence lines
+ """
+ if ocr is None:
+ if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
+ ocr = self.paddle_ocr
+ else:
+ raise ValueError(
+ "No OCR object provided and 'paddle_ocr' is not initialized."
+ )
+
+ if paddle_results is None or not paddle_results:
+ return paddle_results
+
+ print("Starting hybrid PaddleOCR + VLM OCR process...")
+
+ # Get image dimensions
+ img_width, img_height = image.size
+
+ # Use original dimensions if provided, otherwise use current image dimensions
+ if input_image_width is None:
+ input_image_width = img_width
+ if input_image_height is None:
+ input_image_height = img_height
+
+ # Convert PaddleOCR result objects to plain dictionaries for pickling
+ # The @spaces.GPU decorator requires picklable arguments, but PaddleOCR
+ # result objects contain CopyableWeakMethod references that can't be pickled
+ def _paddle_result_to_plain_dict(result):
+ """Convert PaddleOCR result object to a plain dictionary."""
+ plain_dict = {}
+ # Extract all standard keys as plain Python types
+ for key in ["rec_texts", "rec_scores", "rec_polys", "rec_models"]:
+ if key in result or hasattr(result, key):
+ value = (
+ result.get(key, [])
+ if hasattr(result, "get")
+ else getattr(result, key, [])
+ )
+ if value is not None:
+ # Convert to list to ensure it's a plain Python type
+ plain_dict[key] = (
+ list(value)
+ if hasattr(value, "__iter__") and not isinstance(value, str)
+ else value
+ )
+ # Also extract dimension info if present
+ for key in ["image_width", "image_height"]:
+ if key in result or hasattr(result, key):
+ value = (
+ result.get(key)
+ if hasattr(result, "get")
+ else getattr(result, key, None)
+ )
+ if value is not None:
+ plain_dict[key] = value
+ return plain_dict
+
+ copied_paddle_results = [
+ _paddle_result_to_plain_dict(result) for result in paddle_results
+ ]
+
+ modified_paddle_results = _process_page_result_with_hybrid_vlm_ocr(
+ copied_paddle_results,
+ image,
+ img_width,
+ img_height,
+ input_image_width,
+ input_image_height,
+ confidence_threshold,
+ image_name,
+ self.output_folder,
+ padding,
+ )
+
+ return modified_paddle_results
+
+ def _perform_hybrid_paddle_inference_server_ocr(
+ self,
+ image: Image.Image,
+ ocr: Optional[Any] = None,
+ paddle_results: List[Any] = None,
+ confidence_threshold: int = HYBRID_OCR_CONFIDENCE_THRESHOLD,
+ padding: int = HYBRID_OCR_PADDING,
+ image_name: str = "unknown_image_name",
+ input_image_width: int = None,
+ input_image_height: int = None,
+ model_name: str = None,
+ ) -> List[Any]:
+ """
+ Performs OCR using PaddleOCR at line level, then inference-server API for low-confidence lines.
+ Returns modified paddle_results in the same format as PaddleOCR output.
+
+ Args:
+ image: PIL Image to process
+ ocr: PaddleOCR instance (optional, uses self.paddle_ocr if not provided)
+ paddle_results: PaddleOCR results in original format (List of dicts with rec_texts, rec_scores, rec_polys)
+ confidence_threshold: Confidence threshold below which inference-server is used
+ padding: Padding to add around line crops
+ image_name: Name of the image for logging/debugging
+ input_image_width: Original image width (before preprocessing)
+ input_image_height: Original image height (before preprocessing)
+ model_name: Name of the inference-server model to use
+
+ Returns:
+ Modified paddle_results with inference-server replacements for low-confidence lines
+ """
+ if ocr is None:
+ if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
+ ocr = self.paddle_ocr
+ else:
+ raise ValueError(
+ "No OCR object provided and 'paddle_ocr' is not initialized."
+ )
+
+ if paddle_results is None or not paddle_results:
+ return paddle_results
+
+ print("Starting hybrid PaddleOCR + Inference-server OCR process...")
+
+ # Get image dimensions
+ img_width, img_height = image.size
+
+ # Use original dimensions if provided, otherwise use current image dimensions
+ if input_image_width is None:
+ input_image_width = img_width
+ if input_image_height is None:
+ input_image_height = img_height
+
+ # Create a deep copy of paddle_results to modify
+ copied_paddle_results = copy.deepcopy(paddle_results)
+
+ def _normalize_paddle_result_lists(rec_texts, rec_scores, rec_polys):
+ """
+ Normalizes PaddleOCR result lists to ensure they all have the same length.
+ Pads missing entries with appropriate defaults:
+ - rec_texts: empty string ""
+ - rec_scores: 0.0 (low confidence)
+ - rec_polys: empty list []
+
+ Args:
+ rec_texts: List of recognized text strings
+ rec_scores: List of confidence scores
+ rec_polys: List of bounding box polygons
+
+ Returns:
+ Tuple of (normalized_rec_texts, normalized_rec_scores, normalized_rec_polys, max_length)
+ """
+ len_texts = len(rec_texts)
+ len_scores = len(rec_scores)
+ len_polys = len(rec_polys)
+ max_length = max(len_texts, len_scores, len_polys)
+
+ # Only normalize if there's a mismatch
+ if max_length > 0 and (
+ len_texts != max_length
+ or len_scores != max_length
+ or len_polys != max_length
+ ):
+ print(
+ f"Warning: List length mismatch detected - rec_texts: {len_texts}, "
+ f"rec_scores: {len_scores}, rec_polys: {len_polys}. "
+ f"Padding to length {max_length}."
+ )
+
+ # Pad rec_texts
+ if len_texts < max_length:
+ rec_texts = list(rec_texts) + [""] * (max_length - len_texts)
+
+ # Pad rec_scores
+ if len_scores < max_length:
+ rec_scores = list(rec_scores) + [0.0] * (max_length - len_scores)
+
+ # Pad rec_polys
+ if len_polys < max_length:
+ rec_polys = list(rec_polys) + [[]] * (max_length - len_polys)
+
+ return rec_texts, rec_scores, rec_polys, max_length
+
+ def _process_page_result_with_hybrid_inference_server_ocr(
+ page_results: list,
+ image: Image.Image,
+ img_width: int,
+ img_height: int,
+ input_image_width: int,
+ input_image_height: int,
+ confidence_threshold: float,
+ image_name: str,
+ instance_self: object,
+ padding: int = 0,
+ ):
+ """
+ Processes OCR page results using a hybrid system that combines PaddleOCR for initial recognition
+ and an inference server for low-confidence lines. When PaddleOCR's recognition confidence for a
+ detected line is below the specified threshold, the line is re-processed using a higher-quality
+ (but slower) server model and the result is used to replace the low-confidence recognition.
+ Results are kept in PaddleOCR's standard output format for downstream compatibility.
+
+ Args:
+ page_results (list): The list of page result dicts from PaddleOCR to process. Each dict should
+ contain keys like 'rec_texts', 'rec_scores', 'rec_polys', and optionally 'image_width',
+ 'image_height', and 'rec_models'.
+ image (PIL.Image.Image): The PIL Image object of the full page to allow line cropping.
+ img_width (int): The width of the (possibly preprocessed) image in pixels.
+ img_height (int): The height of the (possibly preprocessed) image in pixels.
+ input_image_width (int): The original image width (before any resizing/preprocessing).
+ input_image_height (int): The original image height (before any resizing/preprocessing).
+ confidence_threshold (float): Lines recognized by PaddleOCR with confidence lower than this
+ threshold will be replaced using the inference server.
+ image_name (str): The name of the source image, used for logging/debugging.
+ instance_self (object): The enclosing class instance to access inference invocation.
+ padding (int): Padding to add around line crops.
+
+            Returns:
+                The page_results list, modified in place with higher-confidence text replacements when possible.
+ """
+
+ # Process each page result in paddle_results
+ for page_result in page_results:
+ # Extract text recognition results from the paddle format
+ rec_texts = page_result.get("rec_texts", list())
+ rec_scores = page_result.get("rec_scores", list())
+ rec_polys = page_result.get("rec_polys", list())
+
+ # Normalize lists to ensure they all have the same length
+ rec_texts, rec_scores, rec_polys, num_lines = (
+ _normalize_paddle_result_lists(rec_texts, rec_scores, rec_polys)
+ )
+
+ # Update page_result with normalized lists
+ page_result["rec_texts"] = rec_texts
+ page_result["rec_scores"] = rec_scores
+ page_result["rec_polys"] = rec_polys
+
+ # Initialize rec_models list with "Paddle" as default for all lines
+ if (
+ "rec_models" not in page_result
+ or len(page_result.get("rec_models", [])) != num_lines
+ ):
+ rec_models = ["Paddle"] * num_lines
+ page_result["rec_models"] = rec_models
+ else:
+ rec_models = page_result["rec_models"]
+
+ # Since we're using the exact image PaddleOCR processed, coordinates are directly in image space
+ # No coordinate conversion needed - coordinates match the image dimensions exactly
+
+ # Process each line
+ for i in range(num_lines):
+ line_text = rec_texts[i]
+
+ line_conf = float(rec_scores[i]) * 100 # Convert to percentage
+ bounding_box = rec_polys[i]
+
+ # Skip if bounding box is empty (from padding)
+ # Handle numpy arrays, lists, and None values safely
+ if bounding_box is None:
+ print(
+ f"Current line {i + 1} of {num_lines}: Bounding box is None"
+ )
+ continue
+
+ # Convert to list first to handle numpy arrays safely
+ if hasattr(bounding_box, "tolist"):
+ box = bounding_box.tolist()
+ else:
+ box = bounding_box
+
+ # Check if box is empty (handles both list and numpy array cases)
+ if not box or (isinstance(box, list) and len(box) == 0):
+ print(f"Current line {i + 1} of {num_lines}: Box is empty")
+ continue
+
+ # Skip empty lines
+ if not line_text.strip():
+ print(
+ f"Current line {i + 1} of {num_lines}: Line text is empty"
+ )
+ continue
+
+ # Convert polygon to bounding box
+ x_coords = [p[0] for p in box]
+ y_coords = [p[1] for p in box]
+
+ line_left_paddle = float(min(x_coords))
+ line_top_paddle = float(min(y_coords))
+ line_right_paddle = float(max(x_coords))
+ line_bottom_paddle = float(max(y_coords))
+ line_width_paddle = line_right_paddle - line_left_paddle
+ line_height_paddle = line_bottom_paddle - line_top_paddle
+
+ # Since we're using the exact image PaddleOCR processed, coordinates are already in image space
+ line_left = line_left_paddle
+ line_top = line_top_paddle
+ line_width = line_width_paddle
+ line_height = line_height_paddle
+
+ # Count words in PaddleOCR output
+ paddle_words = line_text.split()
+ paddle_word_count = len(paddle_words)
+
+ # If confidence is low, use inference-server for a second opinion
+ if line_conf <= confidence_threshold:
+
+ # Ensure minimum line height for inference-server processing
+ min_line_height = max(
+ line_height, 20
+ ) # Minimum 20 pixels for text line
+
+ # Calculate crop coordinates with padding
+ # Convert floats to integers and apply padding, clamping to image bounds
+ crop_left = max(0, int(round(line_left - padding)))
+ crop_top = max(0, int(round(line_top - padding)))
+ crop_right = min(
+ img_width, int(round(line_left + line_width + padding))
+ )
+ crop_bottom = min(
+ img_height, int(round(line_top + min_line_height + padding))
+ )
+
+ # Ensure crop dimensions are valid
+ if crop_right <= crop_left or crop_bottom <= crop_top:
+ # Invalid crop, keep original PaddleOCR result
+ print(
+ f"Current line {i + 1} of {num_lines}: Invalid crop, keeping original PaddleOCR result"
+ )
+ continue
+
+ # Crop the line image
+ cropped_image = image.crop(
+ (crop_left, crop_top, crop_right, crop_bottom)
+ )
+
+ # Check if cropped image is too small for inference-server processing
+ crop_width = crop_right - crop_left
+ crop_height = crop_bottom - crop_top
+ if crop_width < 10 or crop_height < 10:
+ # Keep original PaddleOCR result for this line
+ print(
+ f"Current line {i + 1} of {num_lines}: Cropped image is too small, keeping original PaddleOCR result"
+ )
+ continue
+
+ # Ensure cropped image is in RGB mode before passing to inference-server
+ if cropped_image.mode != "RGB":
+ cropped_image = cropped_image.convert("RGB")
+
+ # Match hybrid local VLM: resize/DPI budget then aspect-pad before API
+ try:
+ prepared_for_inference = _prepare_hybrid_line_crop_for_vlm(
+ cropped_image
+ )
+ except Exception as prep_err:
+ print(
+ f"Current line {i + 1} of {num_lines}: "
+ f"Could not prepare image for inference server: {prep_err}"
+ )
+ continue
+
+ # Save the same pixels sent to the API when debugging
+ if SAVE_VLM_INPUT_IMAGES:
+ try:
+ inference_server_debug_dir = os.path.join(
+ self.output_folder,
+ "hybrid_paddle_inference_server_visualisations/hybrid_analysis_input_images",
+ )
+ os.makedirs(inference_server_debug_dir, exist_ok=True)
+ line_text_safe = safe_sanitize_text(line_text)
+ line_text_shortened = line_text_safe[:20]
+ image_name_safe = safe_sanitize_text(image_name)
+ image_name_shortened = image_name_safe[:20]
+ filename = f"{image_name_shortened}_{line_text_shortened}_hybrid_analysis_input_image.png"
+ filepath = os.path.join(
+ inference_server_debug_dir, filename
+ )
+ _save_image_with_config_dpi(
+ prepared_for_inference, filepath
+ )
+ except Exception as save_error:
+ print(
+ f"Warning: Could not save inference-server input image: {save_error}"
+ )
+
+ # Use inference-server for OCR on this line with error handling
+ inference_server_result = None
+ inference_server_rec_texts = []
+ inference_server_rec_scores = []
+
+ print(
+ f" Line {i + 1}/{num_lines}: Sending to inference server "
+ f"(Paddle conf: {line_conf:.1f}%, words: {paddle_word_count})"
+ )
+ try:
+ inference_server_result = _inference_server_ocr_predict(
+ prepared_for_inference,
+ model_name=model_name,
+ image_hybrid_line_prepared=True,
+ )
+ inference_server_rec_texts = (
+ inference_server_result.get("rec_texts", [])
+ if inference_server_result
+ else []
+ )
+
+ inference_server_rec_scores = (
+ inference_server_result.get("rec_scores", [])
+ if inference_server_result
+ else []
+ )
+ except Exception as e:
+ print(
+ f"Current line {i + 1} of {num_lines}: Error in inference-server OCR: {e}"
+ )
+ # Ensure we keep original PaddleOCR result on error
+ inference_server_rec_texts = []
+ inference_server_rec_scores = []
+
+ if not (
+ inference_server_rec_texts and inference_server_rec_scores
+ ):
+ # Inference server returned empty or no results - keep Paddle
+ print(
+ f" Line {i + 1}/{num_lines}: Inference server returned no results "
+ f"(Paddle conf: {line_conf:.1f}%, text: '{line_text[:40]}{'...' if len(line_text) > 40 else ''}'), keeping Paddle result."
+ )
+
+ if inference_server_rec_texts and inference_server_rec_scores:
+ # Combine inference-server words into a single text string
+ inference_server_text = " ".join(inference_server_rec_texts)
+
+                        # Skip lines the server flags as unreadable (returned starting with an empty quoted string)
+                        if inference_server_text.startswith('""'):
+                            print(
+                                f"Inference server text starts with an empty quoted string, "
+                                f"skipping line {i + 1} of {num_lines}"
+                            )
+ continue
+
+ inference_server_word_count = len(
+ inference_server_rec_texts
+ )
+ inference_server_conf = float(
+ np.median(inference_server_rec_scores)
+ ) # Keep as 0-1 range for paddle format
+
+                            # Only replace if the word counts are within the allowed difference
+                            word_count_allowed_difference = 7
+                            if (
+                                abs(inference_server_word_count - paddle_word_count)
+                                <= word_count_allowed_difference
+                            ):
+ message_output = (
+ f" Re-OCR'd line: '{line_text}' (conf: {line_conf:.1f}, words: {paddle_word_count}) "
+ f"-> '{inference_server_text}' (conf: {inference_server_conf*100:.1f}, words: {inference_server_word_count}) [Inference Server]"
+ )
+ print(message_output)
+
+ if REPORT_VLM_OUTPUTS_TO_GUI:
+ try:
+ gr.Info(message_output, duration=2)
+ except Exception:
+ # gr.Info may not be available in worker process, ignore
+ pass
+
+ # For exporting example image comparisons
+ safe_filename = (
+ instance_self._create_safe_filename_with_confidence(
+ line_text,
+ inference_server_text,
+ int(line_conf),
+ int(inference_server_conf * 100),
+ "Inference Server",
+ )
+ )
+
+ if SAVE_EXAMPLE_HYBRID_IMAGES:
+ # Normalize and validate image_name to prevent path traversal attacks
+ normalized_image_name = os.path.normpath(
+ image_name + "_hybrid_paddle_inference_server"
+ )
+ if (
+ ".." in normalized_image_name
+ or "/" in normalized_image_name
+ or "\\" in normalized_image_name
+ ):
+ normalized_image_name = "safe_image"
+
+ hybrid_ocr_examples_folder = (
+ instance_self.output_folder
+ + f"/hybrid_ocr_examples/{normalized_image_name}"
+ )
+ # Validate the constructed path is safe
+ if not validate_folder_containment(
+ hybrid_ocr_examples_folder, OUTPUT_FOLDER
+ ):
+ raise ValueError(
+ f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
+ )
+
+ if not os.path.exists(hybrid_ocr_examples_folder):
+ os.makedirs(hybrid_ocr_examples_folder)
+ output_image_path = (
+ hybrid_ocr_examples_folder
+ + f"/{safe_filename}.png"
+ )
+ _save_image_with_config_dpi(
+ prepared_for_inference, output_image_path
+ )
+
+ # Replace with inference-server result in paddle_results format
+ # Update rec_texts, rec_scores, and rec_models for this line
+ rec_texts[i] = inference_server_text
+ rec_scores[i] = inference_server_conf
+ rec_models[i] = "Inference Server"
+ # Ensure page_result is updated with the modified rec_models list
+ page_result["rec_models"] = rec_models
+ else:
+ print(
+ f" Line: '{line_text}' (conf: {line_conf:.1f}, words: {paddle_word_count}) -> "
+ f"Inference-server result '{inference_server_text}' (conf: {inference_server_conf*100:.1f}, words: {inference_server_word_count}) "
+ f"word count mismatch. Keeping PaddleOCR result."
+ )
+
+ return page_results
+
+ modified_paddle_results = _process_page_result_with_hybrid_inference_server_ocr(
+ copied_paddle_results,
+ image,
+ img_width,
+ img_height,
+ input_image_width,
+ input_image_height,
+ confidence_threshold,
+ image_name,
+ self,
+ padding,
+ )
+
+ return modified_paddle_results
+
+ def perform_ocr(
+ self,
+ image: Union[str, Image.Image, np.ndarray],
+ ocr: Optional[Any] = None,
+ bedrock_runtime=None,
+ gemini_client=None,
+ gemini_config=None,
+ azure_openai_client=None,
+ vlm_model_choice: str = None,
+ inference_server_model_name: str = None,
+ page_index_0: Optional[int] = None,
+ ) -> Tuple[List[OCRResult], int, int, str]:
+ """
+ Performs OCR on the given image using the configured engine.
+
+ page_index_0: 0-based page index for VLM prompt/response log filenames when the
+ basename does not encode the page (optional).
+ """
+        if isinstance(image, str):
+            image_path = image
+            image_name = os.path.basename(image)
+            image = Image.open(image)
+        elif isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+            image_path = ""
+            image_name = "unknown_image_name"
+        else:
+            # Already a PIL Image; make sure image_path/image_name are always defined
+            image_path = ""
+            image_name = "unknown_image_name"
+
+ # Pre-process image
+ # Store original dimensions BEFORE preprocessing (needed for coordinate conversion)
+ original_image_width = None
+ original_image_height = None
+ original_image_for_visualization = (
+ None # Store original image for visualization
+ )
+
+ if PREPROCESS_LOCAL_OCR_IMAGES:
+ # print("Pre-processing image...")
+ # Get original dimensions before preprocessing
+ original_image_width, original_image_height = image.size
+ # Store original image for visualization (coordinates are in original space)
+ original_image_for_visualization = image.copy()
+ image, preprocessing_metadata = self.image_preprocessor.preprocess_image(
+ image
+ )
+ # Only export preprocessed images when they are actually used as OCR input.
+ # Full-page VLM-style OCR paths use the original image for coordinate consistency.
+ save_preprocessed_for_engine = self.ocr_engine not in (
+ "vlm",
+ "inference-server",
+ "bedrock-vlm",
+ "gemini-vlm",
+ "azure-openai-vlm",
+ )
+ if SAVE_PREPROCESS_IMAGES and save_preprocessed_for_engine:
+ # print("Saving pre-processed image...")
+ image_basename = os.path.basename(image_name)
+ output_path = os.path.join(
+ self.output_folder,
+ "preprocessed_images",
+ image_basename + "_preprocessed_image.png",
+ )
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
+ _save_image_with_config_dpi(image, output_path)
+ # print(f"Pre-processed image saved to {output_path}")
+ else:
+ preprocessing_metadata = dict()
+ original_image_width, original_image_height = image.size
+ # When preprocessing is disabled, the current image is the original
+ original_image_for_visualization = image.copy()
+
+ image_width, image_height = image.size
+
+ # Store original image for line-to-word conversion when PaddleOCR processes original image
+ original_image_for_cropping = None
+ paddle_processed_original = False
+
+        # Note: in testing, the hybrid-paddle path has not clearly improved results
+ if self.ocr_engine == "hybrid-paddle":
+            # Run hybrid OCR (Tesseract + PaddleOCR), cropping low-confidence words from the page image:
+ ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
+
+ elif self.ocr_engine == "hybrid-vlm":
+            # Run hybrid VLM OCR (Tesseract + VLM), cropping low-confidence words from the page image:
+ ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
+
+ # Initialize VLM token tracking variables
+ vlm_total_input_tokens = 0
+ vlm_total_output_tokens = 0
+ vlm_model_name = ""
+
+ if self.ocr_engine == "vlm":
+ # VLM page-level OCR - sends whole page to VLM and gets structured line-level results
+ # Use original image (before preprocessing) for VLM since coordinates should be in original space
+ vlm_image = (
+ original_image_for_visualization
+ if original_image_for_visualization is not None
+ else image
+ )
+ ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name = (
+ _vlm_page_ocr_predict(
+ vlm_image,
+ image_name=image_name,
+ output_folder=self.output_folder,
+ page_index_0=page_index_0,
+ )
+ )
+ vlm_total_input_tokens = vlm_input_tokens
+ vlm_total_output_tokens = vlm_output_tokens
+ # VLM returns data already in the expected format, so no conversion needed
+
+ elif self.ocr_engine == "inference-server":
+ # Inference-server page-level OCR - sends whole page to inference-server API and gets structured line-level results
+ # Use original image (before preprocessing) for inference-server since coordinates should be in original space
+ inference_server_image = (
+ original_image_for_visualization
+ if original_image_for_visualization is not None
+ else image
+ )
+ ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name = (
+ _inference_server_page_ocr_predict(
+ inference_server_image,
+ image_name=image_name,
+ normalised_coords_range=999,
+ output_folder=self.output_folder,
+ model_name=inference_server_model_name,
+ page_index_0=page_index_0,
+ )
+ )
+ vlm_total_input_tokens = vlm_input_tokens
+ vlm_total_output_tokens = vlm_output_tokens
+ # Inference-server returns data already in the expected format, so no conversion needed
+
+ elif self.ocr_engine == "bedrock-vlm":
+ # Bedrock page-level OCR - sends whole page to Bedrock API and gets structured line-level results
+ # Use original image (before preprocessing) for Bedrock since coordinates should be in original space
+ bedrock_image = (
+ original_image_for_visualization
+ if original_image_for_visualization is not None
+ else image
+ )
+ # Get model choice from parameter or config
+ from tools.config import CLOUD_VLM_MODEL_CHOICE
+
+ model_choice = (
+ vlm_model_choice if vlm_model_choice else CLOUD_VLM_MODEL_CHOICE
+ )
+
+ # Full-page VLM prompt instructs all models to use 0-999 coordinates; convert for any model_choice
+ normalised_coords_range = 999
+
+ ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name = (
+ _bedrock_page_ocr_predict(
+ bedrock_image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=self.output_folder,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_index_0,
+ )
+ )
+ vlm_total_input_tokens = vlm_input_tokens
+ vlm_total_output_tokens = vlm_output_tokens
+ # Bedrock returns data already in the expected format, so no conversion needed
+
+ elif self.ocr_engine == "gemini-vlm":
+ # Gemini page-level OCR - sends whole page to Gemini API and gets structured line-level results
+ # Use original image (before preprocessing) for Gemini since coordinates should be in original space
+ gemini_image = (
+ original_image_for_visualization
+ if original_image_for_visualization is not None
+ else image
+ )
+ # Get model choice from parameter or config
+ from tools.config import CLOUD_VLM_MODEL_CHOICE
+
+ model_choice = (
+ vlm_model_choice if vlm_model_choice else CLOUD_VLM_MODEL_CHOICE
+ )
+ ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name = (
+ _gemini_page_ocr_predict(
+ gemini_image,
+ image_name=image_name,
+ normalised_coords_range=999, # Full-page prompt uses 0-999 coordinates
+ output_folder=self.output_folder,
+ model_choice=model_choice,
+ client=gemini_client,
+ config=gemini_config,
+ page_index_0=page_index_0,
+ )
+ )
+ vlm_total_input_tokens = vlm_input_tokens
+ vlm_total_output_tokens = vlm_output_tokens
+ # Gemini returns data already in the expected format, so no conversion needed
+
+ elif self.ocr_engine == "azure-openai-vlm":
+ # Azure/OpenAI page-level OCR - sends whole page to Azure/OpenAI API and gets structured line-level results
+ # Use original image (before preprocessing) for Azure/OpenAI since coordinates should be in original space
+ azure_image = (
+ original_image_for_visualization
+ if original_image_for_visualization is not None
+ else image
+ )
+ # Get model choice from parameter or config
+ from tools.config import CLOUD_VLM_MODEL_CHOICE
+
+ model_choice = (
+ vlm_model_choice if vlm_model_choice else CLOUD_VLM_MODEL_CHOICE
+ )
+ ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name = (
+ _azure_openai_page_ocr_predict(
+ azure_image,
+ image_name=image_name,
+ normalised_coords_range=999, # Full-page prompt uses 0-999 coordinates
+ output_folder=self.output_folder,
+ model_choice=model_choice,
+ client=azure_openai_client,
+ page_index_0=page_index_0,
+ )
+ )
+ vlm_total_input_tokens = vlm_input_tokens
+ vlm_total_output_tokens = vlm_output_tokens
+ # Azure/OpenAI returns data already in the expected format, so no conversion needed
+
+ elif self.ocr_engine == "tesseract":
+
+ ocr_data = pytesseract.image_to_data(
+ image,
+ output_type=pytesseract.Output.DICT,
+ config=self.tesseract_config,
+ lang=self.tesseract_lang, # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
+ )
+
+ if TESSERACT_WORD_LEVEL_OCR is False:
+ ocr_df = pd.DataFrame(ocr_data)
+
+ # Filter out invalid entries (confidence == -1)
+ ocr_df = ocr_df[ocr_df.conf != -1]
+
+ # Group by line and aggregate text
+ line_groups = ocr_df.groupby(["block_num", "par_num", "line_num"])
+
+ ocr_data = line_groups.apply(self._calculate_line_bbox).reset_index()
+
+ # Convert DataFrame to dictionary of lists format expected by downstream code
+ ocr_data = {
+ "text": ocr_data["text"].tolist(),
+ "left": ocr_data["left"].astype(int).tolist(),
+ "top": ocr_data["top"].astype(int).tolist(),
+ "width": ocr_data["width"].astype(int).tolist(),
+ "height": ocr_data["height"].astype(int).tolist(),
+ "conf": ocr_data["conf"].tolist(),
+ "model": ["Tesseract"] * len(ocr_data), # Add model field
+ }
+
+ elif (
+ self.ocr_engine == "paddle"
+ or self.ocr_engine == "hybrid-paddle-vlm"
+ or self.ocr_engine == "hybrid-paddle-inference-server"
+ ):
+
+ if ocr is None:
+ if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
+ ocr = self.paddle_ocr
+ else:
+ raise ValueError(
+ "No OCR object provided and 'paddle_ocr' is not initialised."
+ )
+
+ prepare_page_for_hybrid_vlm = (
+ PREPARE_PAGE_FOR_HYBRID_VLM_BEFORE_PADDLE
+ and self.ocr_engine
+ in ["hybrid-paddle-vlm", "hybrid-paddle-inference-server"]
+ )
+
+ paddle_prepared_width = None
+ paddle_prepared_height = None
+ paddle_input_image = image
+
+ if prepare_page_for_hybrid_vlm:
+ # Resize/pad the full page once so that line crops inherit the
+ # VLM-ready pixel density (reduces per-crop VLM resizing work).
+ if paddle_input_image.mode != "RGB":
+ paddle_input_image = paddle_input_image.convert("RGB")
+ paddle_input_image = _prepare_image_for_vlm(
+ paddle_input_image,
+ hybrid_vlm=True,
+ )
+ paddle_prepared_width, paddle_prepared_height = paddle_input_image.size
+ print(
+ "Hybrid OCR: preparing PaddleOCR input page for VLM constraints "
+ f"({paddle_prepared_width}x{paddle_prepared_height})."
+ )
+
+ if not image_path:
+ image_np = np.array(
+ paddle_input_image
+ ) # image_processed (possibly resized)
+
+ # Check that sizes match the PaddleOCR input image we constructed.
+ image_np_height, image_np_width = image_np.shape[:2]
+ expected_w, expected_h = paddle_input_image.size
+ if image_np_width != expected_w or image_np_height != expected_h:
+ raise ValueError(
+ f"Image size mismatch: {image_np_width}x{image_np_height} != {expected_w}x{expected_h}"
+ )
+
+ # PaddleOCR may need an RGB image. Ensure it has 3 channels.
+ if len(image_np.shape) == 2:
+ image_np = np.stack([image_np] * 3, axis=-1)
+ else:
+ image_np = np.array(paddle_input_image)
+
+ paddle_results = ocr.predict(image_np)
+ # PaddleOCR processed the prepared image (not a file-path open)
+ paddle_processed_original = False
+
+ # Store the exact image that PaddleOCR processed (convert numpy array back to PIL Image)
+ # This ensures we crop from the exact same image PaddleOCR analyzed.
+ paddle_processed_image = Image.fromarray(image_np.astype(np.uint8))
+ else:
+ # When using image path, load image to get dimensions
+ temp_image = Image.open(image_path)
+
+ # For file path, we still keep the original image for visualization
+ # and for any downstream coordinate scaling.
+ original_image_for_cropping = temp_image.copy()
+
+ if prepare_page_for_hybrid_vlm:
+ # Run PaddleOCR on the VLM-prepared page (to keep rec_polys
+ # consistent with the line crops we send to VLM).
+ paddle_input_image = temp_image
+ if paddle_input_image.mode != "RGB":
+ paddle_input_image = paddle_input_image.convert("RGB")
+ paddle_input_image = _prepare_image_for_vlm(
+ paddle_input_image,
+ hybrid_vlm=True,
+ )
+ paddle_prepared_width, paddle_prepared_height = (
+ paddle_input_image.size
+ )
+
+ image_np = np.array(paddle_input_image)
+ if len(image_np.shape) == 2:
+ image_np = np.stack([image_np] * 3, axis=-1)
+ paddle_results = ocr.predict(image_np)
+ paddle_processed_original = False
+ paddle_processed_image = paddle_input_image.copy()
+ else:
+ # Use PaddleOCR's file-path loading (original behaviour).
+ try:
+ paddle_results = ocr.predict(image_path)
+ except Exception as ocr_path_exc:
+ # PaddleOCR's file-path path can hit OpenCV decode issues on
+ # specific pages. Retry using the already-loaded PIL image
+ # to avoid the OpenCV "read from disk" path.
+ print(
+ f"WARNING: PaddleOCR failed reading image path via OpenCV "
+ f"for {image_path}. Retrying with in-memory numpy image. "
+ f"Error: {ocr_path_exc}"
+ )
+ paddle_input_image = temp_image
+ if paddle_input_image.mode != "RGB":
+ paddle_input_image = paddle_input_image.convert("RGB")
+ image_np = np.array(paddle_input_image)
+ if len(image_np.shape) == 2:
+ image_np = np.stack([image_np] * 3, axis=-1)
+ paddle_results = ocr.predict(image_np)
+ # PaddleOCR processed the original image from file path
+ paddle_processed_original = True
+ # Store the exact image that PaddleOCR processed (from file path)
+ paddle_processed_image = temp_image.copy()
+
+ # Save PaddleOCR visualization with bounding boxes
+ if paddle_results and self.save_page_ocr_visualisations is True:
+
+ for res in paddle_results:
+ # self.output_folder is already validated and normalized at construction time
+ paddle_viz_folder = os.path.join(
+ self.output_folder, "paddle_visualisations"
+ )
+ # Double-check the constructed path is safe
+ if not validate_folder_containment(
+ paddle_viz_folder, OUTPUT_FOLDER
+ ):
+ raise ValueError(
+ f"Unsafe paddle visualisations folder path: {paddle_viz_folder}"
+ )
+
+ os.makedirs(paddle_viz_folder, exist_ok=True)
+ res.save_to_img(paddle_viz_folder)
+
+ # If we prepared/resized the page before running PaddleOCR, ensure
+ # each result dict reports the correct coordinate space size.
+ # This lets _convert_paddle_to_tesseract_format scale bboxes back
+ # into the original page pixel space reliably.
+ if (
+ prepare_page_for_hybrid_vlm
+ and paddle_prepared_width
+ and paddle_prepared_height
+ and isinstance(paddle_results, list)
+ ):
+ for res in paddle_results:
+ try:
+ if isinstance(res, dict):
+ res["image_width"] = paddle_prepared_width
+ res["image_height"] = paddle_prepared_height
+ except Exception:
+ pass
+
+ if self.ocr_engine == "hybrid-paddle-vlm":
+
+ modified_paddle_results = self._perform_hybrid_paddle_vlm_ocr(
+ paddle_processed_image, # Use the exact image PaddleOCR processed
+ ocr=ocr,
+ paddle_results=copy.deepcopy(paddle_results),
+ image_name=image_name,
+ input_image_width=original_image_width,
+ input_image_height=original_image_height,
+ )
+
+ elif self.ocr_engine == "hybrid-paddle-inference-server":
+
+ modified_paddle_results = self._perform_hybrid_paddle_inference_server_ocr(
+ paddle_processed_image, # Use the exact image PaddleOCR processed
+ ocr=ocr,
+ paddle_results=copy.deepcopy(paddle_results),
+ image_name=image_name,
+ input_image_width=original_image_width,
+ input_image_height=original_image_height,
+ model_name=inference_server_model_name,
+ )
+ else:
+ modified_paddle_results = copy.deepcopy(paddle_results)
+
+ ocr_data = self._convert_paddle_to_tesseract_format(
+ modified_paddle_results,
+ input_image_width=original_image_width,
+ input_image_height=original_image_height,
+ )
+
+ if self.save_page_ocr_visualisations is True:
+ # Save output to image with identified bounding boxes
+ # Use original image since coordinates are in original image space
+ # Prefer original_image_for_cropping (when PaddleOCR processed from file path),
+ # otherwise use original_image_for_visualization (stored before preprocessing)
+ viz_image = (
+ original_image_for_cropping
+ if original_image_for_cropping is not None
+ else (
+ original_image_for_visualization
+ if original_image_for_visualization is not None
+ else image
+ )
+ )
+ if isinstance(viz_image, Image.Image):
+ # Convert PIL Image to numpy array in BGR format for OpenCV
+ image_cv = cv2.cvtColor(np.array(viz_image), cv2.COLOR_RGB2BGR)
+ else:
+ image_cv = np.array(viz_image)
+ if len(image_cv.shape) == 2:
+ image_cv = cv2.cvtColor(image_cv, cv2.COLOR_GRAY2BGR)
+ elif len(image_cv.shape) == 3 and image_cv.shape[2] == 3:
+ # Assume RGB, convert to BGR
+ image_cv = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)
+
+ # Draw all bounding boxes on the image
+ for i in range(len(ocr_data["text"])):
+ left = int(ocr_data["left"][i])
+ top = int(ocr_data["top"][i])
+ width = int(ocr_data["width"][i])
+ height = int(ocr_data["height"][i])
+ # Ensure coordinates are within image bounds
+ left = max(0, min(left, image_cv.shape[1] - 1))
+ top = max(0, min(top, image_cv.shape[0] - 1))
+ right = max(left + 1, min(left + width, image_cv.shape[1]))
+ bottom = max(top + 1, min(top + height, image_cv.shape[0]))
+ cv2.rectangle(
+ image_cv, (left, top), (right, bottom), (0, 255, 0), 2
+ )
+
+ # Save the visualization once with all boxes drawn
+ paddle_viz_folder = os.path.join(
+ self.output_folder, "paddle_visualisations"
+ )
+ # Double-check the constructed path is safe
+ if not validate_folder_containment(paddle_viz_folder, OUTPUT_FOLDER):
+ raise ValueError(
+ f"Unsafe paddle visualisations folder path: {paddle_viz_folder}"
+ )
+
+ os.makedirs(paddle_viz_folder, exist_ok=True)
+
+ # Generate safe filename
+ if image_name:
+ base_name = os.path.splitext(os.path.basename(image_name))[0]
+ # Increment the number at the end of base_name
+ # This converts zero-indexed input to one-indexed output
+ incremented_base_name = base_name
+ # Find the number pattern at the end
+ # Matches patterns like: _0, _00, 0, 00, etc.
+ pattern = r"(\d+)$"
+ match = re.search(pattern, base_name)
+ if match:
+ number_str = match.group(1)
+ number = int(number_str)
+ incremented_number = number + 1
+ # Preserve the same number of digits (padding with zeros if needed)
+ incremented_str = str(incremented_number).zfill(len(number_str))
+ incremented_base_name = re.sub(
+ pattern, incremented_str, base_name
+ )
+ # Sanitize filename to avoid issues with special characters
+ incremented_base_name = safe_sanitize_text(
+ incremented_base_name, max_length=50
+ )
+ filename = f"{incremented_base_name}_initial_bounding_boxes.jpg"
+ else:
+ timestamp = int(time.time())
+ filename = f"initial_bounding_boxes_{timestamp}.jpg"
+
+ output_path = os.path.join(paddle_viz_folder, filename)
+ max_filesize = 500 * 1024 # 500kb in bytes
+ quality = 95 # Start high, OpenCV JPEG quality range is 0-100
+
+ # Try lowering JPEG quality until file is below size limit
+ is_saved = False
+ while quality >= 10:
+ cv2.imwrite(
+ output_path, image_cv, [int(cv2.IMWRITE_JPEG_QUALITY), quality]
+ )
+ if (
+ os.path.exists(output_path)
+ and os.path.getsize(output_path) <= max_filesize
+ ):
+ is_saved = True
+ break
+ quality -= 5
+
+ if not is_saved:
+                # Fall back to the lowest quality if the file could not be brought under 500kb
+ cv2.imwrite(
+ output_path, image_cv, [int(cv2.IMWRITE_JPEG_QUALITY), 10]
+ )
+
+        elif self.ocr_engine not in ("hybrid-paddle", "hybrid-vlm"):
+            # hybrid-paddle and hybrid-vlm were already handled in the branch further above
+            raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
+
+ # Always check for scale_factor, even if preprocessing_metadata is empty
+ # This ensures rescaling happens correctly when preprocessing was applied
+ scale_factor = (
+ preprocessing_metadata.get("scale_factor", 1.0)
+ if preprocessing_metadata
+ else 1.0
+ )
+ if scale_factor != 1.0:
+ # Skip rescaling for PaddleOCR since _convert_paddle_to_tesseract_format
+ # already scales coordinates directly to original image dimensions
+ # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
+ # Skip rescaling for VLM since it returns coordinates in original image space
+ if (
+ self.ocr_engine == "paddle"
+ or self.ocr_engine == "hybrid-paddle-vlm"
+ or self.ocr_engine == "hybrid-paddle-inference-server"
+ or self.ocr_engine == "vlm"
+ or self.ocr_engine == "inference-server"
+ or self.ocr_engine == "bedrock-vlm"
+ or self.ocr_engine == "gemini-vlm"
+ or self.ocr_engine == "azure-openai-vlm"
+ ):
+ pass
+ # print(f"Skipping rescale_ocr_data for PaddleOCR/VLM (already scaled to original dimensions)")
+ else:
+ # print("rescaling ocr_data with scale_factor: ", scale_factor)
+ ocr_data = rescale_ocr_data(ocr_data, scale_factor)
+
+ # print("Finished rescaling ocr_data")
+
+ # Convert line-level results to word-level if configured and needed
+ if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
+ # print("Converting line-level OCR results to word-level...")
+
+ # Check if coordinates need to be scaled to match the image we're cropping from
+ # For PaddleOCR: _convert_paddle_to_tesseract_format converts coordinates to original image space
+ # - If PaddleOCR processed the original image (image_path provided), crop from original image (no scaling)
+ # - If PaddleOCR processed the preprocessed image (no image_path), scale coordinates to preprocessed space and crop from preprocessed image
+ # For Tesseract: OCR runs on preprocessed image
+ # - If scale_factor != 1.0, rescale_ocr_data converted coordinates to original space, so crop from original image
+ # - If scale_factor == 1.0, coordinates are still in preprocessed space, so crop from preprocessed image
+
+ needs_scaling = False
+ crop_image = image # Default to preprocessed image
+ crop_image_width = image_width
+ crop_image_height = image_height
+
+ if (
+ PREPROCESS_LOCAL_OCR_IMAGES
+ and original_image_width
+ and original_image_height
+ ):
+ if self.ocr_engine in (
+ "paddle",
+ "hybrid-paddle-vlm",
+ "hybrid-paddle-inference-server",
+ ):
+ # PaddleOCR coordinates are converted to original space by _convert_paddle_to_tesseract_format
+ # hybrid-paddle-vlm also uses PaddleOCR and converts to original space
+ if paddle_processed_original:
+ # PaddleOCR processed the original image, so crop from original image
+ # No scaling needed - coordinates are already in original space
+ crop_image = original_image_for_cropping
+ crop_image_width = original_image_width
+ crop_image_height = original_image_height
+ needs_scaling = False
+ else:
+ # PaddleOCR processed the preprocessed image, so scale coordinates to preprocessed space
+ needs_scaling = True
+ elif self.ocr_engine in (
+ "vlm",
+ "inference-server",
+ "bedrock-vlm",
+ "gemini-vlm",
+ "azure-openai-vlm",
+ ):
+ # VLM/Cloud VLM returns coordinates in original image space (since we pass original image to VLM)
+ # So we need to crop from the original image, not the preprocessed image
+ if original_image_for_visualization is not None:
+ # Coordinates are in original space, so crop from original image
+ crop_image = original_image_for_visualization
+ crop_image_width = original_image_width
+ crop_image_height = original_image_height
+ needs_scaling = False
+ else:
+ # Fallback to preprocessed image if original not available
+ needs_scaling = False
+ elif self.ocr_engine == "tesseract":
+ # For Tesseract: if scale_factor != 1.0, rescale_ocr_data converted coordinates to original space
+ # So we need to crop from the original image, not the preprocessed image
+ if (
+ scale_factor != 1.0
+ and original_image_for_visualization is not None
+ ):
+ # Coordinates are in original space, so crop from original image
+ crop_image = original_image_for_visualization
+ crop_image_width = original_image_width
+ crop_image_height = original_image_height
+ needs_scaling = False
+ else:
+ # scale_factor == 1.0, so coordinates are still in preprocessed space
+ # Crop from preprocessed image - no scaling needed
+ needs_scaling = False
+
+ if needs_scaling:
+ # Calculate scale factors from original to preprocessed
+ scale_x = image_width / original_image_width
+ scale_y = image_height / original_image_height
+ # Scale coordinates to preprocessed image space for cropping
+ scaled_ocr_data = {
+ "text": ocr_data["text"],
+ "left": [x * scale_x for x in ocr_data["left"]],
+ "top": [y * scale_y for y in ocr_data["top"]],
+ "width": [w * scale_x for w in ocr_data["width"]],
+ "height": [h * scale_y for h in ocr_data["height"]],
+ "conf": ocr_data["conf"],
+ "model": ocr_data["model"],
+ }
+ ocr_data = self._convert_line_to_word_level(
+ scaled_ocr_data,
+ crop_image_width,
+ crop_image_height,
+ crop_image,
+ image_name=image_name,
+ )
+ # Scale word-level results back to original image space
+ scale_factor_x = original_image_width / image_width
+ scale_factor_y = original_image_height / image_height
+ for i in range(len(ocr_data["left"])):
+ ocr_data["left"][i] = ocr_data["left"][i] * scale_factor_x
+ ocr_data["top"][i] = ocr_data["top"][i] * scale_factor_y
+ ocr_data["width"][i] = ocr_data["width"][i] * scale_factor_x
+ ocr_data["height"][i] = ocr_data["height"][i] * scale_factor_y
+ else:
+ # No scaling needed - coordinates match the crop image space
+ ocr_data = self._convert_line_to_word_level(
+ ocr_data,
+ crop_image_width,
+ crop_image_height,
+ crop_image,
+ image_name=image_name,
+ )
+
+ # print("Finished converting line level results to word level")
+
+ # The rest of the processing pipeline works the same for all OCR engines
+ ocr_result = ocr_data
+
+ # Filter out empty strings and non-positive confidence.
+ # Do not use int(conf): fractional confidences in (0, 1) from line→word conversion
+ # (e.g. AdaptiveSegmenter) would become 0 and drop every word (empty OCR tables / UI).
+ valid_indices = []
+ for i, text in enumerate(ocr_result["text"]):
+ if not (text and str(text).strip()):
+ continue
+ try:
+ c = float(ocr_result["conf"][i])
+ except (TypeError, ValueError, IndexError, KeyError):
+ continue
+ if c > 0:
+ valid_indices.append(i)
+
+ # Determine default model based on OCR engine if model field is not present
+ if "model" in ocr_result and len(ocr_result["model"]) == len(
+ ocr_result["text"]
+ ):
+ # Model field exists and has correct length - use it (preserves VLM/inference-server replacements)
+ def get_model_name(idx):
+ return ocr_result["model"][idx]
+
+ else:
+ # Model field not present or incorrect length - use a default based on the engine
+ engine_default_models = {
+ "tesseract": "Tesseract",
+ "paddle": "Paddle",
+ "hybrid-paddle": "Tesseract",
+ "hybrid-vlm": "Tesseract",
+ "hybrid-paddle-vlm": "Paddle",
+ "hybrid-paddle-inference-server": "Paddle",
+ "vlm": "VLM",
+ "inference-server": "Inference Server",
+ "bedrock-vlm": "Bedrock",
+ "gemini-vlm": "Gemini",
+ "azure-openai-vlm": "Azure/OpenAI",
+ }
+ default_model = engine_default_models.get(self.ocr_engine)
+
+ def get_model_name(idx):
+ return default_model
+
+ output = [
+ OCRResult(
+ text=clean_unicode_text(
+ ocr_result["text"][i], preserve_international_scripts=True
+ ),
+ left=ocr_result["left"][i],
+ top=ocr_result["top"][i],
+ width=ocr_result["width"][i],
+ height=ocr_result["height"][i],
+ conf=round(float(ocr_result["conf"][i]), 0),
+ model=get_model_name(i),
+ )
+ for i in valid_indices
+ ]
+
+ return output, vlm_total_input_tokens, vlm_total_output_tokens, vlm_model_name
+
+ def analyze_text(
+ self,
+ line_level_ocr_results: List[OCRResult],
+ ocr_results_with_words: Dict[str, Dict],
+ chosen_redact_comprehend_entities: List[str],
+ pii_identification_method: str = LOCAL_PII_OPTION,
+ comprehend_client="",
+ custom_entities: List[str] = custom_entities,
+ language: Optional[str] = DEFAULT_LANGUAGE,
+ nlp_analyser: AnalyzerEngine = None,
+ bedrock_runtime=None,
+ model_choice: str = CLOUD_LLM_PII_MODEL_CHOICE,
+ custom_llm_instructions: str = "",
+ chosen_llm_entities: List[str] = None,
+ file_name: Optional[str] = None,
+ page_number: Optional[int] = None,
+ **text_analyzer_kwargs,
+ ) -> Tuple[List[CustomImageRecognizerResult], int, str, int, int]:
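+ """
+ Analyse line-level OCR results for PII entities.
+
+ Depending on pii_identification_method, detection runs locally via Presidio,
+ via AWS Comprehend, or via an LLM (AWS Bedrock, an inference server, or local
+ transformers models). For the AWS and LLM methods, custom/deny-list entities
+ are first detected locally with Presidio before the batched remote calls.
+
+ Returns a tuple of (bounding box results, number of Comprehend/LLM queries,
+ LLM model name, total LLM input tokens, total LLM output tokens).
+ """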
+
+ page_text = ""
+ page_text_mapping = list()
+ all_text_line_results = list()
+ comprehend_query_number = 0
+
+ # Track LLM token usage
+ llm_total_input_tokens = 0
+ llm_total_output_tokens = 0
+ llm_model_name = ""
+
+ # Extract allow_list from text_analyzer_kwargs if provided
+ # This allows allow_list terms to "overrule" LLM PII detection results
+ allow_list = text_analyzer_kwargs.get("allow_list", [])
+ if allow_list is None:
+ allow_list = []
+
+ # Default chosen_llm_entities to chosen_redact_comprehend_entities if not provided
+ if chosen_llm_entities is None:
+ chosen_llm_entities = chosen_redact_comprehend_entities
+
+ # Filter out CUSTOM_VLM_* entities (these are handled separately via VLM, not LLM)
+ # and validate that we have either entities or custom instructions
+ filtered_llm_entities = [
+ entity
+ for entity in (chosen_llm_entities or [])
+ if not entity.startswith("CUSTOM_VLM_")
+ ]
+
+ # If only CUSTOM_VLM_* entities (and no custom instructions), skip LLM analysis and return blank
+ if not filtered_llm_entities and (
+ not custom_llm_instructions or not custom_llm_instructions.strip()
+ ):
+ if pii_identification_method == AWS_LLM_PII_OPTION:
+ return (list(), 0, "", 0, 0)
+ raise ValueError(
+ "No standard entities selected for LLM PII detection and no custom instructions provided. "
+ "Please select at least one entity type (excluding CUSTOM_VLM_* entities) or provide custom instructions."
+ )
+
+ if not nlp_analyser:
+ nlp_analyser = self.analyzer_engine
+
+ # Collect all text and create mapping
+ for i, line_level_ocr_result in enumerate(line_level_ocr_results):
+ if page_text:
+ page_text += " "
+ start_pos = len(page_text)
+ page_text += line_level_ocr_result.text
+ # Note: We're not passing line_characters here since it's not needed for this use case
+ page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
+
+ # Determine language for downstream services
+ aws_language = language or getattr(self, "language", None) or "en"
+
+ valid_language_entities = nlp_analyser.registry.get_supported_entities(
+ languages=[language]
+ )
+ if "CUSTOM" not in valid_language_entities:
+ valid_language_entities.append("CUSTOM")
+ if "CUSTOM_FUZZY" not in valid_language_entities:
+ valid_language_entities.append("CUSTOM_FUZZY")
+
+ # Process using either Local or AWS Comprehend
+ if pii_identification_method == LOCAL_PII_OPTION:
+
+ language_supported_entities = filter_entities_for_language(
+ custom_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+
+ else:
+ out_message = f"No relevant entities supported for language: {language}"
+ print(out_message)
+ raise Warning(out_message)
+
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ # Also exclude allow_list since we pass it explicitly
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ analyzer_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+ all_text_line_results = map_back_entity_results(
+ analyzer_result,
+ page_text_mapping,
+ all_text_line_results,
+ allow_list=allow_list,
+ )
+
+ elif pii_identification_method == AWS_PII_OPTION:
+
+ # Run local detection for any custom entities (including CUSTOM/CUSTOM_FUZZY)
+ local_custom_entities = [
+ entity
+ for entity in (chosen_redact_comprehend_entities or [])
+ if entity in (custom_entities or [])
+ or entity in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ if local_custom_entities:
+ # Filter entities to only include those supported by the language
+ language_supported_entities = filter_entities_for_language(
+ local_custom_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ page_analyser_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+ all_text_line_results = map_back_entity_results(
+ page_analyser_result,
+ page_text_mapping,
+ all_text_line_results,
+ allow_list=allow_list,
+ )
+
+ # Guard: only call AWS Comprehend when at least one non-custom Comprehend entity is selected.
+ aws_comprehend_entities = [
+ entity
+ for entity in (chosen_redact_comprehend_entities or [])
+ if entity in (FULL_COMPREHEND_ENTITY_LIST or [])
+ and entity not in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ # Process text in batches for AWS Comprehend
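+ # Batching strategy: accumulate words until DEFAULT_NEW_BATCH_WORD_COUNT or
+ # DEFAULT_NEW_BATCH_CHAR_COUNT is reached, then keep adding words until the
+ # next phrase-ending punctuation (or the end of the line) so that each batch
+ # sent to Comprehend ends on a natural phrase boundary.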
+ current_batch = ""
+ current_batch_mapping = list()
+ batch_char_count = 0
+ batch_word_count = 0
+
+ for i, text_line in enumerate(
+ line_level_ocr_results
+ ): # Changed from line_level_text_results_list
+ words = text_line.text.split()
+ word_start_positions = list()
+ current_pos = 0
+
+ for word in words:
+ word_start_positions.append(current_pos)
+ current_pos += len(word) + 1
+
+ word_idx = 0
+ while word_idx < len(words):
+ word = words[word_idx]
+ new_batch_char_count = len(current_batch) + len(word) + 1
+
+ # Check if we've hit the limit
+ limit_reached = (
+ batch_word_count >= DEFAULT_NEW_BATCH_WORD_COUNT
+ or new_batch_char_count >= DEFAULT_NEW_BATCH_CHAR_COUNT
+ )
+
+ if limit_reached:
+ # Add the current word to the batch first
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if (
+ not current_batch_mapping
+ or current_batch_mapping[-1][1] != i
+ ):
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ None,
+ word_start_positions[word_idx],
+ )
+ )
+
+ # Check if current word ends with phrase punctuation
+ if ends_with_phrase_punctuation(word):
+ # Process current batch
+ all_text_line_results = do_aws_comprehend_call(
+ current_batch,
+ current_batch_mapping,
+ comprehend_client,
+ aws_language,
+ text_analyzer_kwargs.get("allow_list", []),
+ aws_comprehend_entities,
+ all_text_line_results,
+ )
+ if aws_comprehend_entities:
+ comprehend_query_number += (
+ len(current_batch.strip())
+ + COMPREHEND_CHARACTERS_PER_UNIT
+ - 1
+ ) // COMPREHEND_CHARACTERS_PER_UNIT
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx += 1
+ else:
+ # Look ahead in current line for phrase-ending punctuation or end of line
+ lookahead_idx = word_idx + 1
+ lookahead_batch = current_batch
+ lookahead_char_count = batch_char_count
+ lookahead_word_count = batch_word_count
+ lookahead_mapping = list(current_batch_mapping)
+
+ # Continue adding words until we find phrase-ending punctuation or end of line
+ while lookahead_idx < len(words):
+ lookahead_word = words[lookahead_idx]
+
+ # Add the word to lookahead batch
+ if lookahead_batch:
+ lookahead_batch += " "
+ lookahead_char_count += 1
+ lookahead_batch += lookahead_word
+ lookahead_char_count += len(lookahead_word)
+ lookahead_word_count += 1
+
+ if (
+ not lookahead_mapping
+ or lookahead_mapping[-1][1] != i
+ ):
+ lookahead_mapping.append(
+ (
+ lookahead_char_count - len(lookahead_word),
+ i,
+ text_line,
+ None,
+ word_start_positions[lookahead_idx],
+ )
+ )
+
+ # Check if this word ends with phrase punctuation
+ if ends_with_phrase_punctuation(lookahead_word):
+ break
+
+ lookahead_idx += 1
+
+ # Use the lookahead batch (either found phrase end or reached end of line)
+ current_batch = lookahead_batch
+ batch_char_count = lookahead_char_count
+ batch_word_count = lookahead_word_count
+ current_batch_mapping = lookahead_mapping
+
+ # Process current batch
+ all_text_line_results = do_aws_comprehend_call(
+ current_batch,
+ current_batch_mapping,
+ comprehend_client,
+ aws_language,
+ text_analyzer_kwargs.get("allow_list", []),
+ aws_comprehend_entities,
+ all_text_line_results,
+ )
+ if aws_comprehend_entities:
+ comprehend_query_number += (
+ len(current_batch.strip())
+ + COMPREHEND_CHARACTERS_PER_UNIT
+ - 1
+ ) // COMPREHEND_CHARACTERS_PER_UNIT
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx = lookahead_idx + 1
+ else:
+ # Normal case: add word to batch
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if (
+ not current_batch_mapping
+ or current_batch_mapping[-1][1] != i
+ ):
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ None,
+ word_start_positions[word_idx],
+ )
+ )
+ word_idx += 1
+
+ # Process final batch if any
+ if current_batch:
+ all_text_line_results = do_aws_comprehend_call(
+ current_batch,
+ current_batch_mapping,
+ comprehend_client,
+ aws_language,
+ text_analyzer_kwargs.get("allow_list", []),
+ aws_comprehend_entities,
+ all_text_line_results,
+ )
+ if aws_comprehend_entities:
+ comprehend_query_number += (
+ len(current_batch.strip()) + COMPREHEND_CHARACTERS_PER_UNIT - 1
+ ) // COMPREHEND_CHARACTERS_PER_UNIT
+
+ elif pii_identification_method == AWS_LLM_PII_OPTION:
+ # LLM-based entity detection using AWS Bedrock
+ try:
+ from tools.llm_entity_detection import do_llm_entity_detection_call
+ except ImportError as e:
+ print(f"Error importing LLM entity detection: {e}")
+ raise ImportError(
+ "LLM entity detection not available. Please ensure llm_funcs.py is accessible."
+ )
+
+ if not bedrock_runtime:
+ raise ValueError(
+ "bedrock_runtime is required when using LLM-based PII detection"
+ )
+
+ # Set inference method to aws-bedrock if not already set
+ if text_analyzer_kwargs.get("inference_method") is None:
+ text_analyzer_kwargs["inference_method"] = "aws-bedrock"
+
+ # Update model_choice to use CLOUD_LLM_PII_MODEL_CHOICE for Bedrock, or value from text_analyzer_kwargs if set
+ if text_analyzer_kwargs.get("model_choice") is None:
+ model_choice = CLOUD_LLM_PII_MODEL_CHOICE
+ else:
+ model_choice = text_analyzer_kwargs.get(
+ "model_choice", CLOUD_LLM_PII_MODEL_CHOICE
+ )
+
+ # Set LLM model name for tracking (use custom-instructions model when applicable)
+ custom_instructions_model = (
+ CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE.strip()
+ if isinstance(CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE, str)
+ and CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE.strip()
+ else ""
+ )
+ if (
+ (custom_llm_instructions or "").strip()
+ and model_choice == CLOUD_LLM_PII_MODEL_CHOICE
+ and custom_instructions_model
+ ):
+ llm_model_name = custom_instructions_model
+ else:
+ llm_model_name = model_choice or ""
+
+ # Handle custom entities first (same as AWS Comprehend)
+ # Include CUSTOM/CUSTOM_FUZZY (deny list) so deny-list words are redacted when CUSTOM is selected
+ local_custom_entities = [
+ entity
+ for entity in (chosen_llm_entities or [])
+ if entity in (custom_entities or [])
+ or entity in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ if local_custom_entities:
+ # Filter entities to only include those supported by the language
+ language_supported_entities = filter_entities_for_language(
+ local_custom_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ page_analyser_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+ all_text_line_results = map_back_entity_results(
+ page_analyser_result,
+ page_text_mapping,
+ all_text_line_results,
+ allow_list=allow_list,
+ )
+
+ # Process text in batches for LLM (same batching logic as AWS Comprehend)
+ current_batch = ""
+ current_batch_mapping = list()
+ batch_char_count = 0
+ batch_word_count = 0
+
+ for i, text_line in enumerate(line_level_ocr_results):
+ words = text_line.text.split()
+ word_start_positions = list()
+ current_pos = 0
+
+ for word in words:
+ word_start_positions.append(current_pos)
+ current_pos += len(word) + 1
+
+ word_idx = 0
+ while word_idx < len(words):
+ word = words[word_idx]
+ new_batch_char_count = len(current_batch) + len(word) + 1
+
+ # Check if we've hit the limit
+ limit_reached = (
+ batch_word_count >= DEFAULT_NEW_BATCH_WORD_COUNT
+ or new_batch_char_count >= DEFAULT_NEW_BATCH_CHAR_COUNT
+ )
+
+ if limit_reached:
+ # Add the current word to the batch first
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if (
+ not current_batch_mapping
+ or current_batch_mapping[-1][1] != i
+ ):
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ None,
+ word_start_positions[word_idx],
+ )
+ )
+
+ # Check if current word ends with phrase punctuation
+ if ends_with_phrase_punctuation(word):
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ and not entity.startswith("CUSTOM_VLM_")
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=getattr(self, "output_folder", None),
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get(
+ # "assistant_model"
+ # ),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx += 1
+ else:
+ # Look ahead in current line for phrase-ending punctuation or end of line
+ lookahead_idx = word_idx + 1
+ lookahead_batch = current_batch
+ lookahead_char_count = batch_char_count
+ lookahead_word_count = batch_word_count
+ lookahead_mapping = list(current_batch_mapping)
+
+ # Continue adding words until we find phrase-ending punctuation or end of line
+ while lookahead_idx < len(words):
+ lookahead_word = words[lookahead_idx]
+
+ # Add the word to lookahead batch
+ if lookahead_batch:
+ lookahead_batch += " "
+ lookahead_char_count += 1
+ lookahead_batch += lookahead_word
+ lookahead_char_count += len(lookahead_word)
+ lookahead_word_count += 1
+
+ if (
+ not lookahead_mapping
+ or lookahead_mapping[-1][1] != i
+ ):
+ lookahead_mapping.append(
+ (
+ lookahead_char_count - len(lookahead_word),
+ i,
+ text_line,
+ None,
+ word_start_positions[lookahead_idx],
+ )
+ )
+
+ # Check if this word ends with phrase punctuation
+ if ends_with_phrase_punctuation(lookahead_word):
+ break
+
+ lookahead_idx += 1
+
+ # Use the lookahead batch (either found phrase end or reached end of line)
+ current_batch = lookahead_batch
+ batch_char_count = lookahead_char_count
+ batch_word_count = lookahead_word_count
+ current_batch_mapping = lookahead_mapping
+
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ and not entity.startswith("CUSTOM_VLM_")
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=getattr(self, "output_folder", None),
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get(
+ # "assistant_model"
+ # ),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx = lookahead_idx + 1
+ else:
+ # Normal case: add word to batch
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if (
+ not current_batch_mapping
+ or current_batch_mapping[-1][1] != i
+ ):
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ None,
+ word_start_positions[word_idx],
+ )
+ )
+ word_idx += 1
+
+ # Process final batch if any
+ if current_batch:
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM" and not entity.startswith("CUSTOM_VLM_")
+ ]
+ all_text_line_results, batch_input_tokens, batch_output_tokens = (
+ do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=getattr(self, "output_folder", None),
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get("inference_method"),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+ elif pii_identification_method == INFERENCE_SERVER_PII_OPTION:
+ # LLM-based entity detection using inference server
+ try:
+ from tools.llm_entity_detection import do_llm_entity_detection_call
+ except ImportError as e:
+ print(f"Error importing LLM entity detection: {e}")
+ raise ImportError(
+ "LLM entity detection not available. Please ensure llm_funcs.py is accessible."
+ )
+
+ # Set inference method to inference-server if not already set
+ if text_analyzer_kwargs.get("inference_method") is None:
+ text_analyzer_kwargs["inference_method"] = "inference-server"
+
+ # Set API URL if not already set
+ if text_analyzer_kwargs.get("api_url") is None:
+ text_analyzer_kwargs["api_url"] = INFERENCE_SERVER_API_URL
+
+ # Set model choice if not already set - use INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+ if text_analyzer_kwargs.get("model_choice") is None:
+ model_choice = INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+ text_analyzer_kwargs["model_choice"] = model_choice
+ else:
+ model_choice = text_analyzer_kwargs.get("model_choice")
+
+ # Set LLM model name for tracking
+ llm_model_name = model_choice or ""
+
+ # Update model_choice to use the value from text_analyzer_kwargs
+ model_choice = text_analyzer_kwargs.get(
+ "model_choice", INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+ )
+
+ # Handle custom entities first (same as AWS Comprehend)
+ # Include CUSTOM/CUSTOM_FUZZY (deny list) so deny-list words are redacted when CUSTOM is selected
+ local_custom_entities = [
+ entity
+ for entity in (chosen_llm_entities or [])
+ if entity in (custom_entities or [])
+ or entity in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ if local_custom_entities:
+ # Filter entities to only include those supported by the language
+ language_supported_entities = filter_entities_for_language(
+ local_custom_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ page_analyser_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+ all_text_line_results = map_back_entity_results(
+ page_analyser_result,
+ page_text_mapping,
+ all_text_line_results,
+ allow_list=allow_list,
+ )
+
+ # Process text in batches for LLM (same batching logic as AWS Comprehend)
+ current_batch = ""
+ current_batch_mapping = list()
+ batch_char_count = 0
+ batch_word_count = 0
+
+ for i, text_line in enumerate(line_level_ocr_results):
+ words = text_line.text.split()
+ word_start_positions = list()
+ current_pos = 0
+
+ for word in words:
+ word_start_positions.append(current_pos)
+ current_pos += len(word) + 1
+
+ word_idx = 0
+ while word_idx < len(words):
+ word = words[word_idx]
+ new_batch_char_count = len(current_batch) + len(word) + 1
+
+ # Check if we've hit the limit
+ limit_reached = (
+ batch_word_count >= DEFAULT_NEW_BATCH_WORD_COUNT
+ or new_batch_char_count >= DEFAULT_NEW_BATCH_CHAR_COUNT
+ )
+
+ if limit_reached:
+ # Add the current word to the batch first
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if (
+ not current_batch_mapping
+ or current_batch_mapping[-1][1] != i
+ ):
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ None,
+ word_start_positions[word_idx],
+ )
+ )
+
+ # Check if current word ends with phrase punctuation
+ if ends_with_phrase_punctuation(word):
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ and not entity.startswith("CUSTOM_VLM_")
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=getattr(self, "output_folder", None),
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get(
+ # "assistant_model"
+ # ),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx += 1
+ else:
+ # Look ahead in current line for phrase-ending punctuation or end of line
+ lookahead_idx = word_idx + 1
+ lookahead_batch = current_batch
+ lookahead_char_count = batch_char_count
+ lookahead_word_count = batch_word_count
+ lookahead_mapping = list(current_batch_mapping)
+
+ # Continue adding words until we find phrase-ending punctuation or end of line
+ while lookahead_idx < len(words):
+ lookahead_word = words[lookahead_idx]
+
+ # Add the word to lookahead batch
+ if lookahead_batch:
+ lookahead_batch += " "
+ lookahead_char_count += 1
+ lookahead_batch += lookahead_word
+ lookahead_char_count += len(lookahead_word)
+ lookahead_word_count += 1
+
+ if (
+ not lookahead_mapping
+ or lookahead_mapping[-1][1] != i
+ ):
+ lookahead_mapping.append(
+ (
+ lookahead_char_count - len(lookahead_word),
+ i,
+ text_line,
+ None,
+ word_start_positions[lookahead_idx],
+ )
+ )
+
+ # Check if this word ends with phrase punctuation
+ if ends_with_phrase_punctuation(lookahead_word):
+ break
+
+ lookahead_idx += 1
+
+ # Use the lookahead batch (either found phrase end or reached end of line)
+ current_batch = lookahead_batch
+ batch_char_count = lookahead_char_count
+ batch_word_count = lookahead_word_count
+ current_batch_mapping = lookahead_mapping
+
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ and not entity.startswith("CUSTOM_VLM_")
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=getattr(self, "output_folder", None),
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get(
+ # "assistant_model"
+ # ),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx = lookahead_idx + 1
+ else:
+ # Normal case: add word to batch
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if (
+ not current_batch_mapping
+ or current_batch_mapping[-1][1] != i
+ ):
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ None,
+ word_start_positions[word_idx],
+ )
+ )
+ word_idx += 1
+
+ # Process final batch if any
+ if current_batch:
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM" and not entity.startswith("CUSTOM_VLM_")
+ ]
+ all_text_line_results, batch_input_tokens, batch_output_tokens = (
+ do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=getattr(self, "output_folder", None),
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get("inference_method"),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ elif pii_identification_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION:
+ # LLM-based entity detection using local transformers models
+ try:
+ from tools.llm_entity_detection import do_llm_entity_detection_call
+ except ImportError as e:
+ print(f"Error importing LLM entity detection: {e}")
+ raise ImportError(
+ "LLM entity detection not available. Please ensure llm_funcs.py is accessible."
+ )
+
+ # Set inference method to local if not already set
+ if text_analyzer_kwargs.get("inference_method") is None:
+ text_analyzer_kwargs["inference_method"] = "local"
+
+ # Set model choice if not already set - use VLM model when USE_TRANSFORMERS_VLM_MODEL_AS_LLM else LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ if text_analyzer_kwargs.get("model_choice") is None:
+ text_analyzer_kwargs["model_choice"] = (
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ if USE_TRANSFORMERS_VLM_MODEL_AS_LLM
+ else LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ )
+
+ # Update model_choice to use the value from text_analyzer_kwargs
+ model_choice = text_analyzer_kwargs.get(
+ "model_choice",
+ (
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ if USE_TRANSFORMERS_VLM_MODEL_AS_LLM
+ else LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ ),
+ )
+
+ # Handle custom entities first (same as AWS Comprehend)
+ # Include CUSTOM/CUSTOM_FUZZY (deny list) so deny-list words are redacted when CUSTOM is selected
+ local_custom_entities = [
+ entity
+ for entity in (chosen_llm_entities or [])
+ if entity in (custom_entities or [])
+ or entity in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ if local_custom_entities:
+ # Filter entities to only include those supported by the language
+ language_supported_entities = filter_entities_for_language(
+ local_custom_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ page_analyser_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+ all_text_line_results = map_back_entity_results(
+ page_analyser_result,
+ page_text_mapping,
+ all_text_line_results,
+ allow_list=allow_list,
+ )
+
+ # Process text in batches for LLM (same batching logic as AWS Comprehend)
+ current_batch = ""
+ current_batch_mapping = list()
+ batch_char_count = 0
+ batch_word_count = 0
+
+ for i, text_line in enumerate(line_level_ocr_results):
+ words = text_line.text.split()
+ word_start_positions = list()
+ current_pos = 0
+
+ for word in words:
+ word_start_positions.append(current_pos)
+ current_pos += len(word) + 1
+
+ word_idx = 0
+ while word_idx < len(words):
+ word = words[word_idx]
+ new_batch_char_count = len(current_batch) + len(word) + 1
+
+ # Check if we've hit the limit
+ limit_reached = (
+ batch_word_count >= DEFAULT_NEW_BATCH_WORD_COUNT
+ or new_batch_char_count >= DEFAULT_NEW_BATCH_CHAR_COUNT
+ )
+
+ if limit_reached:
+ # Add the current word to the batch first
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if (
+ not current_batch_mapping
+ or current_batch_mapping[-1][1] != i
+ ):
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ None,
+ word_start_positions[word_idx],
+ )
+ )
+
+ # Check if current word ends with phrase punctuation
+ if ends_with_phrase_punctuation(word):
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ and not entity.startswith("CUSTOM_VLM_")
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=getattr(self, "output_folder", None),
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get(
+ # "assistant_model"
+ # ),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx += 1
+ else:
+ # Look ahead in current line for phrase-ending punctuation or end of line
+ lookahead_idx = word_idx + 1
+ lookahead_batch = current_batch
+ lookahead_char_count = batch_char_count
+ lookahead_word_count = batch_word_count
+ lookahead_mapping = list(current_batch_mapping)
+
+ # Continue adding words until we find phrase-ending punctuation or end of line
+ while lookahead_idx < len(words):
+ lookahead_word = words[lookahead_idx]
+
+ # Add the word to lookahead batch
+ if lookahead_batch:
+ lookahead_batch += " "
+ lookahead_char_count += 1
+ lookahead_batch += lookahead_word
+ lookahead_char_count += len(lookahead_word)
+ lookahead_word_count += 1
+
+ if (
+ not lookahead_mapping
+ or lookahead_mapping[-1][1] != i
+ ):
+ lookahead_mapping.append(
+ (
+ lookahead_char_count - len(lookahead_word),
+ i,
+ text_line,
+ None,
+ word_start_positions[lookahead_idx],
+ )
+ )
+
+ # Check if this word ends with phrase punctuation
+ if ends_with_phrase_punctuation(lookahead_word):
+ break
+
+ lookahead_idx += 1
+
+ # Use the lookahead batch (either found phrase end or reached end of line)
+ current_batch = lookahead_batch
+ batch_char_count = lookahead_char_count
+ batch_word_count = lookahead_word_count
+ current_batch_mapping = lookahead_mapping
+
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ and not entity.startswith("CUSTOM_VLM_")
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=getattr(self, "output_folder", None),
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get(
+ # "assistant_model"
+ # ),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx = lookahead_idx + 1
+ else:
+ # Normal case: add word to batch
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if (
+ not current_batch_mapping
+ or current_batch_mapping[-1][1] != i
+ ):
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ None,
+ word_start_positions[word_idx],
+ )
+ )
+ word_idx += 1
+
+ # Process final batch if any
+ if current_batch:
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM" and not entity.startswith("CUSTOM_VLM_")
+ ]
+ all_text_line_results, batch_input_tokens, batch_output_tokens = (
+ do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=getattr(self, "output_folder", None),
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get("inference_method"),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Process results and create bounding boxes
+ combined_results = list()
+ for i, text_line in enumerate(line_level_ocr_results):
+ line_results = next(
+ (results for idx, results in all_text_line_results if idx == i), []
+ )
+ if line_results and i < len(ocr_results_with_words):
+ child_level_key = list(ocr_results_with_words.keys())[i]
+ ocr_results_with_words_line_level = ocr_results_with_words[
+ child_level_key
+ ]
+
+ for result in line_results:
+ bbox_results = self.map_analyzer_results_to_bounding_boxes(
+ [result],
+ [
+ OCRResult(
+ text=text_line.text[result.start : result.end],
+ left=text_line.left,
+ top=text_line.top,
+ width=text_line.width,
+ height=text_line.height,
+ conf=text_line.conf,
+ )
+ ],
+ text_line.text,
+ text_analyzer_kwargs.get("allow_list", []),
+ ocr_results_with_words_line_level,
+ )
+ combined_results.extend(bbox_results)
+
+ return (
+ combined_results,
+ comprehend_query_number,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ )
+
+ @staticmethod
+ def _map_one_ocr_result_to_bboxes(
+ redaction_relevant_ocr_result: OCRResult,
+ text_analyzer_results: List[RecognizerResult],
+ ocr_results_with_words_child_info: Dict[str, Dict],
+ allow_list: List[str],
+ ) -> List[CustomImageRecognizerResult]:
+ """Map one OCR result to bounding boxes; safe to run in a thread."""
+ bboxes = []
+ line_text = ocr_results_with_words_child_info["text"]
+ line_length = len(line_text)
+ redaction_text = redaction_relevant_ocr_result.text
+
+ for redaction_result in text_analyzer_results:
+ if allow_list:
+ allow_list_normalized = [
+ item.strip().lower() for item in allow_list if item
+ ]
+ redaction_text_normalized = redaction_text.strip().lower()
+ is_in_allow_list = redaction_text_normalized in allow_list_normalized
+ else:
+ is_in_allow_list = False
+
+ if not is_in_allow_list:
+ start_in_line = max(0, redaction_result.start)
+ end_in_line = min(line_length, redaction_result.end)
+ matched_text = line_text[start_in_line:end_in_line]
+
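+ # Collect the bounding boxes of all words whose character span overlaps the
+ # detected entity span, then union them into a single redaction box below.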
+ matching_word_boxes = []
+ current_position = 0
+ for word_info in ocr_results_with_words_child_info.get("words", []):
+ word_text = word_info["text"]
+ word_length = len(word_text)
+ word_start = current_position
+ word_end = current_position + word_length
+ current_position += word_length + 1
+ if word_start < end_in_line and word_end > start_in_line:
+ matching_word_boxes.append(word_info["bounding_box"])
+
+ if matching_word_boxes:
+ left = min(box[0] for box in matching_word_boxes)
+ top = min(box[1] for box in matching_word_boxes)
+ right = max(box[2] for box in matching_word_boxes)
+ bottom = max(box[3] for box in matching_word_boxes)
+ bboxes.append(
+ CustomImageRecognizerResult(
+ entity_type=redaction_result.entity_type,
+ start=start_in_line,
+ end=end_in_line,
+ score=round(redaction_result.score, 2),
+ left=left,
+ top=top,
+ width=right - left,
+ height=bottom - top,
+ text=matched_text,
+ )
+ )
+ else:
+ line_left = redaction_relevant_ocr_result.left
+ line_top = redaction_relevant_ocr_result.top
+ line_width = redaction_relevant_ocr_result.width
+ line_height = redaction_relevant_ocr_result.height
+ if line_length > 0:
+ text_proportion = len(matched_text) / line_length
+ char_width_estimate = line_width / line_length
+ estimated_left_offset = start_in_line * char_width_estimate
+ left = line_left + estimated_left_offset
+ top = line_top
+ width = text_proportion * line_width
+ height = line_height
+ else:
+ left = line_left
+ top = line_top
+ width = line_width
+ height = line_height
+ bboxes.append(
+ CustomImageRecognizerResult(
+ entity_type=redaction_result.entity_type,
+ start=start_in_line,
+ end=end_in_line,
+ score=round(redaction_result.score, 2),
+ left=left,
+ top=top,
+ width=width,
+ height=height,
+ text=matched_text,
+ )
+ )
+ return bboxes
+
+ @staticmethod
+ def map_analyzer_results_to_bounding_boxes(
+ text_analyzer_results: List[RecognizerResult],
+ redaction_relevant_ocr_results: List[OCRResult],
+ full_text: str,
+ allow_list: List[str],
+ ocr_results_with_words_child_info: Dict[str, Dict],
+ ) -> List[CustomImageRecognizerResult]:
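+ """Map analyzer results to redaction bounding boxes for each relevant OCR
+ line, processing lines in parallel with a thread pool (up to MAX_WORKERS)."""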
+ if not redaction_relevant_ocr_results:
+ return []
+
+ n = len(redaction_relevant_ocr_results)
+ max_workers = min(MAX_WORKERS, n)
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ results = list(
+ executor.map(
+ lambda ocr_result: CustomImageAnalyzerEngine._map_one_ocr_result_to_bboxes(
+ ocr_result,
+ text_analyzer_results,
+ ocr_results_with_words_child_info,
+ allow_list,
+ ),
+ redaction_relevant_ocr_results,
+ )
+ )
+ redaction_bboxes = [bbox for bbox_list in results for bbox in bbox_list]
+ return redaction_bboxes
+
+ @staticmethod
+ def remove_space_boxes(ocr_result: dict) -> dict:
+ """Remove OCR bboxes that are for spaces.
+ :param ocr_result: OCR results (raw or thresholded).
+ :return: OCR results with empty words removed.
+ """
+ # Get indices of items that contain non-whitespace text
+ idx = list()
+ for i, text in enumerate(ocr_result["text"]):
+ if text != "" and not text.isspace():
+ idx.append(i)
+
+ # Only retain items with text
+ filtered_ocr_result = {}
+ for key in list(ocr_result.keys()):
+ filtered_ocr_result[key] = [ocr_result[key][i] for i in idx]
+
+ return filtered_ocr_result
+
+ @staticmethod
+ def _scale_bbox_results(
+ ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
+ ) -> Dict[str, List[Union[int, str]]]:
+ """Scale down the bounding box results by a scale factor.
+ :param ocr_result: OCR results (raw).
+ :param scale_factor: Scale factor to divide coordinates and dimensions by.
+ :return: OCR results (scaled).
+ """
+ scaled_results = deepcopy(ocr_result)
+ coordinate_keys = ["left", "top"]
+ dimension_keys = ["width", "height"]
+
+ for coord_key in coordinate_keys:
+ scaled_results[coord_key] = [
+ int(np.ceil((x) / (scale_factor))) for x in scaled_results[coord_key]
+ ]
+
+ for dim_key in dimension_keys:
+ scaled_results[dim_key] = [
+ max(1, int(np.ceil(x / (scale_factor))))
+ for x in scaled_results[dim_key]
+ ]
+ return scaled_results
+
+ @staticmethod
+ def estimate_x_offset(full_text: str, start: int) -> int:
+ # Estimate the x-offset from the character position.
+ # Note: the proportional terms below cancel out, so this currently returns
+ # the character index itself; refine for variable-width fonts if needed.
+ return int(start / len(full_text) * len(full_text))
+
+ def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
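+ """Estimate the pixel width of the text span [start:end], assuming each
+ character occupies an equal share of the line's total width."""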
+ # Extract the relevant text portion
+ relevant_text = ocr_result.text[start:end]
+
+ # If the relevant text is the same as the full text, return the full width
+ if relevant_text == ocr_result.text:
+ return ocr_result.width
+
+ # Estimate width based on the proportion of the relevant text length to the total text length
+ total_text_length = len(ocr_result.text)
+ relevant_text_length = len(relevant_text)
+
+ if total_text_length == 0:
+ return 0 # Avoid division by zero
+
+ # Proportion of the relevant text to the total text
+ proportion = relevant_text_length / total_text_length
+
+ # Estimate the width based on the proportion
+ estimated_width = int(proportion * ocr_result.width)
+
+ return estimated_width
+
+
+ def bounding_boxes_overlap(box1: List, box2: List) -> bool:
+ """Check if two bounding boxes overlap."""
+ return (
+ box1[0] < box2[2]
+ and box2[0] < box1[2]
+ and box1[1] < box2[3]
+ and box2[1] < box1[3]
+ )
+
+
+ def map_back_entity_results(
+ page_analyser_result: List[RecognizerResult],
+ page_text_mapping: List[Tuple],
+ all_text_line_results: List[Tuple],
+ allow_list: Optional[List[str]] = None,
+ ):
+ """
+ Map Presidio analyzer results back to line-level results.
+
+ Args:
+ page_analyser_result: Results from Presidio analyzer
+ page_text_mapping: Mapping of batch positions to line indices
+ all_text_line_results: Existing line-level results to append to
+ allow_list: List of allowed text values (to skip) - case-insensitive matching
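+
+ Offset example (illustrative numbers): if a line starts at batch position 100 and an entity
+ spans batch positions [112, 120), the entity is attached to that line with relative offsets
+ [12, 20) (assuming the line is at least 20 characters long).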
+ """
+ # Normalize allow_list for case-insensitive matching
+ if allow_list:
+ allow_list_normalized = [item.strip().lower() for item in allow_list if item]
+ else:
+ allow_list_normalized = []
+
+ for entity in page_analyser_result:
+ entity_start = entity.start
+ entity_end = entity.end
+
+ # Track if the entity has been added to any line
+ added_to_line = False
+
+ for batch_start, line_idx, original_line, chars in page_text_mapping:
+ batch_end = batch_start + len(original_line.text)
+
+ # Check if the entity overlaps with the current line
+ if (
+ batch_start < entity_end and batch_end > entity_start
+ ): # Overlap condition
+ relative_start = max(
+ 0, entity_start - batch_start
+ ) # Adjust start relative to the line
+ relative_end = min(
+ entity_end - batch_start, len(original_line.text)
+ ) # Adjust end relative to the line
+
+ # Get the text for this entity to check against allow_list
+ result_text = original_line.text[relative_start:relative_end]
+
+ # Check if result_text is in allow_list (case-insensitive)
+ # If allow_list contains this text, skip adding it as a PII entity
+ # This allows allow_list terms to "overrule" PII detection
+ result_text_normalized = result_text.strip().lower()
+ if result_text_normalized not in allow_list_normalized:
+ # Create a new adjusted entity
+ adjusted_entity = copy.deepcopy(entity)
+ adjusted_entity.start = relative_start
+ adjusted_entity.end = relative_end
+
+ # Check if this line already has an entry
+ existing_entry = next(
+ (
+ entry
+ for idx, entry in all_text_line_results
+ if idx == line_idx
+ ),
+ None,
+ )
+
+ if existing_entry is None:
+ all_text_line_results.append((line_idx, [adjusted_entity]))
+ else:
+ existing_entry.append(
+ adjusted_entity
+ ) # Append to the existing list of entities
+
+ added_to_line = True
+
+ # Log any entity that could not be mapped to a line (its offsets fall outside the mapped text)
+ if not added_to_line:
+ print(f"Entity '{entity}' does not fit in any line.")
+
+ return all_text_line_results
+
+
+def map_back_comprehend_entity_results(
+ response: object,
+ current_batch_mapping: List[Tuple],
+ allow_list: List[str],
+ chosen_redact_comprehend_entities: List[str],
+ all_text_line_results: List[Tuple],
+):
+ """
+ Map AWS Comprehend entity results back to line-level results.
+
+ Args:
+ response: AWS Comprehend response object
+ current_batch_mapping: Mapping of batch positions to line indices
+ allow_list: List of allowed text values (to skip) - case-insensitive matching
+ chosen_redact_comprehend_entities: List of entity types to include
+ all_text_line_results: Existing line-level results to append to
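+
+ Offset example (illustrative numbers): for a mapping entry with batch_start=0 and
+ line_offset=5, a Comprehend entity at [3, 9) maps to [8, 14) within the full line text
+ (assuming the line is long enough).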
+ """
+ if not response or "Entities" not in response:
+ return all_text_line_results
+
+ # Normalize allow_list for case-insensitive matching
+ if allow_list:
+ allow_list_normalized = [item.strip().lower() for item in allow_list if item]
+ else:
+ allow_list_normalized = []
+
+ for entity in response["Entities"]:
+ if entity.get("Type") not in chosen_redact_comprehend_entities:
+ continue
+
+ entity_start = entity["BeginOffset"]
+ entity_end = entity["EndOffset"]
+
+ # Track if the entity has been added to any line
+ added_to_line = False
+
+ # Find the correct line and offset within that line
+ for (
+ batch_start,
+ line_idx,
+ original_line,
+ chars,
+ line_offset,
+ ) in current_batch_mapping:
+ batch_end = batch_start + len(original_line.text[line_offset:])
+
+ # Check if the entity overlaps with the current line
+ if (
+ batch_start < entity_end and batch_end > entity_start
+ ): # Overlap condition
+ # Calculate the absolute position within the line
+ relative_start = max(0, entity_start - batch_start + line_offset)
+ relative_end = min(
+ entity_end - batch_start + line_offset, len(original_line.text)
+ )
+
+ result_text = original_line.text[relative_start:relative_end]
+
+ # Check if result_text is in allow_list (case-insensitive)
+ # If allow_list contains this text, skip adding it as a PII entity
+ # This allows allow_list terms to "overrule" AWS Comprehend PII detection
+ result_text_normalized = result_text.strip().lower()
+ if result_text_normalized not in allow_list_normalized:
+ adjusted_entity = entity.copy()
+ adjusted_entity["BeginOffset"] = (
+ relative_start # Now relative to the full line
+ )
+ adjusted_entity["EndOffset"] = relative_end
+
+ recogniser_entity = recognizer_result_from_dict(adjusted_entity)
+
+ existing_entry = next(
+ (
+ entry
+ for idx, entry in all_text_line_results
+ if idx == line_idx
+ ),
+ None,
+ )
+ if existing_entry is None:
+ all_text_line_results.append((line_idx, [recogniser_entity]))
+ else:
+ existing_entry.append(
+ recogniser_entity
+ ) # Append to the existing list of entities
+
+ added_to_line = True
+
+ # Optional: Handle cases where the entity does not fit in any line
+ if not added_to_line:
+ print(f"Entity '{entity}' does not fit in any line.")
+
+ return all_text_line_results
+
+
+def do_aws_comprehend_call(
+ current_batch: str,
+ current_batch_mapping: List[Tuple],
+ comprehend_client: botocore.client.BaseClient,
+ language: str,
+ allow_list: List[str],
+ chosen_redact_comprehend_entities: List[str],
+ all_text_line_results: List[Tuple],
+ max_retries: int = 10,
+ retry_delay: int = 1,
+):
+ """
+ Uses AWS Comprehend to detect PII entities in a text batch and maps the results
+ back to the original lines for further processing.
+
+ Args:
+ current_batch (str): The concatenated text being analysed for PII.
+ current_batch_mapping (List[Tuple]): Mapping from batch offsets back to
+ individual line offsets and line indices for result mapping.
+ comprehend_client (botocore.client.BaseClient): AWS Comprehend boto3 client for making API calls.
+ language (str): The ISO language code for the text (e.g. "en").
+ allow_list (List[str]): List of phrases or words (matched case-insensitively) which, if detected,
+ should not be flagged/redacted even if AWS returns them as PII.
+ chosen_redact_comprehend_entities (List[str]): List of PII entity types (from AWS) enabled for detection/redaction.
+ all_text_line_results (List[Tuple]): Existing recognition results by line; this will be updated in-place and returned.
+ max_retries (int, optional): Maximum number of times to retry the AWS API in case of failure. Default is 10.
+ retry_delay (int, optional): Number of seconds to wait between retries. Default is 1.
+
+ Returns:
+ List[Tuple]: Updated list of recognition results by text line, with AWS detected PII mapped back to their source.
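+
+ Example (illustrative sketch; assumes a configured boto3 Comprehend client and a
+ batch/mapping prepared by the caller):
+ results = do_aws_comprehend_call(batch_text, batch_mapping, comprehend_client, "en",
+ allow_list=[], chosen_redact_comprehend_entities=["NAME", "EMAIL"],
+ all_text_line_results=[])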
+ """
+ if not current_batch:
+ return all_text_line_results
+ # Guard: if no relevant AWS entity types are selected, skip AWS entirely.
+ # (CUSTOM/CUSTOM_FUZZY and other local-only entities are handled via Presidio.)
+ if not chosen_redact_comprehend_entities:
+ return all_text_line_results
+
+ for attempt in range(max_retries):
+ try:
+ response = comprehend_client.detect_pii_entities(
+ Text=current_batch.strip(), LanguageCode=language
+ )
+
+ all_text_line_results = map_back_comprehend_entity_results(
+ response,
+ current_batch_mapping,
+ allow_list,
+ chosen_redact_comprehend_entities,
+ all_text_line_results,
+ )
+
+ return all_text_line_results
+
+ except Exception as e:
+ if attempt == max_retries - 1:
+ print("AWS Comprehend calls failed due to", e)
+ raise
+ time.sleep(retry_delay)
+
+
+def run_page_text_redaction(
+ language: str,
+ chosen_redact_entities: List[str],
+ chosen_redact_comprehend_entities: List[str],
+ line_level_text_results_list: List,
+ line_characters: List,
+ page_analyser_results: Optional[List] = None,
+ page_analysed_bounding_boxes: Optional[List] = None,
+ comprehend_client=None,
+ allow_list: List[str] = None,
+ pii_identification_method: str = LOCAL_PII_OPTION,
+ nlp_analyser: AnalyzerEngine = None,
+ score_threshold: float = 0.0,
+ custom_entities: List[str] = None,
+ comprehend_query_number: int = 0,
+ bedrock_runtime=None,
+ model_choice: str = CLOUD_LLM_PII_MODEL_CHOICE,
+ custom_llm_instructions: str = "",
+ chosen_llm_entities: List[str] = None,
+ output_folder: str = None,
+ file_name: Optional[str] = None,
+ page_number: Optional[int] = None,
+ **text_analyzer_kwargs,
+):
+ """
+ This function performs text redaction on a page based on the specified language and chosen entities.
+
+ Args:
+ language (str): The language code for the text being processed.
+ chosen_redact_entities (List[str]): A list of entities to be redacted from the text.
+ chosen_redact_comprehend_entities (List[str]): A list of entities identified by AWS Comprehend for redaction.
+ line_level_text_results_list (List): A list of line-level OCR result objects (each exposing a .text attribute) extracted from the page.
+ line_characters (List): A list of character-level information for each line of text.
+ page_analyser_results (List, optional): Results from previous page analysis. Defaults to an empty list.
+ page_analysed_bounding_boxes (List, optional): Bounding boxes for the analysed page. Defaults to an empty list.
+ comprehend_client: The AWS Comprehend client for making API calls. Defaults to None.
+ allow_list (List[str], optional): A list of allowed entities that should not be redacted. Defaults to None.
+ pii_identification_method (str, optional): The method used for PII identification. Defaults to LOCAL_PII_OPTION.
+ nlp_analyser (AnalyzerEngine, optional): The NLP analyzer engine used for local analysis. Defaults to None.
+ score_threshold (float, optional): The threshold score for entity detection. Defaults to 0.0.
+ custom_entities (List[str], optional): A list of custom entities for redaction. Defaults to None.
+ comprehend_query_number (int, optional): A counter for AWS Comprehend usage in units of 100 characters (1 unit = 100 characters, per AWS billing). Defaults to 0.
+ bedrock_runtime: The AWS Bedrock runtime client for LLM-based entity detection. Defaults to None.
+ model_choice (str, optional): The LLM model choice for entity detection. Defaults to CLOUD_LLM_PII_MODEL_CHOICE.
+ custom_llm_instructions (str, optional): Custom instructions for LLM-based entity detection. Defaults to "".
+ chosen_llm_entities (List[str], optional): A list of entities for LLM-based detection. Defaults to None.
+ output_folder (str, optional): Output folder for saving LLM prompts and responses. Defaults to None.
+ file_name (str, optional): File name (without extension) for saving LLM logs. Defaults to None.
+ page_number (int, optional): Page number for saving LLM logs. Defaults to None.
+ **text_analyzer_kwargs: Additional keyword arguments for text analysis.
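+
+ Returns:
+ Tuple: (page_analysed_bounding_boxes, llm_model_name, llm_total_input_tokens, llm_total_output_tokens).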
+ """
+
+ page_text = ""
+ page_text_mapping = list()
+ all_text_line_results = list()
+ comprehend_query_number = 0
+
+ # Track LLM token usage
+ llm_total_input_tokens = 0
+ llm_total_output_tokens = 0
+ llm_model_name = ""
+
+ # Default chosen_llm_entities to chosen_redact_comprehend_entities if not provided
+ if chosen_llm_entities is None:
+ chosen_llm_entities = chosen_redact_comprehend_entities
+
+ # Collect all text from the page
+ for i, text_line in enumerate(line_level_text_results_list):
+ if page_text:
+ page_text += " "
+
+ start_pos = len(page_text)
+ page_text += text_line.text
+ page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
+
+ # Determine language for downstream services
+ aws_language = language or "en"
+
+ valid_language_entities = nlp_analyser.registry.get_supported_entities(
+ languages=[language]
+ )
+ if "CUSTOM" not in valid_language_entities:
+ valid_language_entities.append("CUSTOM")
+ if "CUSTOM_FUZZY" not in valid_language_entities:
+ valid_language_entities.append("CUSTOM_FUZZY")
+
+ # Process based on identification method
+ if pii_identification_method == LOCAL_PII_OPTION:
+ if not nlp_analyser:
+ raise ValueError("nlp_analyser is required for Local identification method")
+
+ language_supported_entities = filter_entities_for_language(
+ chosen_redact_entities, valid_language_entities, language
+ )
+
+ # When only CUSTOM_VLM_* entities are chosen, local PII has nothing to do;
+ # allow progress so image/VLM analysis can run.
+ only_custom_vlm = chosen_redact_entities and all(
+ str(e).startswith("CUSTOM_VLM_") for e in (chosen_redact_entities or [])
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+ elif only_custom_vlm:
+ # Skip local PII; leave all_text_line_results empty so pipeline continues to VLM
+ pass
+ else:
+ out_message = f"No relevant entities supported for language: {language}"
+ print(out_message)
+ raise Warning(out_message)
+
+ if language_supported_entities:
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ # Also exclude allow_list since we pass it explicitly
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ page_analyser_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ score_threshold=score_threshold,
+ return_decision_process=True,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+
+ all_text_line_results = map_back_entity_results(
+ page_analyser_result,
+ page_text_mapping,
+ all_text_line_results,
+ allow_list=allow_list,
+ )
+
+ elif pii_identification_method == AWS_PII_OPTION:
+
+ # Run local detection for any custom entities (including CUSTOM/CUSTOM_FUZZY)
+ local_custom_entities = [
+ entity
+ for entity in (chosen_redact_comprehend_entities or [])
+ if entity in (custom_entities or []) or entity in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ if local_custom_entities:
+ # Filter entities to only include those supported by the language
+ language_supported_entities = filter_entities_for_language(
+ local_custom_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ # Also exclude allow_list since we pass it explicitly
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ page_analyser_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ score_threshold=score_threshold,
+ return_decision_process=True,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+
+ all_text_line_results = map_back_entity_results(
+ page_analyser_result,
+ page_text_mapping,
+ all_text_line_results,
+ allow_list=allow_list,
+ )
+
+ # Guard: only call AWS Comprehend when at least one non-custom Comprehend entity is selected.
+ aws_comprehend_entities = [
+ entity
+ for entity in (chosen_redact_comprehend_entities or [])
+ if entity in (FULL_COMPREHEND_ENTITY_LIST or [])
+ and entity not in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ # Process text in batches for AWS Comprehend
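+ # Batches grow word by word up to DEFAULT_NEW_BATCH_WORD_COUNT words or
+ # DEFAULT_NEW_BATCH_CHAR_COUNT characters, then extend to the next phrase-ending
+ # punctuation (or end of line) before each Comprehend call.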
+ current_batch = ""
+ current_batch_mapping = list()
+ batch_char_count = 0
+ batch_word_count = 0
+
+ for i, text_line in enumerate(line_level_text_results_list):
+ words = text_line.text.split()
+ word_start_positions = list()
+ current_pos = 0
+
+ for word in words:
+ word_start_positions.append(current_pos)
+ current_pos += len(word) + 1
+
+ word_idx = 0
+ while word_idx < len(words):
+ word = words[word_idx]
+ new_batch_char_count = len(current_batch) + len(word) + 1
+
+ # Check if we've hit the limit
+ limit_reached = (
+ batch_word_count >= DEFAULT_NEW_BATCH_WORD_COUNT
+ or new_batch_char_count >= DEFAULT_NEW_BATCH_CHAR_COUNT
+ )
+
+ if limit_reached:
+ # Add the current word to the batch first
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[word_idx],
+ )
+ )
+
+ # Check if current word ends with phrase punctuation
+ if ends_with_phrase_punctuation(word):
+ # Process current batch
+ all_text_line_results = do_aws_comprehend_call(
+ current_batch,
+ current_batch_mapping,
+ comprehend_client,
+ aws_language,
+ text_analyzer_kwargs.get("allow_list", allow_list or []),
+ aws_comprehend_entities,
+ all_text_line_results,
+ )
+ if aws_comprehend_entities:
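+ # Ceiling-divide the batch length by COMPREHEND_CHARACTERS_PER_UNIT to count billable units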
+ comprehend_query_number += (
+ len(current_batch.strip())
+ + COMPREHEND_CHARACTERS_PER_UNIT
+ - 1
+ ) // COMPREHEND_CHARACTERS_PER_UNIT
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx += 1
+ else:
+ # Look ahead in current line for phrase-ending punctuation or end of line
+ lookahead_idx = word_idx + 1
+ lookahead_batch = current_batch
+ lookahead_char_count = batch_char_count
+ lookahead_word_count = batch_word_count
+ lookahead_mapping = list(current_batch_mapping)
+
+ # Continue adding words until we find phrase-ending punctuation or end of line
+ while lookahead_idx < len(words):
+ lookahead_word = words[lookahead_idx]
+
+ # Add the word to lookahead batch
+ if lookahead_batch:
+ lookahead_batch += " "
+ lookahead_char_count += 1
+ lookahead_batch += lookahead_word
+ lookahead_char_count += len(lookahead_word)
+ lookahead_word_count += 1
+
+ if not lookahead_mapping or lookahead_mapping[-1][1] != i:
+ lookahead_mapping.append(
+ (
+ lookahead_char_count - len(lookahead_word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[lookahead_idx],
+ )
+ )
+
+ # Check if this word ends with phrase punctuation
+ if ends_with_phrase_punctuation(lookahead_word):
+ break
+
+ lookahead_idx += 1
+
+ # Use the lookahead batch (either found phrase end or reached end of line)
+ current_batch = lookahead_batch
+ batch_char_count = lookahead_char_count
+ batch_word_count = lookahead_word_count
+ current_batch_mapping = lookahead_mapping
+
+ # Process current batch
+ all_text_line_results = do_aws_comprehend_call(
+ current_batch,
+ current_batch_mapping,
+ comprehend_client,
+ aws_language,
+ text_analyzer_kwargs.get("allow_list", allow_list or []),
+ aws_comprehend_entities,
+ all_text_line_results,
+ )
+ if aws_comprehend_entities:
+ comprehend_query_number += (
+ len(current_batch.strip())
+ + COMPREHEND_CHARACTERS_PER_UNIT
+ - 1
+ ) // COMPREHEND_CHARACTERS_PER_UNIT
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx = lookahead_idx + 1
+ else:
+ # Normal case: add word to batch
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[word_idx],
+ )
+ )
+ word_idx += 1
+
+ # Process final batch if any
+ if current_batch:
+ all_text_line_results = do_aws_comprehend_call(
+ current_batch,
+ current_batch_mapping,
+ comprehend_client,
+ aws_language,
+ text_analyzer_kwargs.get("allow_list", allow_list or []),
+ aws_comprehend_entities,
+ all_text_line_results,
+ )
+ if aws_comprehend_entities:
+ comprehend_query_number += (
+ len(current_batch.strip()) + COMPREHEND_CHARACTERS_PER_UNIT - 1
+ ) // COMPREHEND_CHARACTERS_PER_UNIT
+
+ elif pii_identification_method == AWS_LLM_PII_OPTION:
+ # LLM-based entity detection using AWS Bedrock
+ try:
+ from tools.llm_entity_detection import do_llm_entity_detection_call
+ except ImportError as e:
+ print(f"Error importing LLM entity detection: {e}")
+ raise ImportError(
+ "LLM entity detection not available. Please ensure llm_entity_detection.py is accessible."
+ )
+
+ if not bedrock_runtime:
+ raise ValueError(
+ "bedrock_runtime is required when using LLM-based PII detection"
+ )
+
+ # Set inference method to aws-bedrock if not already set
+ if text_analyzer_kwargs.get("inference_method") is None:
+ text_analyzer_kwargs["inference_method"] = "aws-bedrock"
+
+ # Update model_choice to use CLOUD_LLM_PII_MODEL_CHOICE for Bedrock, or value from text_analyzer_kwargs if set
+ if text_analyzer_kwargs.get("model_choice") is None:
+ model_choice = CLOUD_LLM_PII_MODEL_CHOICE
+ else:
+ model_choice = text_analyzer_kwargs.get(
+ "model_choice", CLOUD_LLM_PII_MODEL_CHOICE
+ )
+
+ # Set LLM model name for tracking (use custom-instructions model when applicable)
+ custom_instructions_model = (
+ CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE.strip()
+ if isinstance(CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE, str)
+ and CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE.strip()
+ else ""
+ )
+ if (
+ (custom_llm_instructions or "").strip()
+ and model_choice == CLOUD_LLM_PII_MODEL_CHOICE
+ and custom_instructions_model
+ ):
+ llm_model_name = custom_instructions_model
+ else:
+ llm_model_name = model_choice or ""
+
+ # Handle custom entities first (same as AWS Comprehend)
+ # Include CUSTOM/CUSTOM_FUZZY (deny list) so deny-list words are redacted when CUSTOM is selected
+ local_custom_entities = [
+ entity
+ for entity in (chosen_llm_entities or [])
+ if entity in (custom_entities or []) or entity in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ if local_custom_entities:
+ # Filter entities to only include those supported by the language
+ language_supported_entities = filter_entities_for_language(
+ local_custom_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ # Also exclude allow_list since we pass it explicitly
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ page_analyser_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ score_threshold=score_threshold,
+ return_decision_process=True,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+ all_text_line_results = map_back_entity_results(
+ page_analyser_result, page_text_mapping, all_text_line_results
+ )
+
+ # Process text in batches for LLM (same batching logic as AWS Comprehend)
+ current_batch = ""
+ current_batch_mapping = list()
+ batch_char_count = 0
+ batch_word_count = 0
+
+ for i, text_line in enumerate(line_level_text_results_list):
+ words = text_line.text.split()
+ word_start_positions = list()
+ current_pos = 0
+
+ for word in words:
+ word_start_positions.append(current_pos)
+ current_pos += len(word) + 1
+
+ word_idx = 0
+ while word_idx < len(words):
+ word = words[word_idx]
+ new_batch_char_count = len(current_batch) + len(word) + 1
+
+ # Check if we've hit the limit
+ limit_reached = (
+ batch_word_count >= DEFAULT_NEW_BATCH_WORD_COUNT
+ or new_batch_char_count >= DEFAULT_NEW_BATCH_CHAR_COUNT
+ )
+
+ if limit_reached:
+ # Add the current word to the batch first
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[word_idx],
+ )
+ )
+
+ # Check if current word ends with phrase punctuation
+ if ends_with_phrase_punctuation(word):
+ # Remove 'CUSTOM' entities from the chosen_llm_entities list
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get(
+ "allow_list", allow_list or []
+ ),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=output_folder,
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx += 1
+ else:
+ # Look ahead in current line for phrase-ending punctuation or end of line
+ lookahead_idx = word_idx + 1
+ lookahead_batch = current_batch
+ lookahead_char_count = batch_char_count
+ lookahead_word_count = batch_word_count
+ lookahead_mapping = list(current_batch_mapping)
+
+ # Continue adding words until we find phrase-ending punctuation or end of line
+ while lookahead_idx < len(words):
+ lookahead_word = words[lookahead_idx]
+
+ # Add the word to lookahead batch
+ if lookahead_batch:
+ lookahead_batch += " "
+ lookahead_char_count += 1
+ lookahead_batch += lookahead_word
+ lookahead_char_count += len(lookahead_word)
+ lookahead_word_count += 1
+
+ if not lookahead_mapping or lookahead_mapping[-1][1] != i:
+ lookahead_mapping.append(
+ (
+ lookahead_char_count - len(lookahead_word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[lookahead_idx],
+ )
+ )
+
+ # Check if this word ends with phrase punctuation
+ if ends_with_phrase_punctuation(lookahead_word):
+ break
+
+ lookahead_idx += 1
+
+ # Use the lookahead batch (either found phrase end or reached end of line)
+ current_batch = lookahead_batch
+ batch_char_count = lookahead_char_count
+ batch_word_count = lookahead_word_count
+ current_batch_mapping = lookahead_mapping
+
+ # Remove 'CUSTOM' entities from the chosen_llm_entities list
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get(
+ "allow_list", allow_list or []
+ ),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=output_folder,
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx = lookahead_idx + 1
+ else:
+ # Normal case: add word to batch
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[word_idx],
+ )
+ )
+ word_idx += 1
+
+ # Process final batch if any
+ if current_batch:
+ # Remove 'CUSTOM' entities from the chosen_llm_entities list
+ llm_chosen_redact_comprehend_entities = [
+ entity for entity in chosen_llm_entities if entity != "CUSTOM"
+ ]
+ all_text_line_results, batch_input_tokens, batch_output_tokens = (
+ do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", allow_list or []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=output_folder,
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get("inference_method"),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+ elif pii_identification_method == INFERENCE_SERVER_PII_OPTION:
+ # LLM-based entity detection using inference server
+ try:
+ from tools.llm_entity_detection import do_llm_entity_detection_call
+ except ImportError as e:
+ print(f"Error importing LLM entity detection: {e}")
+ raise ImportError(
+ "LLM entity detection not available. Please ensure llm_entity_detection.py is accessible."
+ )
+
+ # Set inference method to inference-server if not already set
+ if text_analyzer_kwargs.get("inference_method") is None:
+ text_analyzer_kwargs["inference_method"] = "inference-server"
+
+ # Set API URL if not already set
+ if text_analyzer_kwargs.get("api_url") is None:
+ text_analyzer_kwargs["api_url"] = INFERENCE_SERVER_API_URL
+
+ # Set model choice if not already set - use INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+ if text_analyzer_kwargs.get("model_choice") is None:
+ text_analyzer_kwargs["model_choice"] = INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+
+ # Update model_choice to use the value from text_analyzer_kwargs
+ model_choice = text_analyzer_kwargs.get(
+ "model_choice", INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+ )
+
+ # Set LLM model name for tracking
+ llm_model_name = model_choice or ""
+
+ # Handle custom entities first (same as AWS Comprehend)
+ # Include CUSTOM/CUSTOM_FUZZY (deny list) so deny-list words are redacted when CUSTOM is selected
+ local_custom_entities = [
+ entity
+ for entity in (chosen_llm_entities or [])
+ if entity in (custom_entities or []) or entity in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ if local_custom_entities:
+ # Filter entities to only include those supported by the language
+ language_supported_entities = filter_entities_for_language(
+ local_custom_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ # Also exclude allow_list since we pass it explicitly
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ page_analyser_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ score_threshold=score_threshold,
+ return_decision_process=True,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+ all_text_line_results = map_back_entity_results(
+ page_analyser_result, page_text_mapping, all_text_line_results
+ )
+
+ # Process text in batches for LLM (same batching logic as AWS Comprehend)
+ current_batch = ""
+ current_batch_mapping = list()
+ batch_char_count = 0
+ batch_word_count = 0
+
+ for i, text_line in enumerate(line_level_text_results_list):
+ words = text_line.text.split()
+ word_start_positions = list()
+ current_pos = 0
+
+ for word in words:
+ word_start_positions.append(current_pos)
+ current_pos += len(word) + 1
+
+ word_idx = 0
+ while word_idx < len(words):
+ word = words[word_idx]
+ new_batch_char_count = len(current_batch) + len(word) + 1
+
+ # Check if we've hit the limit
+ limit_reached = (
+ batch_word_count >= DEFAULT_NEW_BATCH_WORD_COUNT
+ or new_batch_char_count >= DEFAULT_NEW_BATCH_CHAR_COUNT
+ )
+
+ if limit_reached:
+ # Add the current word to the batch first
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[word_idx],
+ )
+ )
+
+ # Check if current word ends with phrase punctuation
+ if ends_with_phrase_punctuation(word):
+ # Remove 'CUSTOM' entities from the chosen_llm_entities list
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get(
+ "allow_list", allow_list or []
+ ),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=output_folder,
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx += 1
+ else:
+ # Look ahead in current line for phrase-ending punctuation or end of line
+ lookahead_idx = word_idx + 1
+ lookahead_batch = current_batch
+ lookahead_char_count = batch_char_count
+ lookahead_word_count = batch_word_count
+ lookahead_mapping = list(current_batch_mapping)
+
+ # Continue adding words until we find phrase-ending punctuation or end of line
+ while lookahead_idx < len(words):
+ lookahead_word = words[lookahead_idx]
+
+ # Add the word to lookahead batch
+ if lookahead_batch:
+ lookahead_batch += " "
+ lookahead_char_count += 1
+ lookahead_batch += lookahead_word
+ lookahead_char_count += len(lookahead_word)
+ lookahead_word_count += 1
+
+ if not lookahead_mapping or lookahead_mapping[-1][1] != i:
+ lookahead_mapping.append(
+ (
+ lookahead_char_count - len(lookahead_word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[lookahead_idx],
+ )
+ )
+
+ # Check if this word ends with phrase punctuation
+ if ends_with_phrase_punctuation(lookahead_word):
+ break
+
+ lookahead_idx += 1
+
+ # Use the lookahead batch (either found phrase end or reached end of line)
+ current_batch = lookahead_batch
+ batch_char_count = lookahead_char_count
+ batch_word_count = lookahead_word_count
+ current_batch_mapping = lookahead_mapping
+
+ # Remove 'CUSTOM' entities from the chosen_llm_entities list
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get(
+ "allow_list", allow_list or []
+ ),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=output_folder,
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx = lookahead_idx + 1
+ else:
+ # Normal case: add word to batch
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[word_idx],
+ )
+ )
+ word_idx += 1
+
+ # Process final batch if any
+ if current_batch:
+ # Remove 'CUSTOM' entities from the chosen_llm_entities list
+ llm_chosen_redact_comprehend_entities = [
+ entity for entity in chosen_llm_entities if entity != "CUSTOM"
+ ]
+ all_text_line_results, batch_input_tokens, batch_output_tokens = (
+ do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", allow_list or []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=output_folder,
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get("inference_method"),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ elif pii_identification_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION:
+ # LLM-based entity detection using local transformers models
+ try:
+ from tools.llm_entity_detection import do_llm_entity_detection_call
+ except ImportError as e:
+ print(f"Error importing LLM entity detection: {e}")
+ raise ImportError(
+ "LLM entity detection not available. Please ensure llm_entity_detection.py is accessible."
+ )
+
+ # Set inference method to local if not already set
+ if text_analyzer_kwargs.get("inference_method") is None:
+ text_analyzer_kwargs["inference_method"] = "local"
+
+ # Set model choice if not already set - use VLM model when USE_TRANSFORMERS_VLM_MODEL_AS_LLM else LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ if text_analyzer_kwargs.get("model_choice") is None:
+ text_analyzer_kwargs["model_choice"] = (
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ if USE_TRANSFORMERS_VLM_MODEL_AS_LLM
+ else LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ )
+
+ # Update model_choice to use the value from text_analyzer_kwargs
+ model_choice = text_analyzer_kwargs.get(
+ "model_choice",
+ (
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ if USE_TRANSFORMERS_VLM_MODEL_AS_LLM
+ else LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ ),
+ )
+
+ # Handle custom entities first (same as AWS Comprehend)
+ # Include CUSTOM/CUSTOM_FUZZY (deny list) so deny-list words are redacted when CUSTOM is selected
+ local_custom_entities = [
+ entity
+ for entity in (chosen_llm_entities or [])
+ if entity in (custom_entities or []) or entity in ("CUSTOM", "CUSTOM_FUZZY")
+ ]
+
+ if local_custom_entities:
+ # Filter entities to only include those supported by the language
+ language_supported_entities = filter_entities_for_language(
+ local_custom_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ text_analyzer_kwargs["entities"] = language_supported_entities
+
+ # Filter out LLM-specific parameters that Presidio AnalyzerEngine doesn't accept
+ # Also exclude allow_list since we pass it explicitly
+ presidio_kwargs = {
+ k: v
+ for k, v in text_analyzer_kwargs.items()
+ if k
+ not in [
+ "inference_method",
+ "model_choice",
+ "api_url",
+ "local_model",
+ "tokenizer",
+ "assistant_model",
+ "client",
+ "client_config",
+ "temperature",
+ "max_tokens",
+ "custom_instructions",
+ "allow_list",
+ ]
+ }
+
+ page_analyser_result = nlp_analyser.analyze(
+ text=page_text,
+ language=language,
+ score_threshold=score_threshold,
+ return_decision_process=True,
+ allow_list=allow_list,
+ **presidio_kwargs,
+ )
+ all_text_line_results = map_back_entity_results(
+ page_analyser_result, page_text_mapping, all_text_line_results
+ )
+
+ # Process text in batches for LLM (same batching logic as AWS Comprehend)
+ current_batch = ""
+ current_batch_mapping = list()
+ batch_char_count = 0
+ batch_word_count = 0
+
+ for i, text_line in enumerate(line_level_text_results_list):
+ words = text_line.text.split()
+ word_start_positions = list()
+ current_pos = 0
+
+ for word in words:
+ word_start_positions.append(current_pos)
+ current_pos += len(word) + 1
+
+ word_idx = 0
+ while word_idx < len(words):
+ word = words[word_idx]
+ new_batch_char_count = len(current_batch) + len(word) + 1
+
+ # Check if we've hit the limit
+ limit_reached = (
+ batch_word_count >= DEFAULT_NEW_BATCH_WORD_COUNT
+ or new_batch_char_count >= DEFAULT_NEW_BATCH_CHAR_COUNT
+ )
+
+ if limit_reached:
+ # Add the current word to the batch first
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[word_idx],
+ )
+ )
+
+ # Check if current word ends with phrase punctuation
+ if ends_with_phrase_punctuation(word):
+ # Remove 'CUSTOM' entities from the chosen_llm_entities list
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get(
+ "allow_list", allow_list or []
+ ),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=output_folder,
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx += 1
+ else:
+ # Look ahead in current line for phrase-ending punctuation or end of line
+ lookahead_idx = word_idx + 1
+ lookahead_batch = current_batch
+ lookahead_char_count = batch_char_count
+ lookahead_word_count = batch_word_count
+ lookahead_mapping = list(current_batch_mapping)
+
+ # Continue adding words until we find phrase-ending punctuation or end of line
+ while lookahead_idx < len(words):
+ lookahead_word = words[lookahead_idx]
+
+ # Add the word to lookahead batch
+ if lookahead_batch:
+ lookahead_batch += " "
+ lookahead_char_count += 1
+ lookahead_batch += lookahead_word
+ lookahead_char_count += len(lookahead_word)
+ lookahead_word_count += 1
+
+ if not lookahead_mapping or lookahead_mapping[-1][1] != i:
+ lookahead_mapping.append(
+ (
+ lookahead_char_count - len(lookahead_word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[lookahead_idx],
+ )
+ )
+
+ # Check if this word ends with phrase punctuation
+ if ends_with_phrase_punctuation(lookahead_word):
+ break
+
+ lookahead_idx += 1
+
+ # Use the lookahead batch (either found phrase end or reached end of line)
+ current_batch = lookahead_batch
+ batch_char_count = lookahead_char_count
+ batch_word_count = lookahead_word_count
+ current_batch_mapping = lookahead_mapping
+
+ # Remove 'CUSTOM' entities from the chosen_llm_entities list
+ llm_chosen_redact_comprehend_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM"
+ ]
+ # Process current batch
+ (
+ all_text_line_results,
+ batch_input_tokens,
+ batch_output_tokens,
+ ) = do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get(
+ "allow_list", allow_list or []
+ ),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=output_folder,
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Reset batch
+ current_batch = ""
+ batch_word_count = 0
+ batch_char_count = 0
+ current_batch_mapping = list()
+ word_idx = lookahead_idx + 1
+ else:
+ # Normal case: add word to batch
+ if current_batch:
+ current_batch += " "
+ batch_char_count += 1
+ current_batch += word
+ batch_char_count += len(word)
+ batch_word_count += 1
+
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+ current_batch_mapping.append(
+ (
+ batch_char_count - len(word),
+ i,
+ text_line,
+ line_characters[i],
+ word_start_positions[word_idx],
+ )
+ )
+ word_idx += 1
+
+ # Process final batch if any
+ if current_batch:
+ # Remove 'CUSTOM' entities from the chosen_llm_entities list
+ llm_chosen_redact_comprehend_entities = [
+ entity for entity in chosen_llm_entities if entity != "CUSTOM"
+ ]
+ all_text_line_results, batch_input_tokens, batch_output_tokens = (
+ do_llm_entity_detection_call(
+ current_batch,
+ current_batch_mapping,
+ bedrock_runtime=bedrock_runtime,
+ language=aws_language,
+ allow_list=text_analyzer_kwargs.get("allow_list", allow_list or []),
+ chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
+ all_text_line_results=all_text_line_results,
+ model_choice=model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=output_folder,
+ batch_number=comprehend_query_number + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=text_analyzer_kwargs.get("inference_method"),
+ # local_model=text_analyzer_kwargs.get("local_model"),
+ # tokenizer=text_analyzer_kwargs.get("tokenizer"),
+ # assistant_model=text_analyzer_kwargs.get("assistant_model"),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ )
+ # Accumulate token usage
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ comprehend_query_number += 1
+
+ # Process results for each line
+ for i, text_line in enumerate(line_level_text_results_list):
+ line_results = next(
+ (results for idx, results in all_text_line_results if idx == i), []
+ )
+
+ if line_results:
+ text_line_bounding_boxes = merge_text_bounding_boxes(
+ line_results, line_characters[i]
+ )
+
+ page_analyser_results.extend(line_results)
+ page_analysed_bounding_boxes.extend(text_line_bounding_boxes)
+
+ return (
+ page_analysed_bounding_boxes,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ )
+
+
+def _char_bbox_and_text(char: Any) -> Tuple[Optional[List[float]], str]:
+ """
+ Get bbox and text from a character object. Supports both pdfminer LTChar
+ and PyMuPDF dict format {"text": ..., "bbox": [x0,y0,x1,y1], ...}.
+ Returns (bbox_list or None, text_str).
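+
+ Example (illustrative): {"text": "A", "bbox": [10.0, 700.0, 16.0, 712.0]} returns
+ ([10.0, 700.0, 16.0, 712.0], "A"); an unrecognised object returns (None, "").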
+ """
+ if isinstance(char, LTChar):
+ return (
+ getattr(char, "bbox", None),
+ getattr(char, "_text", None)
+ or (char.get_text() if callable(getattr(char, "get_text", None)) else "")
+ or "",
+ )
+ if isinstance(char, dict) and "bbox" in char:
+ bbox = char["bbox"]
+ text = char.get("text", "")
+ return (
+ bbox if isinstance(bbox, (list, tuple)) and len(bbox) >= 4 else None,
+ text,
+ )
+ return (None, "")
+
+
+def merge_text_bounding_boxes(
+ analyser_results: dict,
+ characters: List[Any],
+ combine_pixel_dist: int = 20,
+ vertical_padding: int = 0,
+):
+ """
+ Merge identified bounding boxes containing PII that are very close to one another.
+ Supports both pdfminer LTChar objects and PyMuPDF-style dicts with "bbox" and "text" keys.
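+
+ Merging example (illustrative numbers): with combine_pixel_dist=20 and MERGE_BOUNDING_BOXES
+ enabled, two results on the same baseline whose horizontal gap is 15px are merged into one
+ box; a 30px gap keeps them separate.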
+ """
+ analysed_bounding_boxes = list()
+ original_bounding_boxes = list() # List to hold original bounding boxes
+
+ if len(analyser_results) > 0 and len(characters) > 0:
+ # Extract bounding box coordinates for sorting
+ bounding_boxes = list()
+ for result in analyser_results:
+ char_boxes = []
+ char_text = []
+ for char in characters[result.start : result.end]:
+ bbox, text = _char_bbox_and_text(char)
+ if bbox is not None:
+ char_boxes.append(bbox)
+ char_text.append(text)
+ if char_boxes:
+ # Calculate the bounding box that encompasses all characters
+ left = min(box[0] for box in char_boxes)
+ bottom = min(box[1] for box in char_boxes)
+ right = max(box[2] for box in char_boxes)
+ top = max(box[3] for box in char_boxes) + vertical_padding
+ bbox = [left, bottom, right, top]
+ bounding_boxes.append(
+ (bottom, left, result, bbox, char_text)
+ ) # (y, x, result, bbox, text)
+
+ # Store original bounding boxes
+ original_bounding_boxes.append(
+ {
+ "text": "".join(char_text),
+ "boundingBox": bbox,
+ "result": copy.deepcopy(result),
+ }
+ )
+
+ # Sort the results by y-coordinate and then by x-coordinate
+ bounding_boxes.sort()
+
+ if MERGE_BOUNDING_BOXES:
+ merged_bounding_boxes = list()
+ current_box = None
+ current_y = None
+ current_result = None
+ current_text = list()
+
+ for y, x, result, next_box, text in bounding_boxes:
+ if current_y is None or current_box is None:
+ # Initialize the first bounding box
+ current_box = next_box
+ current_y = next_box[1]
+ current_result = result
+ current_text = list(text)
+ else:
+ vertical_diff_bboxes = abs(next_box[1] - current_y)
+ horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
+
+ if (
+ vertical_diff_bboxes <= 5
+ and horizontal_diff_bboxes <= combine_pixel_dist
+ ):
+ # Merge bounding boxes
+ # print("Merging boxes")
+ merged_box = current_box.copy()
+ merged_result = current_result
+ merged_text = current_text.copy()
+
+ merged_box[2] = next_box[2] # Extend horizontally
+ merged_box[3] = max(
+ current_box[3], next_box[3]
+ ) # Adjust the top
+ merged_result.end = max(
+ current_result.end, result.end
+ ) # Extend text range
+ try:
+ if current_result.entity_type != result.entity_type:
+ merged_result.entity_type = (
+ current_result.entity_type
+ + " - "
+ + result.entity_type
+ )
+ else:
+ merged_result.entity_type = current_result.entity_type
+ except Exception as e:
+ print("Unable to combine result entity types:", e)
+ if current_text:
+ merged_text.append(" ") # Add space between texts
+ merged_text.extend(text)
+
+ merged_bounding_boxes.append(
+ {
+ "text": "".join(merged_text),
+ "boundingBox": merged_box,
+ "result": merged_result,
+ }
+ )
+
+ else:
+ # Start a new bounding box
+ current_box = next_box
+ current_y = next_box[1]
+ current_result = result
+ current_text = list(text)
+
+ # Combine original and merged bounding boxes
+ analysed_bounding_boxes.extend(original_bounding_boxes)
+ analysed_bounding_boxes.extend(merged_bounding_boxes)
+ else:
+ # Keep boxes without merging
+ analysed_bounding_boxes.extend(original_bounding_boxes)
+
+ # print("Analysed bounding boxes:", analysed_bounding_boxes)
+
+ return analysed_bounding_boxes
+
+
+def recreate_page_line_level_ocr_results_with_page(
+ page_line_level_ocr_results_with_words: dict,
+):
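+ """
+ Rebuild line-level OCRResult objects from a page's line/word OCR dictionary, averaging
+ per-word confidence where available, and return {"page": page, "results": [OCRResult, ...]}.
+ """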
+ reconstructed_results = list()
+
+ # All lines in this structure belong to the same page, so the page number is read directly
+ page = page_line_level_ocr_results_with_words["page"]
+
+ for line_data in page_line_level_ocr_results_with_words["results"].values():
+ bbox = line_data["bounding_box"]
+ text = line_data["text"]
+ if line_data["line"]:
+ line_number = line_data["line"]
+ # Support both "confidence" (Textract) and "conf" (other OCR)
+ if line_data["words"]:
+ conf = sum(
+ word.get("confidence", word.get("conf", 0.0))
+ for word in line_data["words"]
+ ) / len(line_data["words"])
+ else:
+ conf = line_data.get("confidence", line_data.get("conf", 0.0))
+
+ # Recreate the OCRResult
+ line_result = OCRResult(
+ text=text,
+ left=bbox[0],
+ top=bbox[1],
+ width=bbox[2] - bbox[0],
+ height=bbox[3] - bbox[1],
+ line=line_number,
+ conf=round(float(conf), 0),
+ )
+ reconstructed_results.append(line_result)
+
+ page_line_level_ocr_results_with_page = {
+ "page": page,
+ "results": reconstructed_results,
+ }
+
+ return page_line_level_ocr_results_with_page
+
+
+_PUNCTUATION_SPLIT_RE = re.compile(r"([(\[{]*)(.*?)_?([.,?!:;)\}\]]*)$")
+
+
+def split_words_and_punctuation_from_line(
+ line_of_words: List[OCRResult],
+) -> List[OCRResult]:
+ """
+ Takes a list of OCRResult objects and splits words with trailing/leading punctuation.
+
+ For a word like "example.", it creates two new OCRResult objects for "example"
+ and "." and estimates their bounding boxes. Words with internal hyphens like
+ "high-tech" are preserved.
+ """
+ # Punctuation that will be split off. Hyphen is not included.
+
+ new_word_list = list()
+
+ for word_result in line_of_words:
+ word_text = word_result.text
+
+ # This regex finds a central "core" word, and captures leading and trailing punctuation.
+ # Compiled once to avoid re-parsing the regex on every line.
+ # Handles cases like "(word)." -> group1='(', group2='word', group3='.'
+ match = _PUNCTUATION_SPLIT_RE.match(word_text)
+
+ # Handle words with internal hyphens that might confuse the regex
+ if "-" in word_text and not match.group(2):
+ core_part_text = word_text
+ leading_punc = ""
+ trailing_punc = ""
+ elif match:
+ leading_punc, core_part_text, trailing_punc = match.groups()
+ else: # Failsafe
+ new_word_list.append(word_result)
+ continue
+
+ # If no split is needed, just add the original and continue
+ if not leading_punc and not trailing_punc:
+ new_word_list.append(word_result)
+ continue
+
+ # --- A split is required ---
+ # Estimate new bounding boxes by proportionally allocating width
+ original_width = word_result.width
+ if not word_text or original_width == 0:
+ continue # Failsafe
+
+ avg_char_width = original_width / len(word_text)
+ current_left = word_result.left
+
+ # Add leading punctuation if it exists
+ if leading_punc:
+ punc_width = avg_char_width * len(leading_punc)
+ new_word_list.append(
+ OCRResult(
+ text=leading_punc,
+ left=current_left,
+ top=word_result.top,
+ width=punc_width,
+ height=word_result.height,
+ conf=word_result.conf,
+ model=word_result.model,
+ )
+ )
+ current_left += punc_width
+
+ # Add the core part of the word
+ if core_part_text:
+ core_width = avg_char_width * len(core_part_text)
+ new_word_list.append(
+ OCRResult(
+ text=core_part_text,
+ left=current_left,
+ top=word_result.top,
+ width=core_width,
+ height=word_result.height,
+ conf=word_result.conf,
+ model=word_result.model,
+ )
+ )
+ current_left += core_width
+
+ # Add trailing punctuation if it exists
+ if trailing_punc:
+ punc_width = avg_char_width * len(trailing_punc)
+ new_word_list.append(
+ OCRResult(
+ text=trailing_punc,
+ left=current_left,
+ top=word_result.top,
+ width=punc_width,
+ height=word_result.height,
+ conf=word_result.conf,
+ model=word_result.model,
+ )
+ )
+
+ return new_word_list
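+
+
+# A minimal sketch of the splitting behaviour above (assumes OCRResult provides
+# defaults for fields not passed here, e.g. `line` and `model`; values illustrative):
+def _example_split_trailing_full_stop():
+    # "example." spanning 80px: avg_char_width is 80 / 8 = 10, so the core word
+    # keeps 7 characters' worth of width and the trailing "." gets the rest.
+    word = OCRResult(
+        text="example.", left=100, top=50, width=80, height=12, conf=95.0
+    )
+    parts = split_words_and_punctuation_from_line([word])
+    # parts[0]: text="example", left=100, width=70.0
+    # parts[1]: text=".", left=170.0, width=10.0
+    return parts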
+
+
+def create_ocr_result_with_children(
+ combined_results: dict, i: int, current_bbox: dict, current_line: list
+):
+ combined_results["text_line_" + str(i)] = {
+ "line": i,
+ "text": current_bbox.text,
+ "bounding_box": (
+ current_bbox.left,
+ current_bbox.top,
+ current_bbox.left + current_bbox.width,
+ current_bbox.top + current_bbox.height,
+ ),
+ "words": [
+ {
+ "text": word.text,
+ "bounding_box": (
+ word.left,
+ word.top,
+ word.left + word.width,
+ word.top + word.height,
+ ),
+ "conf": word.conf,
+ "model": word.model,
+ }
+ for word in current_line
+ ],
+ "conf": current_bbox.conf,
+ }
+ return combined_results["text_line_" + str(i)]
+
+
+def combine_ocr_results(
+ ocr_results: List[OCRResult],
+ x_threshold: float = 50.0,
+ y_threshold: float = 12.0,
+ page: int = 1,
+):
+ """
+ Group OCR results into lines, splitting words from punctuation.
+ """
+ if not ocr_results:
+ return {"page": page, "results": []}, {"page": page, "results": {}}
+
+ lines = list()
+ current_line = list()
+
+ for result in sorted(ocr_results, key=lambda x: (x.top, x.left)):
+ if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
+ current_line.append(result)
+ else:
+ lines.append(sorted(current_line, key=lambda x: x.left))
+ current_line = [result]
+ if current_line:
+ lines.append(sorted(current_line, key=lambda x: x.left))
+
+ page_line_level_ocr_results = list()
+ page_line_level_ocr_results_with_words = {}
+ line_counter = 1
+
+ for line in lines:
+ if not line:
+ continue
+
+ # Process the line to split punctuation from words
+ processed_line = split_words_and_punctuation_from_line(line)
+
+ # Re-calculate the line-level text and bounding box from the ORIGINAL words
+ line_text = " ".join([word.text for word in line])
+ line_left = line[0].left
+ line_top = min(word.top for word in line)
+ line_right = max(word.left + word.width for word in line)
+ line_bottom = max(word.top + word.height for word in line)
+ line_conf = round(
+ sum(word.conf for word in line) / len(line), 0
+ ) # This is mean confidence for the line
+
+ final_line_bbox = OCRResult(
+ text=line_text,
+ left=line_left,
+ top=line_top,
+ width=line_right - line_left,
+ height=line_bottom - line_top,
+ line=line_counter,
+ conf=line_conf,
+ )
+
+ page_line_level_ocr_results.append(final_line_bbox)
+
+ # Use the PROCESSED line to create the children. Creates a result within page_line_level_ocr_results_with_words
+ page_line_level_ocr_results_with_words["text_line_" + str(line_counter)] = (
+ create_ocr_result_with_children(
+ page_line_level_ocr_results_with_words,
+ line_counter,
+ final_line_bbox,
+ processed_line,
+ )
+ )
+ line_counter += 1
+
+ page_level_results_with_page = {
+ "page": page,
+ "results": page_line_level_ocr_results,
+ }
+ page_level_results_with_words = {
+ "page": page,
+ "results": page_line_level_ocr_results_with_words,
+ }
+
+ return page_level_results_with_page, page_level_results_with_words
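+
+
+# A minimal usage sketch for the line-grouping above (OCRResult field defaults
+# for `line`/`model` are assumed; coordinates are illustrative):
+def _example_combine_two_words_into_one_line():
+    words = [
+        OCRResult(text="Hello", left=10, top=20, width=50, height=12, conf=90.0),
+        OCRResult(text="world.", left=65, top=21, width=55, height=12, conf=92.0),
+    ]
+    line_results, line_results_with_words = combine_ocr_results(words, page=1)
+    # The tops differ by 1 (<= y_threshold), so both words fall on one line:
+    # line_results["results"][0].text == "Hello world.", and
+    # line_results_with_words["results"]["text_line_1"]["words"] holds
+    # "Hello", "world" and "." separately after punctuation splitting.
+    return line_results, line_results_with_words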
diff --git a/tools/data_anonymise.py b/tools/data_anonymise.py
new file mode 100644
index 0000000000000000000000000000000000000000..97a4208e49702ba4dc7f5967794a55269951d8e9
--- /dev/null
+++ b/tools/data_anonymise.py
@@ -0,0 +1,1902 @@
+import base64
+import os
+import secrets
+import time
+import unicodedata
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, List, Optional, Tuple
+
+import boto3
+import botocore
+import docx
+import gradio as gr
+import pandas as pd
+import polars as pl
+from botocore.client import BaseClient
+from faker import Faker
+from gradio import Progress
+from openpyxl import Workbook
+from presidio_analyzer import (
+ AnalyzerEngine,
+ BatchAnalyzerEngine,
+ DictAnalyzerResult,
+ RecognizerResult,
+)
+from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
+from presidio_anonymizer.entities import OperatorConfig
+
+from tools.config import (
+ AWS_ACCESS_KEY,
+ AWS_LLM_PII_OPTION,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE,
+    CLOUD_LLM_PII_MODEL_CHOICE,
+ CUSTOM_ENTITIES,
+ DEFAULT_LANGUAGE,
+ DO_INITIAL_TABULAR_DATA_CLEAN,
+ INFERENCE_SERVER_PII_OPTION,
+ LLM_MAX_NEW_TOKENS,
+ LLM_TEMPERATURE,
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ MAX_SIMULTANEOUS_FILES,
+ MAX_TABLE_COLUMNS,
+ MAX_TABLE_ROWS,
+ MAX_WORKERS,
+ OUTPUT_FOLDER,
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
+ RUN_AWS_FUNCTIONS,
+ aws_comprehend_language_choices,
+)
+from tools.helper_functions import (
+ detect_file_type,
+ get_file_name_without_type,
+ read_file,
+)
+from tools.llm_entity_detection import call_llm_for_entity_detection
+from tools.load_spacy_model_custom_recognisers import (
+ CustomWordFuzzyRecognizer,
+ create_nlp_analyser,
+ custom_word_list_recogniser,
+ load_spacy_model,
+ nlp_analyser,
+ score_threshold,
+)
+
+# Use custom version of analyze_dict to be able to track progress
+from tools.presidio_analyzer_custom import analyze_dict, analyze_iterator_custom
+from tools.secure_path_utils import secure_join
+
+# AWS Comprehend billing: 1 unit = 100 characters (entity recognition, PII, etc.)
+COMPREHEND_CHARACTERS_PER_UNIT = 100
+
+# Max concurrent API calls for Bedrock/LLM (avoid rate limits; Comprehend uses MAX_WORKERS)
+LLM_PII_MAX_CONCURRENT_REQUESTS = min(MAX_WORKERS, 10)
+
+
+def _comprehend_one_cell(
+ comprehend_client: BaseClient,
+ text_str: str,
+ language: str,
+ chosen_redact_comprehend_entities: List[str],
+ in_allow_list_flat: List[str],
+ max_retries: int = 3,
+ retry_delay: int = 3,
+) -> Tuple[List[RecognizerResult], int]:
+ """Call AWS Comprehend for one text cell. Returns (recognizer_results, query_units)."""
+ query_units = (
+ len(text_str.strip()) + COMPREHEND_CHARACTERS_PER_UNIT - 1
+ ) // COMPREHEND_CHARACTERS_PER_UNIT
+ for attempt in range(max_retries):
+ try:
+ response = comprehend_client.detect_pii_entities(
+ Text=text_str, LanguageCode=language
+ )
+ results = []
+ for entity in response["Entities"]:
+ if entity.get("Type") not in chosen_redact_comprehend_entities:
+ continue
+ entity_text = text_str[entity["BeginOffset"] : entity["EndOffset"]]
+ if in_allow_list_flat:
+ allow_list_normalized = [
+ item.strip().lower() for item in in_allow_list_flat if item
+ ]
+ if entity_text.strip().lower() in allow_list_normalized:
+ continue
+ results.append(
+ RecognizerResult(
+ entity_type=entity["Type"],
+ start=entity["BeginOffset"],
+ end=entity["EndOffset"],
+ score=entity["Score"],
+ )
+ )
+ return (results, query_units)
+ except Exception:
+ if attempt == max_retries - 1:
+ raise
+ time.sleep(retry_delay)
+ return ([], query_units)
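+
+# Worked example of the billing-unit calculation above: a 250-character cell is
+# (250 + 100 - 1) // 100 = 3 units, while an empty cell is 0 units, matching the
+# "round up to the nearest 100 characters" rule used for Comprehend billing.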
+
+
+custom_entities = CUSTOM_ENTITIES
+
+fake = Faker("en_UK")
+
+
+def fake_first_name(x):
+ return fake.first_name()
+
+
+# #### Some of my cleaning functions
+url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}"
+html_pattern_regex = r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| "
+html_start_pattern_end_dots_regex = r"<(.*?)\.\."
+non_ascii_pattern = r"[^\x00-\x7F]+"
+and_sign_regex = r"&"
+multiple_spaces_regex = r"\s{2,}"
+multiple_new_lines_regex = r"(\r\n|\n)+"
+multiple_punctuation_regex = r"(\p{P})\p{P}+"
+
+
+def initial_clean(texts: pd.Series) -> pd.Series:
+    """
+    This function cleans the text by normalising unicode, replacing smart punctuation,
+    and removing URLs, HTML tags, non-ASCII characters and excess whitespace.
+    """
+    cleaned_texts = list()
+
+    for text in texts:
+        if not text or pd.isnull(text):
+            text = ""
+
+        # Normalize unicode characters to decompose any special forms
+        normalised_text = unicodedata.normalize("NFKC", text)
+
+        # Replace smart quotes and special punctuation with standard ASCII equivalents
+        replacements = {
+            "‘": "'",
+            "’": "'",
+            "“": '"',
+            "”": '"',
+            "–": "-",
+            "—": "-",
+            "…": "...",
+            "•": "*",
+        }
+
+        # Perform replacements, accumulating each one on the running string
+        for old_char, new_char in replacements.items():
+            normalised_text = normalised_text.replace(old_char, new_char)
+
+        cleaned_texts.append(normalised_text)
+
+    # Convert to polars Series
+    texts = pl.Series(cleaned_texts).str.strip_chars()
+
+ # Define a list of patterns and their replacements
+ patterns = [
+ (multiple_new_lines_regex, " "),
+ (r"\r", ""),
+ (url_pattern, " "),
+ (html_pattern_regex, " "),
+ (html_start_pattern_end_dots_regex, " "),
+ (non_ascii_pattern, " "),
+ (multiple_spaces_regex, " "),
+ (multiple_punctuation_regex, "${1}"),
+ (and_sign_regex, "and"),
+ ]
+
+ # Apply each regex replacement
+ for pattern, replacement in patterns:
+ texts = texts.str.replace_all(pattern, replacement)
+
+ # Convert the series back to a list
+ texts = texts.to_list()
+
+ return texts
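+
+
+# A minimal usage sketch of the cleaning above (input values are illustrative):
+def _example_initial_clean():
+    dirty = pd.Series(
+        ["See  https://example.com for “details”…", "<b>Bold&nbsp;text</b>"]
+    )
+    cleaned = initial_clean(dirty)
+    # Smart quotes become straight quotes, the URL and HTML tags are removed,
+    # and repeated whitespace/punctuation is collapsed (see the patterns above).
+    return cleaned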
+
+
+def process_recognizer_result(
+ result: RecognizerResult,
+ recognizer_result: RecognizerResult,
+ data_row: int,
+ dictionary_key: int,
+ df_dict: Dict[str, List[Any]],
+ keys_to_keep: List[str],
+) -> Tuple[List[str], List[Dict[str, Any]]]:
+ output = list()
+ output_dicts = list()
+
+ if hasattr(result, "value"):
+ text = result.value[data_row]
+ else:
+ text = ""
+
+ if isinstance(recognizer_result, list):
+ for sub_result in recognizer_result:
+ if isinstance(text, str):
+ found_text = text[sub_result.start : sub_result.end]
+ else:
+ found_text = ""
+ analysis_explanation = {
+ key: sub_result.__dict__[key] for key in keys_to_keep
+ }
+ analysis_explanation.update(
+ {
+ "data_row": str(data_row),
+ "column": list(df_dict.keys())[dictionary_key],
+ "entity": found_text,
+ }
+ )
+ output.append(str(analysis_explanation))
+ output_dicts.append(analysis_explanation)
+
+ return output, output_dicts
+
+
+# Writing decision making process to file
+def generate_log(
+ analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]
+) -> Tuple[str, pd.DataFrame]:
+ """
+ Generate a detailed output of the decision process for entity recognition.
+
+ This function takes the results from the analyzer and the original data dictionary,
+ and produces a string output detailing the decision process for each recognized entity.
+ It includes information such as entity type, position, confidence score, and the context
+ in which the entity was found.
+
+ Args:
+ analyzer_results (List[DictAnalyzerResult]): The results from the entity analyzer.
+ df_dict (Dict[str, List[Any]]): The original data in dictionary format.
+
+ Returns:
+ Tuple[str, pd.DataFrame]: A tuple containing the string output and DataFrame with all columns.
+ """
+ decision_process_output = list()
+ decision_process_output_dicts = list() # New list to store dictionaries
+ keys_to_keep = ["entity_type", "start", "end"]
+
+ # Run through each column to analyse for PII
+ for i, result in enumerate(analyzer_results):
+
+ # If a single result
+ if isinstance(result, RecognizerResult):
+ output, output_dicts = process_recognizer_result(
+ result, result, 0, i, df_dict, keys_to_keep
+ )
+ decision_process_output.extend(output)
+ decision_process_output_dicts.extend(output_dicts)
+
+ # If a list of results
+ elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
+ for x, recognizer_result in enumerate(result.recognizer_results):
+ output, output_dicts = process_recognizer_result(
+ result, recognizer_result, x, i, df_dict, keys_to_keep
+ )
+ decision_process_output.extend(output)
+ decision_process_output_dicts.extend(output_dicts)
+
+ else:
+ try:
+ output, output_dicts = process_recognizer_result(
+ result, result, 0, i, df_dict, keys_to_keep
+ )
+ decision_process_output.extend(output)
+ decision_process_output_dicts.extend(output_dicts)
+ except Exception as e:
+ print(e)
+
+ decision_process_output_str = "\n".join(decision_process_output)
+ decision_process_output_df = pd.DataFrame(decision_process_output_dicts)
+
+ return decision_process_output_str, decision_process_output_df
+
+
+def anon_consistent_names(df: pd.DataFrame) -> pd.DataFrame:
+ # ## Pick out common names and replace them with the same person value
+ df_dict = df.to_dict(orient="list")
+
+ # analyzer = AnalyzerEngine()
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
+
+ analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
+ analyzer_results = list(analyzer_results)
+
+ text = analyzer_results[3].value
+
+ recognizer_result = str(analyzer_results[3].recognizer_results)
+
+ data_str = recognizer_result # abbreviated for brevity
+
+ # Adjusting the parse_dict function to handle trailing ']'
+ # Splitting the main data string into individual list strings
+ list_strs = data_str[1:-1].split("], [")
+
+ def parse_dict(s):
+ s = s.strip("[]") # Removing any surrounding brackets
+ items = s.split(", ")
+ d = {}
+ for item in items:
+ key, value = item.split(": ")
+ if key == "score":
+ d[key] = float(value)
+ elif key in ["start", "end"]:
+ d[key] = int(value)
+ else:
+ d[key] = value
+ return d
+
+ # Re-running the improved processing code
+
+ result = list()
+
+ for lst_str in list_strs:
+ # Splitting each list string into individual dictionary strings
+ dict_strs = lst_str.split(", type: ")
+ dict_strs = [dict_strs[0]] + [
+ "type: " + s for s in dict_strs[1:]
+ ] # Prepending "type: " back to the split strings
+
+ # Parsing each dictionary string
+ dicts = [parse_dict(d) for d in dict_strs]
+ result.append(dicts)
+
+ names = list()
+
+ for idx, paragraph in enumerate(text):
+ paragraph_texts = list()
+ for dictionary in result[idx]:
+ if dictionary["type"] == "PERSON":
+ paragraph_texts.append(
+ paragraph[dictionary["start"] : dictionary["end"]]
+ )
+ names.append(paragraph_texts)
+
+ # Flatten the list of lists and extract unique names
+ unique_names = list(set(name for sublist in names for name in sublist))
+
+ fake_names = pd.Series(unique_names).apply(fake_first_name)
+
+ mapping_df = pd.DataFrame(
+ data={"Unique names": unique_names, "Fake names": fake_names}
+ )
+
+ # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
+ name_map = {
+ r"\b" + k + r"\b": v
+ for k, v in zip(mapping_df["Unique names"], mapping_df["Fake names"])
+ }
+
+    scrubbed_df_consistent_names = df.replace(name_map, regex=True)
+
+ return scrubbed_df_consistent_names
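+
+
+# A small sketch of the word-boundary replacement used above (names illustrative):
+def _example_name_map_replacement():
+    # Wrapping keys in \b ensures "Ann" is replaced only as a whole word, so
+    # "Annabel" is left untouched.
+    name_map = {r"\bAnn\b": "Kate", r"\bBob\b": "James"}
+    df = pd.DataFrame({"text": ["Ann emailed Bob about Annabel."]})
+    anonymised = df.replace(name_map, regex=True)
+    # -> "Kate emailed James about Annabel."
+    return anonymised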
+
+
+def handle_docx_anonymisation(
+ file_path: str,
+ output_folder: str,
+ anon_strategy: str,
+ chosen_redact_entities: List[str],
+ in_allow_list: List[str],
+ in_deny_list: List[str],
+ max_fuzzy_spelling_mistakes_num: int,
+ pii_identification_method: str,
+ chosen_redact_comprehend_entities: List[str],
+ comprehend_query_number: int,
+ comprehend_client: BaseClient,
+ language: Optional[str] = DEFAULT_LANGUAGE,
+ out_file_paths: List[str] = list(),
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
+):
+ """
+ Anonymises a .docx file by extracting text, processing it, and re-inserting it.
+
+ Returns:
+ A tuple containing the output file path and the log file path.
+ """
+
+ # 1. Load the document and extract text elements
+ doc = docx.Document(file_path)
+ text_elements = (
+ list()
+ ) # This will store the actual docx objects (paragraphs, cells)
+ original_texts = list() # This will store the text from those objects
+
+ paragraph_count = len(doc.paragraphs)
+
+ if paragraph_count > MAX_TABLE_ROWS:
+ out_message = f"Number of paragraphs in document is greater than {MAX_TABLE_ROWS}. Please submit a smaller document."
+ print(out_message)
+ raise Exception(out_message)
+
+ # Extract from paragraphs
+ for para in doc.paragraphs:
+ if para.text.strip(): # Only process non-empty paragraphs
+ text_elements.append(para)
+ original_texts.append(para.text)
+
+ # Extract from tables
+ for table in doc.tables:
+ for row in table.rows:
+ for cell in row.cells:
+ if cell.text.strip(): # Only process non-empty cells
+ text_elements.append(cell)
+ original_texts.append(cell.text)
+
+ # If there's no text to process, return early
+ if not original_texts:
+ print(f"No text found in {file_path}. Skipping.")
+        return out_file_paths, comprehend_query_number, 0, 0, ""
+
+ # 2. Convert to a DataFrame for the existing anonymisation script
+ df_to_anonymise = pd.DataFrame({"text_to_redact": original_texts})
+
+ # 3. Call the core anonymisation script
+ (
+ anonymised_df,
+ _,
+ decision_log,
+ comprehend_query_number,
+ decision_process_output_df,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ llm_model_name,
+ ) = anonymise_script(
+ df=df_to_anonymise,
+ anon_strategy=anon_strategy,
+ language=language,
+ chosen_redact_entities=chosen_redact_entities,
+ in_allow_list=in_allow_list,
+ in_deny_list=in_deny_list,
+ max_fuzzy_spelling_mistakes_num=max_fuzzy_spelling_mistakes_num,
+ pii_identification_method=pii_identification_method,
+ chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
+ comprehend_query_number=comprehend_query_number,
+ comprehend_client=comprehend_client,
+ nlp_analyser=nlp_analyser,
+ output_folder=output_folder,
+ )
+
+ anonymised_texts = anonymised_df["text_to_redact"].tolist()
+
+ # 4. Re-insert the anonymised text back into the document objects
+ for element, new_text in zip(text_elements, anonymised_texts):
+ if isinstance(element, docx.text.paragraph.Paragraph):
+ # Clear existing content (runs) and add the new text in a single new run
+ element.clear()
+ element.add_run(new_text)
+ elif isinstance(element, docx.table._Cell):
+ # For cells, setting .text works similarly
+ element.text = new_text
+
+ # 5. Save the redacted document and the log file
+ base_name = os.path.basename(file_path)
+ file_name_without_ext = os.path.splitext(base_name)[0]
+
+ output_docx_path = secure_join(
+ output_folder, f"{file_name_without_ext}_redacted.docx"
+ )
+
+ out_file_paths.append(output_docx_path)
+
+ output_xlsx_path = secure_join(
+ output_folder, f"{file_name_without_ext}_redacted.csv"
+ )
+
+ anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig", index=None)
+ doc.save(output_docx_path)
+
+ out_file_paths.append(output_xlsx_path)
+
+ # Reconstruct log_file_path for return value
+ log_file_path = secure_join(
+ output_folder, f"{file_name_without_ext}_redacted_log.csv"
+ )
+
+ decision_process_output_df.to_csv(log_file_path, index=None, encoding="utf-8-sig")
+
+ out_file_paths.append(log_file_path)
+
+ return (
+ out_file_paths,
+ comprehend_query_number,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ llm_model_name,
+ )
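+
+
+# A minimal usage sketch for the docx path above (local PII detection, so no AWS
+# client is needed; file paths and entity names are illustrative):
+def _example_anonymise_docx():
+    return handle_docx_anonymisation(
+        file_path="input/letter.docx",
+        output_folder="output/",
+        anon_strategy="replace with 'REDACTED'",
+        chosen_redact_entities=["PERSON", "EMAIL_ADDRESS"],
+        in_allow_list=[],
+        in_deny_list=[],
+        max_fuzzy_spelling_mistakes_num=0,
+        pii_identification_method="Local",
+        chosen_redact_comprehend_entities=[],
+        comprehend_query_number=0,
+        comprehend_client="",
+    )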
+
+
+def anonymise_files_with_open_text(
+ file_paths: List[str],
+ in_text: str,
+ anon_strategy: str,
+ chosen_cols: List[str],
+ chosen_redact_entities: List[str],
+ in_allow_list: List[str] = None,
+ latest_file_completed: int = 0,
+ out_message: list = list(),
+ out_file_paths: list = list(),
+ log_files_output_paths: list = list(),
+ in_excel_sheets: list = list(),
+ first_loop_state: bool = False,
+ output_folder: str = OUTPUT_FOLDER,
+ in_deny_list: list[str] = list(),
+ max_fuzzy_spelling_mistakes_num: int = 0,
+ pii_identification_method: str = "Local",
+ chosen_redact_comprehend_entities: List[str] = list(),
+ comprehend_query_number: int = 0,
+ aws_access_key_textbox: str = "",
+ aws_secret_key_textbox: str = "",
+ actual_time_taken_number: float = 0,
+ do_initial_clean: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
+ language: Optional[str] = None,
+ progress: Progress = Progress(track_tqdm=True),
+ custom_llm_instructions: str = "",
+ chosen_llm_entities: Optional[List[str]] = None,
+):
+ """
+ This function anonymises data files based on the provided parameters.
+
+ Parameters:
+ - file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
+ - in_text (str): The text to anonymise if file_paths is 'open_text'.
+ - anon_strategy (str): The anonymisation strategy to use.
+ - chosen_cols (List[str]): A list of column names to anonymise.
+ - chosen_redact_entities (List[str]): A list of entities to redact.
+ - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
+ - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
+ - out_message (list, optional): A list to store output messages. Defaults to an empty list.
+ - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
+ - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
+ - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
+ - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
+ - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
+ - in_deny_list (list[str], optional): A list of specific terms to redact.
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+ - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
+ - comprehend_query_number (int, optional): A counter for AWS Comprehend usage in units of 100 characters (1 unit = 100 characters, per AWS billing). Defaults to 0.
+ - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
+ - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
+ - actual_time_taken_number (float, optional): Time taken to do the redaction.
+ - language (str, optional): The language of the text to anonymise.
+ - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
+    - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to DO_INITIAL_TABULAR_DATA_CLEAN from config.
+ - custom_llm_instructions (str, optional): Custom instructions for LLM entity detection (tabular). Defaults to "".
+ - chosen_llm_entities (List[str], optional): Entity types to detect when using LLM PII method (tabular). Defaults to None (uses chosen_redact_comprehend_entities).
+ """
+
+ tic = time.perf_counter()
+ comprehend_client = ""
+ out_message_out = ""
+ llm_total_input_tokens = 0
+ llm_total_output_tokens = 0
+ llm_model_name = ""
+
+ # Normalise LLM params (Gradio may send None or single value)
+ if custom_llm_instructions is None:
+ custom_llm_instructions = ""
+ if chosen_llm_entities is not None and not isinstance(chosen_llm_entities, list):
+ chosen_llm_entities = [chosen_llm_entities] if chosen_llm_entities else None
+
+ # If output folder doesn't end with a forward slash, add one
+ if not output_folder.endswith("/"):
+ output_folder = output_folder + "/"
+
+ # Use provided language or default
+ language = language or DEFAULT_LANGUAGE
+
+ if pii_identification_method == "AWS Comprehend":
+ if language not in aws_comprehend_language_choices:
+ out_message = f"Please note that this language is not supported by AWS Comprehend: {language}"
+ raise Warning(out_message)
+
+ # If this is the first time around, set variables to 0/blank
+ if first_loop_state is True:
+ latest_file_completed = 0
+ out_message = list()
+ out_file_paths = list()
+
+ # Load file
+ # If out message or out_file_paths are blank, change to a list so it can be appended to
+ if isinstance(out_message, str):
+ out_message = [out_message]
+
+ if isinstance(log_files_output_paths, str):
+ log_files_output_paths = list()
+
+ if not out_file_paths:
+ out_file_paths = list()
+
+ # Handle both list (new Dropdown format) and DataFrame (legacy)
+ if isinstance(in_allow_list, list):
+ # Dropdown component returns a list directly
+ in_allow_list_flat = (
+ [str(item) for item in in_allow_list if item] if in_allow_list else list()
+ )
+ elif isinstance(in_allow_list, pd.DataFrame):
+ if not in_allow_list.empty:
+ in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
+ else:
+ in_allow_list_flat = list()
+ else:
+ in_allow_list_flat = list()
+
+ anon_df = pd.DataFrame()
+
+ # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
+ if pii_identification_method == "AWS Comprehend":
+ print("Trying to connect to AWS Comprehend service")
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Comprehend via existing SSO connection")
+ comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Comprehend using AWS access key and secret keys from textboxes."
+ )
+ comprehend_client = boto3.client(
+ "comprehend",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Comprehend via existing SSO connection")
+ comprehend_client = boto3.client("comprehend")
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print("Getting Comprehend credentials from environment variables")
+ comprehend_client = boto3.client(
+ "comprehend",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ )
+ else:
+ comprehend_client = ""
+ out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
+            raise Exception(out_message)
+
+ # Create Bedrock runtime client when using LLM-based PII detection with AWS Bedrock
+ bedrock_runtime = None
+ if pii_identification_method == AWS_LLM_PII_OPTION:
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Bedrock using AWS access key and secret keys from user input."
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=AWS_REGION,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print("Getting Bedrock credentials from environment variables")
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=AWS_REGION,
+ )
+ else:
+ out_message = "Cannot connect to AWS Bedrock service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
+ print(out_message)
+ raise Exception(out_message)
+
+ # Check if files and text exist
+ if not file_paths:
+ if in_text:
+ file_paths = ["open_text"]
+ else:
+ out_message = "Please enter text or a file to redact."
+ raise Exception(out_message)
+
+ if not isinstance(file_paths, list):
+ file_paths = [file_paths]
+
+ if len(file_paths) > MAX_SIMULTANEOUS_FILES:
+ out_message = f"Number of files to anonymise is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+ print(out_message)
+ raise Exception(out_message)
+
+ # If we have already redacted the last file, return the input out_message and file list to the relevant components
+ if latest_file_completed >= len(file_paths):
+ print("Last file reached") # , returning files:", str(latest_file_completed))
+ # Set to a very high number so as not to mess with subsequent file processing by the user
+ # latest_file_completed = 99
+ final_out_message = "\n".join(out_message)
+
+ gr.Info(final_out_message)
+
+ return (
+ final_out_message,
+ out_file_paths,
+ out_file_paths,
+ latest_file_completed,
+ log_files_output_paths,
+ log_files_output_paths,
+ actual_time_taken_number,
+ comprehend_query_number,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ llm_model_name,
+ )
+
+ file_path_loop = [file_paths[int(latest_file_completed)]]
+
+ for anon_file in progress.tqdm(
+ file_path_loop, desc="Anonymising files", unit="files"
+ ):
+
+ # Get a string file path
+        file_path = anon_file
+
+ if anon_file == "open_text":
+ anon_df = pd.DataFrame(data={"text": [in_text]})
+ chosen_cols = ["text"]
+ out_file_part = anon_file
+ sheet_name = ""
+ file_type = ""
+
+ (
+ out_file_paths,
+ out_message,
+ key_string,
+ log_files_output_paths,
+ comprehend_query_number,
+ tbl_llm_in,
+ tbl_llm_out,
+ tbl_llm_model,
+ ) = tabular_anonymise_wrapper_func(
+ file_path,
+ anon_df,
+ chosen_cols,
+ out_file_paths,
+ out_file_part,
+ out_message,
+ sheet_name,
+ anon_strategy,
+ language,
+ chosen_redact_entities,
+ in_allow_list,
+ file_type,
+ "",
+ log_files_output_paths,
+ in_deny_list,
+ max_fuzzy_spelling_mistakes_num,
+                pii_identification_method,
+                language,
+                chosen_redact_comprehend_entities,
+ comprehend_query_number,
+ comprehend_client,
+ output_folder=output_folder,
+ do_initial_clean=do_initial_clean,
+ bedrock_runtime=bedrock_runtime,
+ custom_llm_instructions=custom_llm_instructions,
+ chosen_llm_entities=chosen_llm_entities,
+ )
+ llm_total_input_tokens += tbl_llm_in
+ llm_total_output_tokens += tbl_llm_out
+ if tbl_llm_model and not llm_model_name:
+ llm_model_name = tbl_llm_model
+ else:
+ # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
+ file_type = detect_file_type(file_path)
+ # print("File type is:", file_type)
+
+ out_file_part = get_file_name_without_type(file_path)
+
+ if file_type == "docx":
+ (
+ out_file_paths,
+ comprehend_query_number,
+ docx_llm_in,
+ docx_llm_out,
+ docx_llm_model,
+ ) = handle_docx_anonymisation(
+ file_path=file_path,
+ output_folder=output_folder,
+ anon_strategy=anon_strategy,
+ chosen_redact_entities=chosen_redact_entities,
+ in_allow_list=in_allow_list_flat,
+ in_deny_list=in_deny_list,
+ max_fuzzy_spelling_mistakes_num=max_fuzzy_spelling_mistakes_num,
+ pii_identification_method=pii_identification_method,
+ chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
+ comprehend_query_number=comprehend_query_number,
+ comprehend_client=comprehend_client,
+ language=language,
+ out_file_paths=out_file_paths,
+ )
+ llm_total_input_tokens += docx_llm_in
+ llm_total_output_tokens += docx_llm_out
+ if docx_llm_model and not llm_model_name:
+ llm_model_name = docx_llm_model
+
+ elif file_type == "xlsx":
+ # print("Running through all xlsx sheets")
+ if not in_excel_sheets:
+ out_message.append(
+ "No Excel sheets selected. Please select at least one to anonymise."
+ )
+ continue
+
+ # Create xlsx file:
+ anon_xlsx = pd.ExcelFile(file_path)
+ anon_xlsx_export_file_name = (
+ output_folder + out_file_part + "_redacted.xlsx"
+ )
+
+ # Iterate through the sheet names
+ for sheet_name in progress.tqdm(
+ in_excel_sheets, desc="Anonymising sheets", unit="sheets"
+ ):
+ # Read each sheet into a DataFrame
+ if sheet_name not in anon_xlsx.sheet_names:
+ continue
+
+ anon_df = pd.read_excel(file_path, sheet_name=sheet_name)
+
+ (
+ out_file_paths,
+ out_message,
+ key_string,
+ log_files_output_paths,
+ comprehend_query_number,
+ tbl_llm_in,
+ tbl_llm_out,
+ tbl_llm_model,
+ ) = tabular_anonymise_wrapper_func(
+ anon_file,
+ anon_df,
+ chosen_cols,
+ out_file_paths,
+ out_file_part,
+ out_message,
+ sheet_name,
+ anon_strategy,
+ language,
+ chosen_redact_entities,
+ in_allow_list,
+ file_type,
+ anon_xlsx_export_file_name,
+ log_files_output_paths,
+ in_deny_list,
+ max_fuzzy_spelling_mistakes_num,
+ pii_identification_method,
+ language,
+ chosen_redact_comprehend_entities,
+ comprehend_query_number,
+ comprehend_client,
+ output_folder=output_folder,
+ do_initial_clean=do_initial_clean,
+ bedrock_runtime=bedrock_runtime,
+ custom_llm_instructions=custom_llm_instructions,
+ chosen_llm_entities=chosen_llm_entities,
+ )
+ llm_total_input_tokens += tbl_llm_in
+ llm_total_output_tokens += tbl_llm_out
+ if tbl_llm_model and not llm_model_name:
+ llm_model_name = tbl_llm_model
+
+ else:
+ sheet_name = ""
+ anon_df = read_file(file_path)
+ out_file_part = get_file_name_without_type(file_path)
+
+ (
+ out_file_paths,
+ out_message,
+ key_string,
+ log_files_output_paths,
+ comprehend_query_number,
+ tbl_llm_in,
+ tbl_llm_out,
+ tbl_llm_model,
+ ) = tabular_anonymise_wrapper_func(
+ anon_file,
+ anon_df,
+ chosen_cols,
+ out_file_paths,
+ out_file_part,
+ out_message,
+ sheet_name,
+ anon_strategy,
+ language,
+ chosen_redact_entities,
+ in_allow_list,
+ file_type,
+ "",
+ log_files_output_paths,
+ in_deny_list,
+ max_fuzzy_spelling_mistakes_num,
+ pii_identification_method,
+ language,
+ chosen_redact_comprehend_entities,
+ comprehend_query_number,
+ comprehend_client,
+ output_folder=output_folder,
+ do_initial_clean=do_initial_clean,
+ bedrock_runtime=bedrock_runtime,
+ custom_llm_instructions=custom_llm_instructions,
+ chosen_llm_entities=chosen_llm_entities,
+ )
+ llm_total_input_tokens += tbl_llm_in
+ llm_total_output_tokens += tbl_llm_out
+ if tbl_llm_model and not llm_model_name:
+ llm_model_name = tbl_llm_model
+
+ out_message_out = ""
+
+ # Increase latest file completed count unless we are at the last file
+ if latest_file_completed != len(file_paths):
+ print("Completed file number:", str(latest_file_completed))
+ latest_file_completed += 1
+
+ toc = time.perf_counter()
+ out_time_float = toc - tic
+ out_time = f"in {out_time_float:0.1f} seconds."
+ print(out_time)
+
+ actual_time_taken_number += out_time_float
+ actual_time_taken_number = round(actual_time_taken_number, 1)
+
+ if isinstance(out_message, str):
+ out_message = [out_message]
+
+ out_message.append(
+ "Anonymisation of file '" + out_file_part + "' successfully completed in"
+ )
+
+ out_message_out = "\n".join(out_message)
+ out_message_out = out_message_out + " " + out_time
+
+ if anon_strategy == "encrypt":
+            out_message_out = out_message_out + ". Your decryption key is " + key_string
+
+ out_message_out = (
+ out_message_out
+ + "\n\nPlease give feedback on the results below to help improve this app."
+ )
+
+ from tools.secure_regex_utils import safe_remove_leading_newlines
+
+ out_message_out = safe_remove_leading_newlines(out_message_out)
+ out_message_out = out_message_out.lstrip(". ")
+
+ return (
+ out_message_out,
+ out_file_paths,
+ out_file_paths,
+ latest_file_completed,
+ log_files_output_paths,
+ log_files_output_paths,
+ actual_time_taken_number,
+ comprehend_query_number,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ llm_model_name,
+ )
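+
+
+# A minimal usage sketch for the open-text path above (arguments illustrative;
+# most parameters keep their defaults):
+def _example_anonymise_open_text():
+    results = anonymise_files_with_open_text(
+        file_paths=[],
+        in_text="Contact Jane Doe on 07700 900123.",
+        anon_strategy="replace with 'REDACTED'",
+        chosen_cols=["text"],
+        chosen_redact_entities=["PERSON", "PHONE_NUMBER"],
+        pii_identification_method="Local",
+        first_loop_state=True,
+    )
+    # results[0] is the output message containing the redacted text; results[1]
+    # holds the paths of the anonymised CSV and log files.
+    return results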
+
+
+def tabular_anonymise_wrapper_func(
+ anon_file: str,
+ anon_df: pd.DataFrame,
+ chosen_cols: List[str],
+ out_file_paths: List[str],
+ out_file_part: str,
+ out_message: str,
+ excel_sheet_name: str,
+ anon_strategy: str,
+ language: str,
+ chosen_redact_entities: List[str],
+ in_allow_list: List[str],
+ file_type: str,
+ anon_xlsx_export_file_name: str,
+ log_files_output_paths: List[str],
+ in_deny_list: List[str] = list(),
+ max_fuzzy_spelling_mistakes_num: int = 0,
+ pii_identification_method: str = "Local",
+ comprehend_language: Optional[str] = None,
+ chosen_redact_comprehend_entities: List[str] = list(),
+ comprehend_query_number: int = 0,
+ comprehend_client: botocore.client.BaseClient = "",
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
+ output_folder: str = OUTPUT_FOLDER,
+ do_initial_clean: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
+ bedrock_runtime=None,
+ custom_llm_instructions: str = "",
+ chosen_llm_entities: Optional[List[str]] = None,
+):
+ """
+ This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
+
+ Input Variables:
+ - anon_file: The path to the file containing the data to be anonymized.
+ - anon_df: The pandas DataFrame containing the data to be anonymized.
+ - chosen_cols: A list of column names to be anonymized.
+ - out_file_paths: A list of paths where the anonymized files will be saved.
+ - out_file_part: A part of the output file name.
+ - out_message: A message to be displayed during the anonymization process.
+ - excel_sheet_name: The name of the Excel sheet where the anonymized data will be exported.
+ - anon_strategy: The anonymization strategy to be applied.
+ - language: The language of the data to be anonymized.
+ - chosen_redact_entities: A list of entities to be redacted.
+ - in_allow_list: A list of allowed values.
+ - file_type: The type of file to be exported.
+ - anon_xlsx_export_file_name: The name of the anonymized Excel file.
+ - log_files_output_paths: A list of paths where the log files will be saved.
+ - in_deny_list: List of specific terms to remove from the data.
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+ - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
+ - comprehend_query_number (int, optional): A counter for AWS Comprehend usage in units of 100 characters (1 unit = 100 characters, per AWS billing). Defaults to 0.
+ - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
+ - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
+    - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to DO_INITIAL_TABULAR_DATA_CLEAN from config.
+ """
+
+ def check_lists(list1, list2):
+ return any(string in list2 for string in list1)
+
+ def get_common_strings(list1, list2):
+ """
+ Finds the common strings between two lists.
+
+ Args:
+ list1: The first list of strings.
+ list2: The second list of strings.
+
+ Returns:
+ A list containing the common strings.
+ """
+ common_strings = list()
+ for string in list1:
+ if string in list2:
+ common_strings.append(string)
+ return common_strings
+
+ if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
+        raise Exception(
+            "Connection to AWS Comprehend service not found, please check connection details."
+        )
+
+ # Check for chosen col, skip file if not found
+ all_cols_original_order = list(anon_df.columns)
+
+ any_cols_found = check_lists(chosen_cols, all_cols_original_order)
+
+ if any_cols_found is False:
+ out_message = "No chosen columns found in dataframe: " + out_file_part
+ key_string = ""
+ print(out_message)
+        return (
+            out_file_paths,
+            out_message,
+            key_string,
+            log_files_output_paths,
+            comprehend_query_number,
+            0,
+            0,
+            "",
+        )
+ else:
+ chosen_cols_in_anon_df = get_common_strings(
+ chosen_cols, all_cols_original_order
+ )
+
+ # Split dataframe to keep only selected columns
+ # print("Remaining columns to redact:", chosen_cols_in_anon_df)
+
+ if not anon_df.index.is_unique:
+ anon_df = anon_df.reset_index(drop=True)
+
+ anon_df_part = anon_df[chosen_cols_in_anon_df]
+ anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis=1)
+
+ row_count = anon_df_part.shape[0]
+
+ if row_count > MAX_TABLE_ROWS:
+ out_message = f"Number of rows in dataframe is greater than {MAX_TABLE_ROWS}. Please submit a smaller dataframe."
+ print(out_message)
+ raise Exception(out_message)
+
+ column_count = anon_df_part.shape[1]
+
+ if column_count > MAX_TABLE_COLUMNS:
+ out_message = f"Number of columns in dataframe is greater than {MAX_TABLE_COLUMNS}. Please submit a smaller dataframe."
+ print(out_message)
+ raise Exception(out_message)
+
+ # Anonymise the selected columns
+ (
+ anon_df_part_out,
+ key_string,
+ decision_process_output_str,
+ comprehend_query_number,
+ decision_process_output_df,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ llm_model_name,
+ ) = anonymise_script(
+ anon_df_part,
+ anon_strategy,
+ language,
+ chosen_redact_entities,
+ in_allow_list,
+ in_deny_list,
+ max_fuzzy_spelling_mistakes_num,
+ pii_identification_method,
+ chosen_redact_comprehend_entities,
+ comprehend_query_number,
+ comprehend_client,
+ nlp_analyser=nlp_analyser,
+ do_initial_clean=do_initial_clean,
+ bedrock_runtime=bedrock_runtime,
+ file_name=out_file_part,
+ sheet_name=excel_sheet_name if excel_sheet_name else None,
+ output_folder=output_folder,
+ custom_llm_instructions=custom_llm_instructions,
+ chosen_llm_entities=chosen_llm_entities,
+ )
+
+ anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
+
+ # Rejoin the dataframe together
+ anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis=1)
+ # Reorder to match original column order; add any missing columns as empty
+ # (avoids KeyError when e.g. chosen_cols referred to columns from another sheet/file)
+ missing_cols = [c for c in all_cols_original_order if c not in anon_df_out.columns]
+ if missing_cols:
+ for c in missing_cols:
+ anon_df_out[c] = ""
+ anon_df_out = anon_df_out[all_cols_original_order]
+
+ # Export file
+ # Rename anonymisation strategy for file path naming
+ if anon_strategy == "replace with 'REDACTED'":
+ anon_strat_txt = "redact_replace"
+ elif anon_strategy == "replace with ":
+ anon_strat_txt = "redact_entity_type"
+ elif anon_strategy == "redact completely":
+ anon_strat_txt = "redact_remove"
+ else:
+ anon_strat_txt = anon_strategy
+
+ # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
+ if file_type == "xlsx":
+
+ anon_export_file_name = anon_xlsx_export_file_name
+
+ if not os.path.exists(anon_xlsx_export_file_name):
+ wb = Workbook()
+ ws = wb.active # Get the default active sheet
+ ws.title = excel_sheet_name
+ wb.save(anon_xlsx_export_file_name)
+
+ # Create a Pandas Excel writer using XlsxWriter as the engine.
+ with pd.ExcelWriter(
+ anon_xlsx_export_file_name,
+ engine="openpyxl",
+ mode="a",
+ if_sheet_exists="replace",
+ ) as writer:
+ # Write each DataFrame to a different worksheet.
+ anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
+
+ decision_process_log_output_file = (
+ anon_xlsx_export_file_name + "_" + excel_sheet_name + "_log.csv"
+ )
+
+ decision_process_output_df.to_csv(
+ decision_process_log_output_file, index=None, encoding="utf-8-sig"
+ )
+
+ else:
+ anon_export_file_name = (
+ output_folder + out_file_part + "_anon_" + anon_strat_txt + ".csv"
+ )
+ anon_df_out.to_csv(anon_export_file_name, index=None, encoding="utf-8-sig")
+
+ decision_process_log_output_file = anon_export_file_name + "_log.csv"
+
+ decision_process_output_df.to_csv(
+ decision_process_log_output_file, index=None, encoding="utf-8-sig"
+ )
+
+ out_file_paths.append(anon_export_file_name)
+ out_file_paths.append(decision_process_log_output_file)
+
+ # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
+ out_file_paths = list(set(out_file_paths))
+
+ # Print result text to output text box if just anonymising open text
+ if anon_file == "open_text":
+ out_message = ["'" + anon_df_out["text"][0] + "'"]
+
+ return (
+ out_file_paths,
+ out_message,
+ key_string,
+ log_files_output_paths,
+ comprehend_query_number,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ llm_model_name,
+ )
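+
+# Worked example of the export naming above: anonymising "survey.csv" with the
+# "redact completely" strategy and output_folder "output/" writes
+# "output/survey_anon_redact_remove.csv" plus a matching "..._log.csv" file;
+# xlsx inputs are instead written back sheet-by-sheet into "<name>_redacted.xlsx".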
+
+
+def anonymise_script(
+ df: pd.DataFrame,
+ anon_strategy: str,
+ language: str,
+ chosen_redact_entities: List[str],
+ in_allow_list: List[str] = list(),
+ in_deny_list: List[str] = list(),
+ max_fuzzy_spelling_mistakes_num: int = 0,
+ pii_identification_method: str = "Local",
+ chosen_redact_comprehend_entities: List[str] = list(),
+ comprehend_query_number: int = 0,
+ comprehend_client: botocore.client.BaseClient = "",
+ custom_entities: List[str] = custom_entities,
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
+ do_initial_clean: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
+ progress: Progress = Progress(track_tqdm=True),
+ bedrock_runtime=None,
+ model_choice: str = CLOUD_LLM_PII_MODEL_CHOICE,
+ custom_llm_instructions: str = "",
+ chosen_llm_entities: List[str] = None,
+ file_name: Optional[str] = None,
+ sheet_name: Optional[str] = None,
+ output_folder: Optional[str] = None,
+ **text_analyzer_kwargs,
+):
+ """
+ Conduct anonymisation of a dataframe using Presidio, AWS Comprehend, or LLM if chosen.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing text to be anonymised.
+ anon_strategy (str): The anonymisation strategy to apply (e.g., "replace with 'REDACTED'", "replace with ", "redact completely").
+ language (str): The language of the text for analysis (e.g., "en", "es").
+ chosen_redact_entities (List[str]): A list of entity types to redact using the local (Presidio) method.
+ in_allow_list (List[str], optional): A list of terms to explicitly allow and not redact. Defaults to an empty list.
+ in_deny_list (List[str], optional): A list of terms to explicitly deny and always redact. Defaults to an empty list.
+ max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of fuzzy spelling mistakes to tolerate for custom recognizers. Defaults to 0.
+ pii_identification_method (str, optional): The method for PII identification ("Local", "AWS Comprehend", or "LLM (AWS Bedrock)"). Defaults to "Local".
+ chosen_redact_comprehend_entities (List[str], optional): A list of entity types to redact using AWS Comprehend or LLM. Defaults to an empty list.
+ comprehend_query_number (int, optional): For AWS Comprehend, counter in units of 100 characters (1 unit = 100 characters, per AWS billing). For LLM, incremented per batch. Defaults to 0.
+ comprehend_client (botocore.client.BaseClient, optional): An initialized AWS Comprehend client. Defaults to an empty string.
+ custom_entities (List[str], optional): A list of custom entities to be recognized. Defaults to `custom_entities`.
+ nlp_analyser (AnalyzerEngine, optional): The Presidio AnalyzerEngine instance to use. Defaults to `nlp_analyser`.
+        do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to DO_INITIAL_TABULAR_DATA_CLEAN from config.
+        progress (Progress, optional): Gradio Progress object for tracking progress. Defaults to Progress(track_tqdm=True).
+ bedrock_runtime (optional): AWS Bedrock runtime client for LLM-based entity detection.
+ model_choice (str, optional): LLM model choice for entity detection. Defaults to CLOUD_LLM_PII_MODEL_CHOICE.
+ custom_llm_instructions (str, optional): Custom instructions for LLM entity detection. Defaults to empty string.
+ chosen_llm_entities (List[str], optional): List of entity types to detect using LLM. Defaults to None (uses chosen_redact_comprehend_entities).
+ file_name (Optional[str], optional): File name for logging purposes. Defaults to None.
+ output_folder (Optional[str], optional): Folder for LLM prompt/response logs. When None, uses OUTPUT_FOLDER from config. Pass the session output folder (e.g. from output_folder_textbox) so logs go to the same place as other outputs.
+ **text_analyzer_kwargs: Additional keyword arguments for text analyzer (e.g., temperature, max_tokens, inference_method).
+ """
+
+ print("Identifying personal information")
+ analyse_tic = time.perf_counter()
+
+ # LLM token counts (used when pii_identification_method is an LLM option)
+ llm_total_input_tokens = 0
+ llm_total_output_tokens = 0
+ llm_model_name = ""
+
+ # Initialize analyzer_results as an empty dictionary to store results by column
+ results_by_column = dict()
+ key_string = ""
+
+ # Handle both list (new Dropdown format) and DataFrame (legacy)
+ if isinstance(in_allow_list, list):
+ # Dropdown component returns a list directly
+ in_allow_list_flat = (
+ [str(item) for item in in_allow_list if item] if in_allow_list else list()
+ )
+ elif isinstance(in_allow_list, pd.DataFrame):
+ if not in_allow_list.empty:
+ in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
+ else:
+ in_allow_list_flat = list()
+ else:
+ in_allow_list_flat = list()
+
+ ### Language check - check if selected language packs exist
+ try:
+ if language != "en":
+ progress(0.1, desc=f"Loading spaCy model for {language}")
+
+ load_spacy_model(language)
+
+ except Exception as e:
+ out_message = f"Error downloading language packs for {language}: {e}"
+ print(out_message)
+ raise Exception(out_message)
+
+ # Try updating the supported languages for the spacy analyser
+ try:
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
+ # Check list of nlp_analyser recognisers and languages
+ if language != "en":
+ gr.Info(
+ f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}"
+ )
+
+ except Exception as e:
+ out_message = f"Error creating nlp_analyser for {language}: {e}"
+ print(out_message)
+ raise Exception(out_message)
+
+ # Handle both list (new Dropdown format) and DataFrame (legacy)
+ if isinstance(in_deny_list, list):
+ # Dropdown component returns a list directly
+ in_deny_list = (
+ [str(item) for item in in_deny_list if item] if in_deny_list else list()
+ )
+ # Sort the strings in order from the longest string to the shortest
+ in_deny_list = sorted(in_deny_list, key=len, reverse=True)
+ elif isinstance(in_deny_list, pd.DataFrame):
+ if not in_deny_list.empty:
+ in_deny_list = in_deny_list.iloc[:, 0].tolist()
+ else:
+ # Handle the case where the DataFrame is empty
+ in_deny_list = list() # or some default value
+
+ # Sort the strings in order from the longest string to the shortest
+ in_deny_list = sorted(in_deny_list, key=len, reverse=True)
+
+ if in_deny_list:
+ nlp_analyser.registry.remove_recognizer("CUSTOM")
+ new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+
+ nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(
+ supported_entities=["CUSTOM_FUZZY"],
+ custom_list=in_deny_list,
+            spelling_mistakes_max=max_fuzzy_spelling_mistakes_num,
+            search_whole_phrase=True,  # assumed: match deny-list terms as whole phrases
+ )
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
+
+ # analyzer = nlp_analyser #AnalyzerEngine()
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
+ anonymizer = (
+ AnonymizerEngine()
+ ) # conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
+ batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=anonymizer)
+ analyzer_results = list()
+
+ if do_initial_clean:
+ progress(0.2, desc="Cleaning text")
+ columns = list(df.columns)
+ max_workers = min(MAX_WORKERS, len(columns))
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ cleaned = list(
+ progress.tqdm(
+ executor.map(lambda col: (col, initial_clean(df[col])), columns),
+ total=len(columns),
+ desc="Cleaning text",
+ unit="Columns",
+ )
+ )
+ for col, cleaned_series in cleaned:
+ df[col] = cleaned_series
+
+ # DataFrame to dict
+ df_dict = df.to_dict(orient="list")
+
+ if pii_identification_method == "Local":
+
+ # Run Local (Presidio) analysis in parallel over columns
+ def _analyze_one_column_local(item):
+ column_name, texts = item
+ if not texts or (isinstance(texts, (list, tuple)) and len(texts) == 0):
+ return DictAnalyzerResult(
+ key=column_name, value=texts, recognizer_results=[]
+ )
+ if not isinstance(texts, (list, tuple)):
+ texts = [texts]
+ try:
+ results = analyze_iterator_custom(
+ batch_analyzer,
+ texts=texts,
+ language=language,
+ list_length=len(texts),
+ context=[column_name],
+ entities=chosen_redact_entities,
+ score_threshold=score_threshold,
+ return_decision_process=True,
+ allow_list=in_allow_list_flat,
+ )
+ return DictAnalyzerResult(
+ key=column_name, value=texts, recognizer_results=results
+ )
+ except Exception as e:
+ return (column_name, None, e)
+
+ local_tasks = list(df_dict.items())
+ max_workers = min(MAX_WORKERS, len(local_tasks)) if local_tasks else 1
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ local_results = list(
+ progress.tqdm(
+ executor.map(_analyze_one_column_local, local_tasks),
+ total=len(local_tasks),
+ desc="Analyzing text (Local PII).",
+ unit="columns",
+ )
+ )
+
+ for result in local_results:
+ if isinstance(result, tuple) and len(result) == 3 and result[2] is not None:
+ _, _, err = result
+ raise err
+ results_by_column[result.key] = result
+
+ # Convert the dictionary of results back to a list
+ analyzer_results = list(results_by_column.values())
+
+ # AWS Comprehend calls
+ elif pii_identification_method == "AWS Comprehend" and comprehend_client:
+
+ # Only run Local anonymisation for entities that are not covered by AWS Comprehend
+ if custom_entities:
+ custom_redact_entities = [
+ entity
+ for entity in chosen_redact_comprehend_entities
+ if entity in custom_entities
+ ]
+ if custom_redact_entities:
+ # Get results from analyze_dict
+ custom_results = analyze_dict(
+ batch_analyzer,
+ df_dict,
+ language=language,
+ entities=custom_redact_entities,
+ score_threshold=score_threshold,
+ return_decision_process=True,
+ allow_list=in_allow_list_flat,
+ )
+
+ # Initialize results_by_column with custom entity results
+ for result in custom_results:
+ results_by_column[result.key] = result
+
+ max_retries = 3
+ retry_delay = 3
+
+ # Build list of (column_name, text_idx, text_str) for all cells
+ comprehend_tasks = []
+ for column_name, texts in df_dict.items():
+ if column_name in results_by_column:
+ column_results = results_by_column[column_name]
+ else:
+ column_results = DictAnalyzerResult(
+ recognizer_results=[[] for _ in texts], key=column_name, value=texts
+ )
+ results_by_column[column_name] = column_results
+ for text_idx, text in enumerate(texts):
+ text_str = str(text) if text else ""
+ comprehend_tasks.append((column_name, text_idx, text_str))
+
+ def _run_comprehend_task(item):
+ column_name, text_idx, text_str = item
+ try:
+ recognizer_list, units = _comprehend_one_cell(
+ comprehend_client,
+ text_str,
+ language,
+ chosen_redact_comprehend_entities,
+ in_allow_list_flat,
+ max_retries=max_retries,
+ retry_delay=retry_delay,
+ )
+ return (column_name, text_idx, recognizer_list, units, None)
+ except Exception as e:
+ return (column_name, text_idx, [], 0, e)
+
+ max_workers = min(MAX_WORKERS, len(comprehend_tasks)) if comprehend_tasks else 1
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ completed = list(
+ progress.tqdm(
+ executor.map(_run_comprehend_task, comprehend_tasks),
+ total=len(comprehend_tasks),
+ desc="Querying AWS Comprehend service.",
+ unit="cells",
+ )
+ )
+
+ for column_name, text_idx, recognizer_list, units, err in completed:
+ if err is not None:
+ print(
+ f"AWS Comprehend calls failed for cell ({column_name}, {text_idx}) due to",
+ err,
+ )
+ raise err
+ comprehend_query_number += units
+ results_by_column[column_name].recognizer_results[
+ text_idx
+ ] = recognizer_list
+
+ # Convert the dictionary of results back to a list
+ analyzer_results = list(results_by_column.values())
+
+ elif (pii_identification_method == "AWS Comprehend") & (not comprehend_client):
+ raise ("Unable to redact, Comprehend connection details not found.")
+
+ # LLM-based entity detection
+ elif pii_identification_method == AWS_LLM_PII_OPTION:
+ if not bedrock_runtime and text_analyzer_kwargs.get("inference_method") not in [
+ "local",
+ "inference-server",
+ "azure-openai",
+ "gemini",
+ ]:
+ raise ValueError(
+ "bedrock_runtime is required when using LLM-based PII detection with AWS Bedrock"
+ )
+ # Set inference method to aws-bedrock if not already set
+ if text_analyzer_kwargs.get("inference_method") is None:
+ text_analyzer_kwargs["inference_method"] = "aws-bedrock"
+ # Set model choice if not already set
+ if text_analyzer_kwargs.get("model_choice") is None:
+ text_analyzer_kwargs["model_choice"] = (
+ model_choice or CLOUD_LLM_PII_MODEL_CHOICE
+ )
+ # Default chosen_llm_entities to chosen_redact_comprehend_entities if not provided
+ if chosen_llm_entities is None:
+ chosen_llm_entities = chosen_redact_comprehend_entities
+
+ elif pii_identification_method == INFERENCE_SERVER_PII_OPTION:
+ # LLM-based entity detection using inference server
+ from tools.config import (
+ INFERENCE_SERVER_API_URL,
+ )
+
+ # Set inference method to inference-server if not already set
+ if text_analyzer_kwargs.get("inference_method") is None:
+ text_analyzer_kwargs["inference_method"] = "inference-server"
+
+ # Set API URL if not already set
+ if text_analyzer_kwargs.get("api_url") is None:
+ text_analyzer_kwargs["api_url"] = INFERENCE_SERVER_API_URL
+
+ # Set model choice if not already set - use INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+ if text_analyzer_kwargs.get("model_choice") is None:
+ from tools.config import INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+
+ text_analyzer_kwargs["model_choice"] = INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+
+ # Use the same logic as AWS_LLM_PII_OPTION for the rest
+ # Default chosen_llm_entities to chosen_redact_comprehend_entities if not provided
+ if chosen_llm_entities is None:
+ chosen_llm_entities = chosen_redact_comprehend_entities
+
+ elif pii_identification_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION:
+ # LLM-based entity detection using local transformers models
+ # Set inference method to local if not already set
+ if text_analyzer_kwargs.get("inference_method") is None:
+ text_analyzer_kwargs["inference_method"] = "local"
+
+ # Set model choice if not already set - use LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ if text_analyzer_kwargs.get("model_choice") is None:
+ text_analyzer_kwargs["model_choice"] = (
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ )
+
+ # Use the same logic as AWS_LLM_PII_OPTION for the rest
+ # Default chosen_llm_entities to chosen_redact_comprehend_entities if not provided
+ if chosen_llm_entities is None:
+ chosen_llm_entities = chosen_redact_comprehend_entities
+
+ # Shared LLM column/cell detection for AWS Bedrock, Inference Server, and Local Transformers
+ if pii_identification_method in (
+ AWS_LLM_PII_OPTION,
+ INFERENCE_SERVER_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ ):
+ # Handle custom entities first (same as AWS Comprehend)
+ if custom_entities:
+ custom_redact_entities = [
+ entity for entity in chosen_llm_entities if entity in custom_entities
+ ]
+
+ if custom_redact_entities:
+ # Get valid language entities
+ valid_language_entities = nlp_analyser.registry.get_supported_entities(
+ languages=[language]
+ )
+ if "CUSTOM" not in valid_language_entities:
+ valid_language_entities.append("CUSTOM")
+ if "CUSTOM_FUZZY" not in valid_language_entities:
+ valid_language_entities.append("CUSTOM_FUZZY")
+
+ # Filter entities to only include those supported by the language
+ from tools.custom_image_analyser_engine import (
+ filter_entities_for_language,
+ )
+
+ language_supported_entities = filter_entities_for_language(
+ custom_redact_entities, valid_language_entities, language
+ )
+
+ if language_supported_entities:
+ custom_results = analyze_dict(
+ batch_analyzer,
+ df_dict,
+ language=language,
+ entities=language_supported_entities,
+ score_threshold=score_threshold,
+ return_decision_process=True,
+ allow_list=in_allow_list_flat,
+ )
+
+ # Initialize results_by_column with custom entity results
+ for result in custom_results:
+ results_by_column[result.key] = result
+
+ # Remove 'CUSTOM' and 'CUSTOM_VLM_*' entities from the chosen_llm_entities list
+ # CUSTOM_VLM_* entities are handled separately via VLM, not LLM
+ llm_chosen_redact_entities = [
+ entity
+ for entity in chosen_llm_entities
+ if entity != "CUSTOM" and not entity.startswith("CUSTOM_VLM_")
+ ]
+
+ # Validate: if no standard entities and no custom instructions, raise error
+ if not llm_chosen_redact_entities and (
+ not custom_llm_instructions or not custom_llm_instructions.strip()
+ ):
+ raise ValueError(
+ "No standard entities selected for LLM PII detection and no custom instructions provided. "
+ "Please select at least one entity type (excluding CUSTOM_VLM_* entities) or provide custom instructions."
+ )
+
+ # If no LLM entities to detect but custom instructions exist, still call LLM with custom instructions only
+ # If no entities and no custom instructions, the validation above will have raised an error
+ # So at this point, we either have entities OR custom instructions (or both)
+ max_retries = 3
+ retry_delay = 3
+
+ # Use model_choice from kwargs when set (e.g. by INFERENCE_SERVER or LOCAL_TRANSFORMERS branches)
+ effective_model_choice = text_analyzer_kwargs.get("model_choice", model_choice)
+ llm_total_input_tokens = 0
+ llm_total_output_tokens = 0
+ # Report the model actually used: upgraded to custom-instructions model when applicable
+ custom_instructions_model = (
+ CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE.strip()
+ if isinstance(CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE, str)
+ and CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE.strip()
+ else ""
+ )
+ if (
+ (custom_llm_instructions or "").strip()
+ and effective_model_choice == CLOUD_LLM_PII_MODEL_CHOICE
+ and custom_instructions_model
+ ):
+ llm_model_name = custom_instructions_model
+ else:
+ llm_model_name = effective_model_choice or ""
+ # Build list of (task_idx, column_name, text_idx, text_str) for non-empty cells
+ llm_tasks = []
+ for column_name, texts in df_dict.items():
+ if column_name in results_by_column:
+ column_results = results_by_column[column_name]
+ else:
+ column_results = DictAnalyzerResult(
+ recognizer_results=[[] for _ in texts],
+ key=column_name,
+ value=texts,
+ )
+ results_by_column[column_name] = column_results
+ for text_idx, text in enumerate(texts):
+ text_str = str(text) if text else ""
+ if not text_str.strip():
+ continue
+ llm_tasks.append((len(llm_tasks), column_name, text_idx, text_str))
+
+ def _run_llm_task(item):
+ task_idx, column_name, text_idx, text_str = item
+ for attempt in range(max_retries):
+ try:
+ entities, batch_input_tokens, batch_output_tokens = (
+ call_llm_for_entity_detection(
+ text=text_str,
+ entities_to_detect=llm_chosen_redact_entities,
+ language=language,
+ bedrock_runtime=bedrock_runtime,
+ model_choice=effective_model_choice,
+ temperature=text_analyzer_kwargs.get(
+ "temperature", LLM_TEMPERATURE
+ ),
+ max_tokens=text_analyzer_kwargs.get(
+ "max_tokens", LLM_MAX_NEW_TOKENS
+ ),
+ output_folder=(
+ output_folder
+ if output_folder is not None
+ else OUTPUT_FOLDER
+ ),
+ batch_number=task_idx + 1,
+ custom_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=None,
+ sheet_name=sheet_name,
+ column_name=column_name,
+ row_number=text_idx + 1,
+ inference_method=text_analyzer_kwargs.get(
+ "inference_method"
+ ),
+ client=text_analyzer_kwargs.get("client"),
+ client_config=text_analyzer_kwargs.get("client_config"),
+ api_url=text_analyzer_kwargs.get("api_url"),
+ )
+ )
+ return (
+ column_name,
+ text_idx,
+ text_str,
+ entities,
+ batch_input_tokens,
+ batch_output_tokens,
+ None,
+ )
+ except Exception as e:
+ if attempt == max_retries - 1:
+ return (column_name, text_idx, text_str, [], 0, 0, e)
+ time.sleep(retry_delay)
+ return (column_name, text_idx, text_str, [], 0, 0, None)
+
+ max_llm_workers = (
+ min(LLM_PII_MAX_CONCURRENT_REQUESTS, len(llm_tasks)) if llm_tasks else 1
+ )
+ with ThreadPoolExecutor(max_workers=max_llm_workers) as executor:
+ llm_results = list(
+ progress.tqdm(
+ executor.map(_run_llm_task, llm_tasks),
+ total=len(llm_tasks),
+ desc="Querying LLM service.",
+ unit="cells",
+ )
+ )
+
+ for (
+ column_name,
+ text_idx,
+ text_str,
+ entities,
+ batch_input_tokens,
+ batch_output_tokens,
+ err,
+ ) in llm_results:
+ if err is not None:
+ print(
+ f"LLM entity detection failed for text: {text_str[:100]}... due to",
+ err,
+ )
+ raise err
+ llm_total_input_tokens += batch_input_tokens
+ llm_total_output_tokens += batch_output_tokens
+ column_results = results_by_column[column_name]
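+ # Each detected entity is expected as a Comprehend-style dict, e.g. (illustrative values):
+ # {"Type": "PERSON", "BeginOffset": 5, "EndOffset": 9, "Score": 0.92, "Text": "John"}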
+ for entity in entities:
+ if not isinstance(entity, dict):
+ continue
+ entity_type = entity.get("Type", "")
+ begin_offset = entity.get("BeginOffset", 0)
+ end_offset = entity.get("EndOffset", 0)
+ entity_text = entity.get("Text", text_str[begin_offset:end_offset])
+ if in_allow_list_flat:
+ allow_list_normalized = [
+ item.strip().lower() for item in in_allow_list_flat if item
+ ]
+ if entity_text.strip().lower() in allow_list_normalized:
+ continue
+ if (
+ llm_chosen_redact_entities
+ and entity_type not in llm_chosen_redact_entities
+ ):
+ if not (
+ custom_llm_instructions and str(custom_llm_instructions).strip()
+ ):
+ continue
+ recognizer_result = RecognizerResult(
+ entity_type=entity_type,
+ start=begin_offset,
+ end=end_offset,
+ score=entity.get("Score", 0.0),
+ )
+ column_results.recognizer_results[text_idx].append(recognizer_result)
+
+ # Convert the dictionary of results back to a list
+ analyzer_results = list(results_by_column.values())
+
+ # Generate the decision-process log from the analyzer results
+ decision_process_output_str, decision_process_output_df = generate_log(
+ analyzer_results, df_dict
+ )
+
+ analyse_toc = time.perf_counter()
+ analyse_time_out = (
+ f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
+ )
+ print(analyse_time_out)
+
+ # Set up the anonymization configuration WITHOUT DATE_TIME
+ simple_replace_config = {
+ "DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})
+ }
+ replace_config = {"DEFAULT": OperatorConfig("replace")}
+ redact_config = {"DEFAULT": OperatorConfig("redact")}
+ hash_config = {"DEFAULT": OperatorConfig("hash")}
+ mask_config = {
+ "DEFAULT": OperatorConfig(
+ "mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": True}
+ )
+ }
+ people_encrypt_config = {
+ "PERSON": OperatorConfig("encrypt", {"key": key_string})
+ } # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
+ fake_first_name_config = {
+ "PERSON": OperatorConfig("custom", {"lambda": fake_first_name})
+ }
+
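+ # Map the user-facing anonymisation strategy label to one of the operator configs above;
+ # several labels are aliases for the same config (e.g. "redact completely" and "redact").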
+ if anon_strategy == "replace with 'REDACTED'":
+ chosen_mask_config = simple_replace_config
+ elif anon_strategy == "replace_redacted":
+ chosen_mask_config = simple_replace_config
+ elif anon_strategy == "replace with ":
+ chosen_mask_config = replace_config
+ elif anon_strategy == "entity_type":
+ chosen_mask_config = replace_config
+ elif anon_strategy == "redact completely":
+ chosen_mask_config = redact_config
+ elif anon_strategy == "redact":
+ chosen_mask_config = redact_config
+ elif anon_strategy == "hash":
+ chosen_mask_config = hash_config
+ elif anon_strategy == "mask":
+ chosen_mask_config = mask_config
+ elif anon_strategy == "encrypt":
+ chosen_mask_config = people_encrypt_config
+ key = secrets.token_bytes(16) # 128 bits = 16 bytes
+ key_string = base64.b64encode(key).decode("utf-8")
+
+ # Now inject the key into the operator config
+ for entity, operator in chosen_mask_config.items():
+ if operator.operator_name == "encrypt":
+ operator.params = {"key": key_string}
+ elif anon_strategy == "fake_first_name":
+ chosen_mask_config = fake_first_name_config
+ else:
+ print("Anonymisation strategy not found. Redacting completely by default.")
+ chosen_mask_config = redact_config # Redact completely by default
+
+ combined_config = {**chosen_mask_config}
+
+ anonymizer_results = batch_anonymizer.anonymize_dict(
+ analyzer_results, operators=combined_config
+ )
+
+ scrubbed_df = pd.DataFrame(anonymizer_results)
+
+ return (
+ scrubbed_df,
+ key_string,
+ decision_process_output_str,
+ comprehend_query_number,
+ decision_process_output_df,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ llm_model_name,
+ )
diff --git a/tools/file_conversion.py b/tools/file_conversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..e10b74792a9ba13b1d500ca9fa6af7e7e5068e05
--- /dev/null
+++ b/tools/file_conversion.py
@@ -0,0 +1,3848 @@
+import json
+import os
+import random
+import re
+import shutil
+import string
+import threading
+import time
+import zipfile
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+import gradio as gr
+import numpy as np
+import pandas as pd
+import polars as pl
+import pymupdf
+from gradio import Progress
+from PIL import Image, ImageFile
+from pymupdf import Document, Page
+from scipy.spatial import cKDTree
+from tqdm import tqdm
+
+from tools.config import (
+ COMPRESS_REDACTED_PDF,
+ IMAGES_DPI,
+ INPUT_FOLDER,
+ LOAD_REDACTION_ANNOTATIONS_FROM_PDF,
+ LOAD_TRUNCATED_IMAGES,
+ LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+ MAX_IMAGE_PIXELS,
+ MAX_SIMULTANEOUS_FILES,
+ MAX_WORKERS,
+ OUTPUT_FOLDER,
+ SELECTABLE_TEXT_EXTRACT_OPTION,
+ TEXTRACT_TEXT_EXTRACT_OPTION,
+)
+from tools.helper_functions import get_file_name_without_type, read_file
+from tools.secure_path_utils import secure_file_read, secure_join
+from tools.secure_regex_utils import safe_extract_page_number_from_path
+
+IMAGE_NUM_REGEX = re.compile(r"_(\d+)\.png$")
+
+pd.set_option("future.no_silent_downcasting", True)
+
+image_dpi = float(IMAGES_DPI)
+if not MAX_IMAGE_PIXELS:
+ Image.MAX_IMAGE_PIXELS = None
+else:
+ Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
+
+ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES
+
+
+_PDF_DOC_CACHE = threading.local()
+
+
+def _get_threadlocal_pymupdf_doc(pdf_path: str) -> Document:
+ """
+ Cache a PyMuPDF Document per thread to avoid reopening the same PDF for each page.
+
+ ThreadPoolExecutor threads are long-lived for the duration of the pool, so this
+ cuts overhead significantly when processing many pages.
+ """
+ cache = getattr(_PDF_DOC_CACHE, "docs", None)
+ if cache is None:
+ cache = {}
+ _PDF_DOC_CACHE.docs = cache
+ doc = cache.get(pdf_path)
+ if doc is None:
+ doc = pymupdf.open(pdf_path)
+ cache[pdf_path] = doc
+ return doc
+
+
+def _render_pdf_page_to_png_pymupdf_mediabox(
+ pdf_path: str,
+ page_num: int,
+ out_path: str,
+ dpi: float,
+) -> Image.Image:
+ """
+ Render a single PDF page to a grayscale PNG using PyMuPDF, ensuring the full MediaBox is rendered.
+
+ PyMuPDF's Page.get_pixmap() respects the CropBox; to render the MediaBox we
+ temporarily set CropBox=MediaBox and restore it afterwards.
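+
+ The output PNG is written to ``out_path`` and also returned as a PIL Image so callers
+ (e.g. process_single_page_for_image_conversion) can resize it further if needed.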
+ """
+ doc = _get_threadlocal_pymupdf_doc(pdf_path)
+ page = doc.load_page(page_num)
+
+ old_crop = page.cropbox
+ old_rot = page.rotation
+ try:
+ page.set_cropbox(page.mediabox)
+ # Preserve the PDF's intrinsic rotation (e.g. 180deg pages).
+ # Downstream coordinate logic assumes the rendered image matches PyMuPDF's display space.
+ page.set_rotation(old_rot)
+ pix = page.get_pixmap(
+ dpi=int(dpi) if dpi is not None else None,
+ colorspace=pymupdf.csGRAY,
+ alpha=False,
+ annots=False,
+ )
+ # Fast path: write PNG via MuPDF, then load with PIL for downstream resizing.
+ pix.save(out_path)
+ # Embed DPI in PNG (MuPDF write does not set pHYs); matches IMAGES_DPI render scale.
+ _dpi = max(1, int(round(float(dpi))))
+ _pil = Image.open(out_path)
+ _pil.save(out_path, format="PNG", dpi=(_dpi, _dpi))
+ return _pil
+ finally:
+ page.set_cropbox(old_crop)
+ if old_rot != 0:
+ page.set_rotation(old_rot)
+
+
+def is_pdf_or_image(filename):
+ """
+ Check if a file name is a PDF or an image file.
+
+ Args:
+ filename (str): The name of the file.
+
+ Returns:
+ bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
+ """
+ if (
+ filename.lower().endswith(".pdf")
+ or filename.lower().endswith(".jpg")
+ or filename.lower().endswith(".jpeg")
+ or filename.lower().endswith(".png")
+ ):
+ output = True
+ else:
+ output = False
+ return output
+
+
+def is_pdf(filename):
+ """
+ Check if a file name is a PDF.
+
+ Args:
+ filename (str): The name of the file.
+
+ Returns:
+ bool: True if the file name ends with ".pdf", False otherwise.
+ """
+ return filename.lower().endswith(".pdf")
+
+
+def check_image_size_and_reduce(out_path: str, image: Image):
+ """
+ Check if a given image size is above around 4.5mb, and reduce size if necessary.
+ 5mb is the maximum possible to submit to AWS Textract.
+
+ Args:
+ out_path (str): The file path where the image is currently saved and will be saved after resizing.
+ image (Image): The PIL Image object to be checked and potentially resized.
+ """
+
+ all_img_details = list()
+ page_num = 0
+
+ # Check file size and resize if necessary
+ max_size = 4.5 * 1024 * 1024 # 4.5 MB in bytes, a safety margin below the 5 MB Textract limit
+ file_size = os.path.getsize(out_path)
+
+ width = image.width
+ height = image.height
+
+ # Resize images if they are too big
+ if file_size > max_size:
+ # Start with the original image size
+
+ print(f"Image size before {width}x{height}, original file_size: {file_size}")
+
+ while file_size > max_size:
+ # Reduce the size by a factor (e.g., 50% of the current size)
+ new_width = int(width * 0.5)
+ new_height = int(height * 0.5)
+ image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+ # Save the resized image
+ _dp = max(1, int(round(float(image_dpi))))
+ image.save(out_path, format="PNG", optimize=True, dpi=(_dp, _dp))
+
+ # Update the current dimensions and file size so repeated passes keep shrinking the image
+ width, height = new_width, new_height
+ file_size = os.path.getsize(out_path)
+ print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
+ else:
+ new_width = width
+ new_height = height
+
+ all_img_details.append((page_num, image, new_width, new_height))
+
+ return image, new_width, new_height, all_img_details, out_path
+
+
+def process_single_page_for_image_conversion(
+ pdf_path: str,
+ page_num: int,
+ image_dpi: float = image_dpi,
+ create_images: bool = True,
+ input_folder: str = INPUT_FOLDER,
+) -> tuple[int, str, float, float]:
+ """
+ Processes a single page of a PDF or image file for image conversion,
+ saving it as a PNG and optionally resizing it if too large.
+
+ Args:
+ pdf_path (str): The path to the input PDF or image file.
+ page_num (int): The 0-indexed page number to process.
+ image_dpi (float, optional): The DPI to use for PDF to image conversion. Defaults to image_dpi from config.
+ create_images (bool, optional): Whether to create and save the image. Defaults to True.
+ input_folder (str, optional): The folder where the converted images will be saved. Defaults to INPUT_FOLDER from config.
+
+ Returns:
+ tuple[int, str, float, float]: A tuple containing:
+ - The processed page number.
+ - The path to the saved output image.
+ - The width of the processed image.
+ - The height of the processed image.
+ """
+
+ out_path_placeholder = "placeholder_image_" + str(page_num) + ".png"
+
+ if create_images is True:
+ try:
+ # Construct the full output directory path
+ # Normalize input_folder to ensure it's used as-is without sanitization
+ if os.path.isabs(input_folder):
+ image_output_dir = Path(input_folder).resolve()
+ else:
+ # Join with cwd, but ensure input_folder is used as-is
+ base_dir = Path(os.getcwd()).resolve()
+ # Use Path.joinpath which doesn't sanitize folder names
+ image_output_dir = base_dir / input_folder
+ image_output_dir = image_output_dir.resolve()
+
+ # Ensure the directory exists
+ image_output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Construct the output file path using secure_path_join for the filename only
+ from tools.secure_path_utils import secure_path_join
+
+ out_path = secure_path_join(
+ image_output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png"
+ )
+ # Convert Path object to string immediately to avoid downstream type issues
+ out_path = str(out_path)
+
+ if os.path.exists(out_path):
+ # Load existing image
+ image = Image.open(out_path)
+ elif pdf_path.lower().endswith(".pdf"):
+ # Convert PDF page to image (MediaBox) using PyMuPDF for speed.
+ # We render directly as grayscale and save as PNG.
+ image = _render_pdf_page_to_png_pymupdf_mediabox(
+ pdf_path=pdf_path,
+ page_num=page_num,
+ out_path=out_path,
+ dpi=image_dpi,
+ )
+ elif (
+ pdf_path.lower().endswith(".jpg")
+ or pdf_path.lower().endswith(".png")
+ or pdf_path.lower().endswith(".jpeg")
+ ):
+ image = Image.open(pdf_path)
+ _dp = max(1, int(round(float(image_dpi))))
+ image.save(out_path, format="PNG", dpi=(_dp, _dp))
+ else:
+ raise Warning("Could not create image.")
+
+ width, height = image.size
+
+ # Check if image size too large and reduce if necessary
+ # print("Checking size of image and reducing if necessary.")
+ image, width, height, all_img_details, img_path = (
+ check_image_size_and_reduce(out_path, image)
+ )
+
+ return page_num, out_path, width, height
+
+ except Exception as e:
+
+ print(f"Error processing page {page_num + 1}: {e}")
+ return page_num, out_path_placeholder, pd.NA, pd.NA
+ else:
+ # print("Not creating image for page", page_num)
+ return page_num, out_path_placeholder, pd.NA, pd.NA
+
+
+def convert_pdf_to_images(
+ pdf_path: str,
+ prepare_for_review: bool = False,
+ page_min: int = 0,
+ page_max: int = 0,
+ create_images: bool = True,
+ image_dpi: float = image_dpi,
+ num_threads: Optional[int] = None,
+ input_folder: str = INPUT_FOLDER,
+ progress: Progress = Progress(track_tqdm=True),
+ page_numbers: Optional[List[int]] = None,
+):
+ """
+ Converts a PDF document into a series of images, processing each page concurrently.
+
+ Args:
+ pdf_path (str): The path to the PDF file to convert.
+ prepare_for_review (bool, optional): If True, only the first page is processed (feature not currently used). Defaults to False.
+ page_min (int, optional): The starting page number (0-indexed) for conversion. If 0, uses the first page. Defaults to 0.
+ page_max (int, optional): The ending page number (exclusive, 0-indexed) for conversion. If 0, uses the last page of the document. Defaults to 0.
+ create_images (bool, optional): If True, images are created and saved to disk. Defaults to True.
+ image_dpi (float, optional): The DPI (dots per inch) to use for converting PDF pages to images. Defaults to the global `image_dpi`.
+ num_threads (int, optional): The number of threads to use for concurrent page processing. Defaults to MAX_WORKERS from config/env.
+ input_folder (str, optional): The base input folder, used for determining output paths. Defaults to `INPUT_FOLDER`.
+ page_numbers (list, optional): If provided, only these 0-indexed page numbers are converted; page_min/page_max are ignored.
+
+ Returns:
+ list: A list of tuples, where each tuple contains (page_num, image_path, width, height) for successfully processed pages.
+ For failed pages, it returns (page_num, placeholder_path, pd.NA, pd.NA).
+ """
+ if num_threads is None:
+ num_threads = MAX_WORKERS
+
+ # Page count via PyMuPDF (faster + avoids Poppler dependency for this step)
+ try:
+ _count_doc = pymupdf.open(pdf_path)
+ page_count = len(_count_doc)
+ finally:
+ try:
+ _count_doc.close()
+ except Exception:
+ pass
+
+ # If preparing for review, just load the first page (not currently used)
+ if prepare_for_review is True:
+ page_min = 0
+ page_max = page_count
+ page_numbers = None
+
+ if page_numbers is not None:
+ pages_to_convert = sorted(
+ set(int(p) for p in page_numbers if 0 <= p < page_count)
+ )
+ total_pages = len(pages_to_convert)
+ if total_pages == 0:
+ return [], [], [], []
+ print(f"Creating images for {total_pages} page(s) (EFFICIENT_OCR).")
+ else:
+ print(f"Creating images. Number of pages in PDF: {page_count}")
+ # Handle special cases for page range
+ if page_min == 0:
+ page_min = 0
+ else:
+ page_min = page_min - 1
+ if page_max == 0:
+ page_max = page_count
+ pages_to_convert = list(range(page_min, page_max))
+ total_pages = len(pages_to_convert)
+
+ progress(0.1, desc="Creating images")
+
+ results = list()
+ with ThreadPoolExecutor(max_workers=num_threads) as executor:
+ futures = list()
+ for page_num in pages_to_convert:
+ futures.append(
+ executor.submit(
+ process_single_page_for_image_conversion,
+ pdf_path,
+ page_num,
+ image_dpi,
+ create_images=create_images,
+ input_folder=input_folder,
+ )
+ )
+
+ completed = 0
+ # Throttle Gradio updates to roughly every 2% of pages, and at least every 10 pages, so the UI stays responsive
+ update_interval = max(1, min(total_pages // 50, 10))
+ for future in tqdm(
+ as_completed(futures),
+ total=len(futures),
+ unit="pages",
+ desc="Converting pages to image",
+ ):
+ page_num, img_path, width, height = future.result()
+ completed += 1
+ # Report progress to Gradio so the upload component shows page-by-page progress
+ if completed % update_interval == 0 or completed == total_pages:
+ progress(
+ 0.1 + 0.8 * (completed / total_pages),
+ desc="Converting pages to image",
+ )
+ if img_path:
+ results.append((page_num, img_path, width, height))
+ else:
+ print(f"Page {page_num + 1} failed to process.")
+ results.append(
+ (
+ page_num,
+ "placeholder_image_" + str(page_num) + ".png",
+ pd.NA,
+ pd.NA,
+ )
+ )
+
+ # Sort results by page number
+ progress(0.95, desc="Loading images")
+ results.sort(key=lambda x: x[0])
+ images = [result[1] for result in results]
+ widths = [result[2] for result in results]
+ heights = [result[3] for result in results]
+
+ print("PDF has been converted to images.")
+ return images, widths, heights, results
+
+
+# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
+def process_file_for_image_creation(
+ file_path: str,
+ prepare_for_review: bool = False,
+ input_folder: str = INPUT_FOLDER,
+ create_images: bool = True,
+ page_min: int = 0,
+ page_max: int = 0,
+ progress: Progress = Progress(track_tqdm=True),
+):
+ """
+ Processes a given file path, determining if it's an image or a PDF,
+ and then converts it into a list of image paths, along with their dimensions.
+
+ Args:
+ file_path (str): The path to the file (image or PDF) to be processed.
+ prepare_for_review (bool, optional): If True, prepares the PDF for review
+ (e.g., by converting pages to images). Defaults to False.
+ input_folder (str, optional): The folder where input files are located. Defaults to INPUT_FOLDER.
+ create_images (bool, optional): If True, images will be created from PDF pages.
+ If False, only metadata will be extracted. Defaults to True.
+ page_min (int, optional): The minimum page number to process (0-indexed). If 0, uses the first page. Defaults to 0.
+ page_max (int, optional): The maximum page number to process (0-indexed). If 0, uses the last page of the document. Defaults to 0.
+ progress (Progress, optional): The progress object to update. Defaults to a Progress object with track_tqdm=True.
+ """
+ # Get the file extension
+ file_extension = os.path.splitext(file_path)[1].lower()
+
+ # Check if the file is an image type
+ if file_extension in [".jpg", ".jpeg", ".png"]:
+ print(f"{file_path} is an image file.")
+ progress(0.1, desc="Processing image file")
+ # Perform image processing here
+ img_object = [file_path] # [Image.open(file_path)]
+
+ # Load the image from the file path. Check whether it is bigger than 4.5 MB and reduce it if needed (the Textract limit is 5 MB)
+ image = Image.open(file_path)
+ img_object, image_sizes_width, image_sizes_height, all_img_details, img_path = (
+ check_image_size_and_reduce(file_path, image)
+ )
+
+ if not isinstance(image_sizes_width, list):
+ img_path = [img_path]
+ image_sizes_width = [image_sizes_width]
+ image_sizes_height = [image_sizes_height]
+ all_img_details = [all_img_details]
+
+ # Check if the file is a PDF
+ elif file_extension == ".pdf":
+
+ # Run your function for processing PDF files here
+ img_path, image_sizes_width, image_sizes_height, all_img_details = (
+ convert_pdf_to_images(
+ file_path,
+ prepare_for_review,
+ page_min=page_min,
+ page_max=page_max,
+ input_folder=input_folder,
+ create_images=create_images,
+ progress=progress,
+ )
+ )
+
+ else:
+ print(f"{file_path} is not an image or PDF file.")
+ img_path = list()
+ image_sizes_width = list()
+ image_sizes_height = list()
+ all_img_details = list()
+
+ return img_path, image_sizes_width, image_sizes_height, all_img_details
+
+
+def _process_one_input_file(
+ file: Any,
+ source_document_only: bool,
+ source_document_extensions: tuple,
+) -> Tuple[str, str, str, bool, bool, int]:
+ """
+ Process a single file for get_input_file_names; safe to run in a thread.
+ Returns (file_path_without_ext, file_extension, file_path, acceptable, is_source, page_count).
+ """
+ file_path = file if isinstance(file, str) else file.name
+ file_path_without_ext = get_file_name_without_type(file_path)
+ file_path_without_ext_lower = (file_path_without_ext or "").lower()
+ file_extension = os.path.splitext(file_path)[1].lower()
+ is_excluded_name = (
+ "review_file" in file_path_without_ext_lower
+ or "ocr_output" in file_path_without_ext_lower
+ or "ocr_results_with_words" in file_path_without_ext_lower
+ )
+ acceptable = (
+ file_extension
+ in (".jpg", ".jpeg", ".png", ".pdf", ".xlsx", ".csv", ".parquet", ".docx")
+ and not is_excluded_name
+ )
+ if file_extension == ".pdf":
+ try:
+ pdf_document = pymupdf.open(file_path)
+ page_count = pdf_document.page_count
+ pdf_document.close()
+ except Exception:
+ page_count = 1
+ else:
+ page_count = 1
+ is_source = not source_document_only or file_extension in source_document_extensions
+ return (
+ file_path_without_ext,
+ file_extension,
+ file_path,
+ acceptable,
+ is_source,
+ page_count,
+ )
+
+
+def get_input_file_names(
+ file_input: List[str],
+ source_document_only: bool = False,
+):
+ """
+ Get list of input files to report to logs.
+
+ When source_document_only is True (e.g. for document redaction / review tab),
+ full_file_name is only set for source documents (PDF, image, or Word), never
+ for outputs like review CSVs or OCR CSVs. This keeps doc_full_file_name_textbox
+ referring to the document being redacted (PDF or image).
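+
+ Returns (illustrative values): a tuple of (comma-joined relevant file names, last
+ acceptable file name with extension, full path of the last source document, list of
+ relevant file names, total PDF page count), e.g.
+ ("report", "report.pdf", "/input/report.pdf", ["report"], 12).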
+ """
+ all_relevant_files = list()
+ file_name_with_extension = ""
+ full_file_name = ""
+ total_pdf_page_count = 0
+ source_document_extensions = (".pdf", ".jpg", ".jpeg", ".png", ".docx")
+
+ if isinstance(file_input, dict):
+ file_input = os.path.abspath(file_input["name"])
+
+ if isinstance(file_input, str):
+ file_input_list = [file_input]
+ else:
+ file_input_list = file_input
+
+ if not file_input_list:
+ return (
+ ", ".join(all_relevant_files),
+ file_name_with_extension,
+ full_file_name,
+ all_relevant_files,
+ total_pdf_page_count,
+ )
+
+ max_workers = min(MAX_WORKERS, len(file_input_list))
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ results = list(
+ executor.map(
+ lambda f: _process_one_input_file(
+ f, source_document_only, source_document_extensions
+ ),
+ file_input_list,
+ )
+ )
+
+ for (
+ file_path_without_ext,
+ file_extension,
+ file_path,
+ acceptable,
+ is_source,
+ page_count,
+ ) in results:
+ total_pdf_page_count += page_count
+ if acceptable:
+ all_relevant_files.append(file_path_without_ext)
+ file_name_with_extension = file_path_without_ext + file_extension
+ if is_source:
+ full_file_name = file_path
+
+ all_relevant_files_str = ", ".join(all_relevant_files)
+ print("file_name_with_extension on document upload:", file_name_with_extension)
+ return (
+ all_relevant_files_str,
+ file_name_with_extension,
+ full_file_name,
+ all_relevant_files,
+ total_pdf_page_count,
+ )
+
+
+def get_document_file_names(file_input: List[str]):
+ """
+ Same as get_input_file_names but with source_document_only=True, so the
+ returned full_file_name is only ever a PDF, image, or Word doc (the document
+ being redacted), never a review CSV or OCR output. Use this for flows that
+ update doc_full_file_name_textbox.
+ """
+ return get_input_file_names(file_input, source_document_only=True)
+
+
+def convert_pymupdf_to_image_coords(
+ pymupdf_page: Page,
+ x1: float,
+ y1: float,
+ x2: float,
+ y2: float,
+ image: Image = None,
+ image_dimensions: dict = dict(),
+):
+ """
+ Converts bounding box coordinates from PyMuPDF page format to image coordinates.
+
+ PyMuPDF uses coordinates relative to ``page.rect`` (the visible area). The
+ rectangle is often **normalised** so its top-left is (0, 0) even when the CropBox
+ is inset in the MediaBox, so the inset must be taken from ``page.cropbox`` vs
+ ``page.mediabox`` (same as ``off_x`` / ``off_y`` in
+ ``process_page_to_structured_ocr_pymupdf``). Review/annotation images are rendered
+ from the full MediaBox (see ``_render_pdf_page_to_png_pymupdf_mediabox``), so we
+ shift rect-local points to MediaBox-local, then scale by image_size /
+ mediabox_size. This replaces the old symmetric (mediabox−rect)/2 heuristic, which
+ was wrong for asymmetric crops.
+
+ Args:
+ pymupdf_page (Page): The PyMuPDF page object from which the coordinates originate.
+ x1 (float): The x-coordinate of the top-left corner in PyMuPDF page units.
+ y1 (float): The y-coordinate of the top-left corner in PyMuPDF page units.
+ x2 (float): The x-coordinate of the bottom-right corner in PyMuPDF page units.
+ y2 (float): The y-coordinate of the bottom-right corner in PyMuPDF page units.
+ image (Image, optional): A PIL Image object. If provided, its dimensions
+ are used as the target image dimensions. Defaults to None.
+ image_dimensions (dict, optional): A dictionary containing 'image_width' and
+ 'image_height'. Used when 'image' is None. Defaults to an empty dictionary.
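+
+ Illustrative example (hypothetical values): for a 595x842 pt MediaBox, a CropBox inset
+ of (10, 20) pt from the MediaBox origin, and a 1190x1684 px rendered image, the scale
+ factors are sx = sy = 2, so a rect-local point (100, 50) maps to
+ ((100 + 10) * 2, (50 + 20) * 2) = (220, 140) in image pixels.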
+ """
+ mediabox = pymupdf_page.mediabox
+ cropbox = pymupdf_page.cropbox
+ mediabox_width = mediabox.width
+ mediabox_height = mediabox.height
+
+ if mediabox_width <= 0 or mediabox_height <= 0:
+ return x1, y1, x2, y2
+
+ if image:
+ image_page_width, image_page_height = image.size
+ elif image_dimensions:
+ image_page_width, image_page_height = (
+ image_dimensions["image_width"],
+ image_dimensions["image_height"],
+ )
+ else:
+ image_page_width, image_page_height = mediabox_width, mediabox_height
+
+ sx = image_page_width / mediabox_width
+ sy = image_page_height / mediabox_height
+
+ # Rect-local → MediaBox-local: use cropbox vs mediabox (rect may be normalised to
+ # origin 0,0 while cropbox keeps PDF placement).
+ dx = cropbox.x0 - mediabox.x0
+ dy = cropbox.y0 - mediabox.y0
+
+ x1_image = (x1 + dx) * sx
+ x2_image = (x2 + dx) * sx
+ y1_image = (y1 + dy) * sy
+ y2_image = (y2 + dy) * sy
+
+ return x1_image, y1_image, x2_image, y2_image
+
+
+def create_page_size_objects(
+ pymupdf_doc: Document,
+ image_sizes_width: List[float],
+ image_sizes_height: List[float],
+ image_file_paths: List[str],
+ page_min: int = 0,
+ page_max: int = 0,
+):
+ """
+ Creates page size objects for a PyMuPDF document.
+
+ Creates entries for ALL pages in the document. Pages that were processed for image creation
+ will have actual image paths and dimensions. Pages that were not processed will have
+ placeholder image paths and no image dimensions.
+
+ Args:
+ pymupdf_doc (Document): The PyMuPDF document object.
+ image_sizes_width (List[float]): List of image widths for processed pages.
+ image_sizes_height (List[float]): List of image heights for processed pages.
+ image_file_paths (List[str]): List of image file paths for processed pages.
+ page_min (int, optional): The minimum page number that was processed (0-indexed). If 0, uses the first page. Defaults to 0.
+ page_max (int, optional): The maximum page number that was processed (0-indexed). If 0, uses the last page of the document. Defaults to 0.
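+
+ Returns:
+ Tuple of (page_sizes, original_cropboxes). Each page_sizes entry is a dict such as
+ (illustrative values): {"page": 1, "mediabox_width": 595.0, "mediabox_height": 842.0,
+ "cropbox_width": 595.0, "cropbox_height": 842.0, "original_cropbox": <Rect>,
+ "cropbox_x_offset": 0.0, "cropbox_y_offset_from_top": 0.0,
+ "image_path": "doc.pdf_0.png", "image_width": 1190, "image_height": 1684}.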
+ """
+ page_sizes = list()
+ original_cropboxes = list()
+
+ # Handle special cases for page range
+ # If page_min is 0, use the first page (0-indexed)
+ if page_min == 0:
+ page_min = 0 # First page is 0-indexed
+ else:
+ page_min = page_min - 1
+
+ # If page_max is 0, use the last page of the document
+ if page_max == 0:
+ page_max = len(pymupdf_doc)
+
+ # Process ALL pages in the document, not just the ones with images
+ for page_no in range(len(pymupdf_doc)):
+ reported_page_no = page_no + 1
+ pymupdf_page = pymupdf_doc.load_page(page_no)
+ original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
+
+ # Check if this page was processed for image creation
+ is_page_in_range = page_min <= page_no < page_max
+ image_index = page_no - page_min if is_page_in_range else None
+
+ # Create a page_sizes_object for every page
+ out_page_image_sizes = {
+ "page": reported_page_no,
+ "mediabox_width": pymupdf_page.mediabox.width,
+ "mediabox_height": pymupdf_page.mediabox.height,
+ "cropbox_width": pymupdf_page.cropbox.width,
+ "cropbox_height": pymupdf_page.cropbox.height,
+ "original_cropbox": original_cropboxes[-1],
+ }
+
+ # cropbox_x_offset: Distance from MediaBox left edge to CropBox left edge
+ # This is simply the difference in their x0 coordinates.
+ out_page_image_sizes["cropbox_x_offset"] = (
+ pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
+ )
+
+ # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
+ out_page_image_sizes["cropbox_y_offset_from_top"] = (
+ pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
+ )
+
+ # Set image path and dimensions based on whether this page was processed
+ if (
+ is_page_in_range
+ and image_index is not None
+ and image_index < len(image_file_paths)
+ ):
+ # This page was processed for image creation
+ out_page_image_sizes["image_path"] = image_file_paths[image_index]
+
+ # Add image dimensions if available
+ if (
+ image_sizes_width
+ and image_sizes_height
+ and image_index < len(image_sizes_width)
+ and image_index < len(image_sizes_height)
+ ):
+ out_page_image_sizes["image_width"] = image_sizes_width[image_index]
+ out_page_image_sizes["image_height"] = image_sizes_height[image_index]
+ else:
+ # This page was not processed for image creation - use placeholder
+ out_page_image_sizes["image_path"] = f"image_placeholder_{page_no}.png"
+ # No image dimensions for placeholder pages
+
+ page_sizes.append(out_page_image_sizes)
+
+ return page_sizes, original_cropboxes
+
+
+def prepare_images_for_pages(
+ file_path: str,
+ pages_1based: List[int],
+ input_folder: str,
+ pymupdf_doc: Document,
+ page_sizes: List[dict],
+ progress: Progress = Progress(track_tqdm=True),
+) -> Tuple[List[str], List[dict]]:
+ """
+ Create images only for the given pages (e.g. EFFICIENT_OCR pages that need OCR).
+ Updates page_sizes in place and returns a full-length pdf_image_file_paths list
+ (real paths only for the requested pages, empty string for others).
+
+ Args:
+ file_path: Path to the PDF.
+ pages_1based: 1-based page numbers to convert to images.
+ input_folder: Folder used for image output paths.
+ pymupdf_doc: Open PyMuPDF document (used for page count).
+ page_sizes: List of page size dicts (one per page), updated in place.
+ progress: Progress callback.
+
+ Returns:
+ (pdf_image_file_paths, page_sizes) where pdf_image_file_paths has length
+ len(pymupdf_doc) with real path at index (p-1) for each p in pages_1based.
+ """
+ if not pages_1based:
+ return [""] * len(pymupdf_doc), page_sizes
+
+ page_numbers_0based = [p - 1 for p in pages_1based if 1 <= p <= len(pymupdf_doc)]
+ if not page_numbers_0based:
+ return [""] * len(pymupdf_doc), page_sizes
+
+ _, _, _, results = convert_pdf_to_images(
+ file_path,
+ prepare_for_review=False,
+ create_images=True,
+ input_folder=input_folder,
+ progress=progress,
+ page_numbers=page_numbers_0based,
+ )
+
+ num_pages = len(pymupdf_doc)
+ pdf_image_file_paths = [""] * num_pages
+ for page_num, img_path, width, height in results:
+ if (
+ 0 <= page_num < num_pages
+ and img_path
+ and "placeholder" not in str(img_path)
+ ):
+ pdf_image_file_paths[page_num] = img_path
+ page_sizes[page_num]["image_path"] = img_path
+ if pd.notna(width) and pd.notna(height):
+ page_sizes[page_num]["image_width"] = width
+ page_sizes[page_num]["image_height"] = height
+
+ return pdf_image_file_paths, page_sizes
+
+
+def _get_bbox(d: dict) -> list:
+ """Get bounding box list from dict; support both 'bounding_box' and 'boundingBox'."""
+ return d.get("bounding_box") or d.get("boundingBox") or [0, 0, 0, 0]
+
+
+def word_level_ocr_output_to_dataframe(ocr_results: list) -> pd.DataFrame:
+ """
+ Convert a list of per-page OCR result dictionaries into a word-level dataframe.
+
+ Args:
+ ocr_results (list): A list of OCR result dictionaries, one per page.
+
+ Returns:
+ pd.DataFrame: A dataframe containing the OCR results.
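+
+ Expected input shape (illustrative values):
+ [{"page": 1, "results": {"text_line_1": {"line": 1, "conf": 95.0,
+ "bounding_box": [10, 20, 200, 40], "words": [{"text": "Hello", "conf": 96.0,
+ "bounding_box": [10, 20, 60, 40]}]}}}]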
+ """
+ rows = list()
+
+ for ocr_result in ocr_results:
+
+ page_number = int(ocr_result["page"])
+
+ for line_key, line_data in ocr_result["results"].items():
+
+ line_number = int(line_data["line"])
+ # Support both "confidence" (Textract/json_to_ocrresult) and "conf" (other OCR)
+ line_conf = line_data.get("confidence", line_data.get("conf", 100.0))
+ line_bbox = _get_bbox(line_data)
+ for word in line_data["words"]:
+ word_conf = word.get("confidence", word.get("conf", 100.0))
+ word_bbox = _get_bbox(word)
+ rows.append(
+ {
+ "page": page_number,
+ "line": line_number,
+ "word_text": word["text"],
+ "word_x0": word_bbox[0],
+ "word_y0": word_bbox[1],
+ "word_x1": word_bbox[2],
+ "word_y1": word_bbox[3],
+ "word_conf": word_conf,
+ "line_text": "", # line_data['text'], # This data is too large to include
+ "line_x0": line_bbox[0],
+ "line_y0": line_bbox[1],
+ "line_x1": line_bbox[2],
+ "line_y1": line_bbox[3],
+ "line_conf": line_conf,
+ }
+ )
+
+ return pd.DataFrame(rows)
+
+
+def word_level_ocr_df_to_line_level_ocr_df(
+ word_level_df: pd.DataFrame,
+) -> pd.DataFrame:
+ """
+ Convert word-level OCR results dataframe to line-level OCR results dataframe.
+
+ Word-level format has one row per word (page, line, word_text, word_x0, word_y0,
+ word_x1, word_y1, word_conf, ...). Line-level format has one row per line with
+ aggregated text and bounding box (page, text, left, top, width, height, line, conf).
+
+ Args:
+ word_level_df: DataFrame with columns including page, line, word_text,
+ word_x0, word_y0, word_x1, word_y1, and word_conf (or line_conf).
+
+ Returns:
+ DataFrame with columns page, text, left, top, width, height, line, conf.
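+
+ Example (hypothetical values): two word rows on page 1, line 1 - "Hello" (word_x0=10,
+ word_x1=60) and "world" (word_x0=65, word_x1=120) - aggregate to one line row with
+ text="Hello world", left=10, width=110, and conf taken from line_conf if present,
+ otherwise the mean of word_conf.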
+ """
+ required = ["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1"]
+ for col in required:
+ if col not in word_level_df.columns:
+ raise ValueError(
+ f"word_level_df must contain column '{col}'. "
+ f"Found: {list(word_level_df.columns)}"
+ )
+
+ def agg_line(group: pd.DataFrame) -> pd.Series:
+ text = " ".join(group["word_text"].astype(str).dropna())
+ x0 = group["word_x0"].min()
+ y0 = group["word_y0"].min()
+ x1 = group["word_x1"].max()
+ y1 = group["word_y1"].max()
+ if "line_conf" in group.columns and group["line_conf"].notna().any():
+ conf = group["line_conf"].dropna().iloc[0]
+ else:
+ conf = group["word_conf"].mean() if "word_conf" in group.columns else 100.0
+ return pd.Series(
+ {
+ "text": text,
+ "left": x0,
+ "top": y0,
+ "width": x1 - x0,
+ "height": y1 - y0,
+ "conf": conf,
+ }
+ )
+
+ line_level = (
+ word_level_df.groupby(["page", "line"], sort=False)
+ .apply(agg_line)
+ .reset_index()
+ )
+ # Match expected column order: page, text, left, top, width, height, line, conf
+ return line_level[
+ ["page", "text", "left", "top", "width", "height", "line", "conf"]
+ ]
+
+
+def extract_redactions(
+ doc: Document, page_sizes: List[Dict[str, Any]] = None
+) -> Tuple[List[Dict[str, Any]], Document]:
+ """
+ Extracts all redaction annotations from a PDF document and converts them
+ to Gradio Annotation JSON format.
+
+ Note: This function identifies the *markings* for redaction. It does not
+ tell you if the redaction has been *applied* (i.e., the underlying
+ content is permanently removed).
+
+ Args:
+ doc: The PyMuPDF document object.
+ page_sizes: List of dictionaries containing page information with keys:
+ 'page', 'image_path', 'image_width', 'image_height'.
+ If None, will create placeholder structure.
+
+ Returns:
+ List of dictionaries suitable for Gradio Annotation output, one dict per image/page.
+ PyMuPDF document object.
+ Each dict has structure: {"image": image_path, "boxes": [list of annotation boxes]}
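+ Each box is a dict like (illustrative values): {"label": "Redaction 1",
+ "color": "(0, 0, 0)", "xmin": 0.1, "ymin": 0.2, "xmax": 0.4, "ymax": 0.25,
+ "text": "", "id": "aB3dE5fG7hJ9"}, with coordinates converted to page-relative
+ values before returning.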
+ """
+
+ # Helper function to generate unique IDs
+ def _generate_unique_ids(num_ids: int, existing_ids: set = None) -> List[str]:
+ if existing_ids is None:
+ existing_ids = set()
+
+ id_length = 12
+ character_set = string.ascii_letters + string.digits
+ unique_ids = list()
+
+ for _ in range(num_ids):
+ while True:
+ candidate_id = "".join(random.choices(character_set, k=id_length))
+ if candidate_id not in existing_ids:
+ existing_ids.add(candidate_id)
+ unique_ids.append(candidate_id)
+ break
+
+ return unique_ids
+
+ # Extract redaction annotations from the document
+ redactions_by_page = dict()
+ existing_ids = set()
+
+ for page_num, page in enumerate(doc):
+ page_redactions = list()
+
+ # The page.annots() method is a generator for all annotations on the page
+ for annot in page.annots():
+ # The type of a redaction annotation is 12
+ if annot.type[0] == pymupdf.PDF_ANNOT_REDACT:
+
+ # Get annotation info with fallbacks
+ annot_info = annot.info or {}
+ annot_colors = annot.colors or {}
+
+ # Extract coordinates from the annotation rectangle (PDF space, same units as mediabox)
+ rect = annot.rect
+ x0, y0, x1, y1 = rect.x0, rect.y0, rect.x1, rect.y1
+
+ # Convert PDF coordinates to image pixel coordinates (always scale by image size)
+ page_size_info = None
+ if page_sizes:
+ for ps in page_sizes:
+ if ps.get("page") == page_num + 1:
+ page_size_info = ps
+ break
+
+ if not page_size_info:
+ raise ValueError(
+ f"extract_redactions: no page_sizes entry for page {page_num + 1}. "
+ "Ensure page_sizes is built and images exist before extracting redactions."
+ )
+
+ mediabox_width = page_size_info.get("mediabox_width", 1)
+ mediabox_height = page_size_info.get("mediabox_height", 1)
+ image_width = page_size_info.get("image_width")
+ image_height = page_size_info.get("image_height")
+
+ try:
+ w = float(image_width) if image_width is not None else 0
+ h = float(image_height) if image_height is not None else 0
+ has_valid_image_dims = w > 0 and h > 0
+ except (TypeError, ValueError):
+ has_valid_image_dims = False
+
+ if not has_valid_image_dims:
+ scale_x = 1
+ scale_y = 1
+ rel_x0 = x0
+ rel_y0 = y0
+ rel_x1 = x1
+ rel_y1 = y1
+ # raise ValueError(
+ # f"extract_redactions: page {page_num + 1} has no valid image dimensions "
+ # "(image_width/image_height). Create images for all pages before loading redactions."
+ # )
+ else:
+ scale_x = w / mediabox_width
+ scale_y = h / mediabox_height
+ rel_x0 = x0 * scale_x
+ rel_y0 = y0 * scale_y
+ rel_x1 = x1 * scale_x
+ rel_y1 = y1 * scale_y
+
+ # Get color and convert from 0-1 range to 0-255 range
+ fill_color = annot_colors.get(
+ "fill", (0, 0, 0)
+ ) # Default to black if no color
+ if isinstance(fill_color, (tuple, list)) and len(fill_color) >= 3:
+ # Convert from 0-1 range to 0-255 range
+ color_255 = tuple(
+ int(component * 255) if component <= 1 else int(component)
+ for component in fill_color[:3]
+ )
+ else:
+ color_255 = (0, 0, 0) # Default to black
+
+ # Create annotation box in the required format
+ redaction_box = {
+ "label": annot_info.get(
+ "title", f"Redaction {len(page_redactions) + 1}"
+ ),
+ "color": str(color_255),
+ "xmin": rel_x0,
+ "ymin": rel_y0,
+ "xmax": rel_x1,
+ "ymax": rel_y1,
+ "text": annot_info.get("content", ""),
+ "id": None, # Will be filled after generating IDs
+ }
+
+ page_redactions.append(redaction_box)
+
+ # Remove the redaction annotation from the pymupdf document
+ page.delete_annot(annot)
+
+ # if page.annots:
+ # page.annots = [
+ # annot
+ # for annot in page.annots()
+ # if annot.type[0] != pymupdf.PDF_ANNOT_REDACT
+ # ]
+
+ if page_redactions:
+ redactions_by_page[page_num + 1] = page_redactions
+
+ # Generate unique IDs for all redaction boxes
+ all_boxes = list()
+ for page_redactions in redactions_by_page.values():
+ all_boxes.extend(page_redactions)
+
+ if all_boxes:
+ unique_ids = _generate_unique_ids(len(all_boxes), existing_ids)
+
+ # Assign IDs to boxes
+ box_idx = 0
+ for page_num, page_redactions in redactions_by_page.items():
+ for box in page_redactions:
+ box["id"] = unique_ids[box_idx]
+ box_idx += 1
+
+ # Build JSON structure based on page_sizes or create placeholder structure
+ json_data = list()
+
+ if page_sizes:
+ # Use provided page_sizes to build structure
+ for page_info in page_sizes:
+ page_num = page_info.get("page", 1)
+ image_path = page_info.get(
+ "image_path", f"placeholder_image_{page_num}.png"
+ )
+
+ # Get redactions for this page
+ annotation_boxes = redactions_by_page.get(page_num, [])
+
+ json_data.append({"image": image_path, "boxes": annotation_boxes})
+ else:
+ # Create placeholder structure based on document pages
+ for page_num in range(1, doc.page_count + 1):
+ image_path = f"placeholder_image_{page_num}.png"
+ annotation_boxes = redactions_by_page.get(page_num, [])
+
+ json_data.append({"image": image_path, "boxes": annotation_boxes})
+
+ total_redactions = sum(len(boxes) for boxes in redactions_by_page.values())
+ print(f"Found {total_redactions} redactions in the document")
+
+ # Convert the gradio annotation boxes to relative coordinates
+ page_sizes_df = pd.DataFrame(page_sizes)
+ page_sizes_df.loc[:, "page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
+
+ all_image_annotations_df = convert_annotation_data_to_dataframe(json_data)
+ all_image_annotations_df = divide_coordinates_by_page_sizes(
+ all_image_annotations_df,
+ page_sizes_df,
+ xmin="xmin",
+ xmax="xmax",
+ ymin="ymin",
+ ymax="ymax",
+ )
+ annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(
+ all_image_annotations_df, page_sizes
+ )
+
+ return annotations_all_pages_divide, doc
+
+
+def _rects_match(rect_a, rect_b, tolerance: float = 0.5) -> bool:
+ """Return True if two PyMuPDF rects are the same within tolerance (in points)."""
+ return (
+ abs(rect_a.x0 - rect_b.x0) <= tolerance
+ and abs(rect_a.y0 - rect_b.y0) <= tolerance
+ and abs(rect_a.x1 - rect_b.x1) <= tolerance
+ and abs(rect_a.y1 - rect_b.y1) <= tolerance
+ )
+
+
+def _dst_page_has_duplicate_redact(dst_page, rect, title: str, content: str) -> bool:
+ """Return True if dst_page already has a redaction annot with same rect, title, and content."""
+ title = (title or "").strip()
+ content = (content or "").strip()
+ for existing in dst_page.annots():
+ if existing.type[0] != pymupdf.PDF_ANNOT_REDACT:
+ continue
+ if not _rects_match(rect, existing.rect):
+ continue
+ info = existing.info or {}
+ existing_title = (info.get("title") or "").strip()
+ existing_content = (info.get("content") or "").strip()
+ if existing_title == title and existing_content == content:
+ return True
+ return False
+
+
+def _get_base_name_from_review_pdf_path(file_path: str) -> str:
+ """
+ Extract the base file name from a '_redactions_for_review...' path.
+ E.g. 'mydoc_redactions_for_review.pdf' -> 'mydoc',
+ 'mydoc_redactions_for_review_pages_1-2.pdf' -> 'mydoc'.
+ """
+ basename = os.path.basename(file_path)
+ name_without_ext = os.path.splitext(basename)[0]
+ suffix = "_redactions_for_review"
+ if suffix in name_without_ext:
+ return name_without_ext.split(suffix)[0]
+ return name_without_ext
+
+
+def _parse_review_pdf_page_suffix(
+ file_path: str,
+) -> Tuple[bool, Optional[int], Optional[int]]:
+ """
+ If the review PDF path ends with a page-range suffix _N_M (e.g. _2_4), return
+ (True, N, M). Otherwise return (False, None, None).
+ E.g. 'mydoc_redactions_for_review_2_4.pdf' -> (True, 2, 4)
+ 'mydoc_redactions_for_review.pdf' -> (False, None, None)
+ """
+ basename = os.path.basename(file_path)
+ name_without_ext = os.path.splitext(basename)[0]
+ match = re.search(r"_(\d+)_(\d+)$", name_without_ext)
+ if match:
+ return True, int(match.group(1)), int(match.group(2))
+ return False, None, None
+
+
+def _get_review_pdf_combined_output_base(file_path: str) -> str:
+ """
+ From a review PDF path, get the base for the combined output filename:
+ everything up to and including '_redactions_for_review', excluding any
+ text after that (e.g. " (1)", " (2)", "_2_4").
+ E.g. 'file_redactions_for_review (1).pdf' -> 'file_redactions_for_review'
+ 'file_FINAL_redactions_for_review.pdf' -> 'file_FINAL_redactions_for_review'
+ """
+ basename = os.path.basename(file_path)
+ name_without_ext = os.path.splitext(basename)[0]
+ suffix = "_redactions_for_review"
+ if suffix in name_without_ext:
+ idx = name_without_ext.index(suffix) + len(suffix)
+ return name_without_ext[:idx]
+ return name_without_ext
+
+
+def combine_review_pdf_files(file_list, output_folder: str = OUTPUT_FOLDER):
+ """
+ Combine redaction comments from multiple '_redactions_for_review' PDFs into one PDF.
+
+ Only validates that all files have the same number of pages. File names may
+ differ (e.g. file_redactions_for_review (1).pdf, file_redactions_for_review (2).pdf,
+ or file_FINAL_redactions_for_review.pdf). The output filename is derived from
+ the first input file: the name up to and including 'redactions_for_review' is
+ taken (anything after that is dropped), then '_combined' is appended, e.g.
+ file_redactions_for_review_combined.pdf.
+
+ Args:
+ file_list: List of file paths or Gradio FileData-like objects with .name.
+ output_folder: Folder to write the combined PDF.
+
+ Returns:
+ List containing the path to the combined PDF for use as gr.File output.
+ On validation error, raises ValueError (e.g. page count mismatch).
+ """
+ if not file_list:
+ return []
+
+ # Normalise to paths (Gradio may pass FileData with .name or dict with "name")
+ paths = []
+ for f in file_list:
+ p = (
+ getattr(f, "name", None)
+ or (f.get("name") if isinstance(f, dict) else None)
+ or f
+ )
+ if isinstance(p, str):
+ paths.append(p)
+ if not paths:
+ return []
+
+ output_base = _get_review_pdf_combined_output_base(paths[0])
+ first_doc = pymupdf.open(paths[0])
+ page_count = len(first_doc)
+
+ for p in paths[1:]:
+ other_doc = pymupdf.open(p)
+ if len(other_doc) != page_count:
+ other_doc.close()
+ first_doc.close()
+ raise ValueError(
+ f"All files must have the same number of pages. "
+ f"'{os.path.basename(paths[0])}' has {page_count} pages but "
+ f"'{os.path.basename(p)}' has {len(other_doc)} pages."
+ )
+ # Copy redaction annotations from each page of other_doc into first_doc
+ for page_num in range(page_count):
+ src_page = other_doc[page_num]
+ dst_page = first_doc[page_num]
+ # Collect annots so we don't modify while iterating
+ annots = list(src_page.annots())
+ for annot in annots:
+ if annot.type[0] != pymupdf.PDF_ANNOT_REDACT:
+ continue
+ rect = annot.rect
+ annot_colors = annot.colors or {}
+ annot_info = annot.info or {}
+ title = annot_info.get("title", "Redaction")
+ content = annot_info.get("content", "")
+ # Skip duplicate: same position and same label/text content
+ if _dst_page_has_duplicate_redact(dst_page, rect, title, content):
+ continue
+ stroke = annot_colors.get("stroke", (0, 0, 0))
+ fill = annot_colors.get("fill", (0, 0, 0))
+ new_annot = dst_page.add_redact_annot(rect)
+ new_annot.set_colors(stroke=stroke, fill=fill, colors=fill)
+ new_annot.set_name(title)
+ new_annot.set_info(
+ info=title,
+ title=title,
+ subject=annot_info.get("subject", "Redaction"),
+ content=content,
+ creationDate=annot_info.get("creationDate", ""),
+ )
+ new_annot.update(opacity=0.5, cross_out=False)
+ other_doc.close()
+
+ out_path = os.path.join(output_folder, output_base + "_combined.pdf")
+ first_doc.save(out_path, clean=True)
+ first_doc.close()
+ return [out_path]
+
+
+def prepare_image_or_pdf_with_efficient_ocr(
+ file_paths,
+ text_extract_method,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ out_message,
+ first_loop_state,
+ number_of_pages,
+ all_annotations_object,
+ prepare_for_review,
+ in_fully_redacted_list,
+ output_folder,
+ input_folder,
+ efficient_ocr,
+ prepare_images_bool_false,
+ page_sizes,
+ pymupdf_doc,
+ page_min,
+ page_max,
+):
+ """When EFFICIENT_OCR is enabled, skip loading all images; they are created later only for pages that need OCR."""
+ prepare_images = (
+ False
+ if efficient_ocr
+ else (
+ prepare_images_bool_false if prepare_images_bool_false is not None else True
+ )
+ )
+ return prepare_image_or_pdf(
+ file_paths,
+ text_extract_method,
+ all_page_line_level_ocr_results_df_base,
+ all_page_line_level_ocr_results_with_words_df_base,
+ latest_file_completed_num,
+ out_message,
+ first_loop_state,
+ number_of_pages,
+ all_annotations_object,
+ prepare_for_review,
+ in_fully_redacted_list,
+ output_folder,
+ input_folder,
+ prepare_images,
+ page_sizes,
+ pymupdf_doc,
+ page_min=page_min,
+ page_max=page_max,
+ )
+
+
+def prepare_image_or_pdf(
+ file_paths: List[str],
+ text_extract_method: str,
+ all_line_level_ocr_results_df: pd.DataFrame = None,
+ all_page_line_level_ocr_results_with_words_df: pd.DataFrame = None,
+ latest_file_completed: int = 0,
+ out_message: List[str] = list(),
+ first_loop_state: bool = False,
+ number_of_pages: int = 0,
+ all_annotations_object: List = list(),
+ prepare_for_review: bool = False,
+ in_fully_redacted_list: List[int] = list(),
+ output_folder: str = OUTPUT_FOLDER,
+ input_folder: str = INPUT_FOLDER,
+ prepare_images: bool = True,
+ page_sizes: list[dict] = list(),
+ pymupdf_doc: Document = list(),
+ textract_output_found: bool = False,
+ relevant_ocr_output_with_words_found: bool = False,
+ page_min: int = 0,
+ page_max: int = 0,
+ progress: Progress = Progress(track_tqdm=True),
+) -> tuple[List[str], List[str]]:
+ """
+ Prepare and process image or text PDF files for redaction.
+
+ This function takes a list of file paths, processes each file based on the specified redaction method,
+ and returns the output messages and processed file paths.
+
+ Args:
+ file_paths (List[str]): List of file paths to process.
+ text_extract_method (str): The text extraction method to use (e.g. selectable text, local OCR, or AWS Textract).
+ all_line_level_ocr_results_df (optional, pd.DataFrame): Any existing line-level OCR results to carry through.
+ all_page_line_level_ocr_results_with_words_df (optional, pd.DataFrame): Any existing word-level OCR results to carry through.
+ latest_file_completed (optional, int): Index of the last completed file.
+ out_message (optional, List[str]): List to store output messages.
+ first_loop_state (optional, bool): Flag indicating if this is the first iteration.
+ number_of_pages (optional, int): integer indicating the number of pages in the document
+ all_annotations_object(optional, List of annotation objects): All annotations for current document
+ prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
+ in_fully_redacted_list(optional, List of int): A list of pages to fully redact
+ output_folder (optional, str): The output folder for file saves.
+ input_folder (optional, str): The folder used for intermediate files such as page images.
+ prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
+ page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
+ pymupdf_doc(optional, Document): A pymupdf document object that indicates the existing PDF document object.
+ textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
+ relevant_ocr_output_with_words_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
+ page_min (optional, int): The minimum page number to process (0-indexed). If 0, uses the first page. Defaults to 0.
+ page_max (optional, int): The maximum page number to process (0-indexed). If 0, uses the last page of the document. Defaults to 0.
+ progress (optional, Progress): Progress tracker for the operation
+
+
+ Returns:
+ tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
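+
+ Example (illustrative only; assumes a single local PDF and default settings):
+ outputs = prepare_image_or_pdf(
+ ["input/my_document.pdf"],
+ SELECTABLE_TEXT_EXTRACT_OPTION,
+ first_loop_state=True,
+ )
+ combined_out_message, converted_file_paths = outputs[0], outputs[1]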
+ """
+
+ tic = time.perf_counter()
+ json_from_csv = False
+ original_cropboxes = list() # Store original CropBox values
+ converted_file_paths = list()
+ image_file_paths = list()
+ all_img_details = list()
+ review_file_csv = pd.DataFrame()
+ out_textract_path = ""
+ combined_out_message = ""
+ final_out_message = ""
+ log_files_output_paths = list()
+
+ if isinstance(in_fully_redacted_list, pd.DataFrame):
+ if not in_fully_redacted_list.empty:
+ in_fully_redacted_list = in_fully_redacted_list.iloc[:, 0].tolist()
+
+ # If this is the first time around, set variables to 0/blank
+ if first_loop_state is True:
+ latest_file_completed = 0
+ out_message = list()
+ all_annotations_object = list()
+ else:
+ print("Now redacting file", str(latest_file_completed))
+
+ # If out_message is a string, convert it to a list so new messages can be appended
+ if isinstance(out_message, str):
+ out_message = [out_message]
+
+ if not file_paths:
+ file_paths = list()
+
+ if isinstance(file_paths, dict):
+ file_paths = os.path.abspath(file_paths["name"])
+
+ if isinstance(file_paths, str):
+ file_path_number = 1
+ else:
+ file_path_number = len(file_paths)
+
+ if file_path_number > MAX_SIMULTANEOUS_FILES:
+ out_message = f"Number of files loaded is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+ print(out_message)
+ raise Exception(out_message)
+
+ latest_file_completed = int(latest_file_completed)
+
+ # If we have already redacted the last file, return the input out_message and file list to the relevant components
+ if latest_file_completed >= file_path_number:
+ print("Last file reached, returning files:", str(latest_file_completed))
+ if isinstance(out_message, list):
+ final_out_message = "\n".join(out_message)
+ else:
+ final_out_message = out_message
+
+ return (
+ final_out_message,
+ converted_file_paths,
+ image_file_paths,
+ number_of_pages,
+ number_of_pages,
+ pymupdf_doc,
+ all_annotations_object,
+ review_file_csv,
+ original_cropboxes,
+ page_sizes,
+ textract_output_found,
+ all_img_details,
+ all_line_level_ocr_results_df,
+ relevant_ocr_output_with_words_found,
+ all_page_line_level_ocr_results_with_words_df,
+ )
+
+ progress(0.1, desc="Preparing file")
+
+ def _file_item_to_path(item):
+ """Normalize Gradio file input (str, dict with 'name'/'path', or object with .name/.path) to path string."""
+ if isinstance(item, str):
+ return item
+ if isinstance(item, dict):
+ return item.get("name") or item.get("path") or ""
+ return getattr(item, "name", None) or getattr(item, "path", None) or ""
+
+ if isinstance(file_paths, str):
+ file_paths_list = [file_paths]
+ file_paths_loop = file_paths_list
+ else:
+ file_paths_list = [_file_item_to_path(f) for f in file_paths if f is not None]
+ file_paths_list = [p for p in file_paths_list if p and str(p).strip()]
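+ # Sort so PDFs are handled first, then JSON files, then any other inputs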
+ file_paths_loop = sorted(
+ file_paths_list,
+ key=lambda x: (
+ os.path.splitext(x)[1] != ".pdf",
+ os.path.splitext(x)[1] != ".json",
+ ),
+ )
+
+ # Loop through files to load in
+ for file in file_paths_loop:
+ converted_file_path = list()
+ image_file_path = list()
+
+ file_path = file if isinstance(file, str) else _file_item_to_path(file)
+ file_path_without_ext = get_file_name_without_type(file_path)
+ file_name_with_ext = os.path.basename(file_path)
+
+ print("Loading file:", file_name_with_ext)
+
+ if not file_path:
+ out_message = "Please select at least one file."
+ print(out_message)
+ raise Warning(out_message)
+
+ file_extension = os.path.splitext(file_path)[1].lower()
+
+ progress(0.2, desc="Preparing file")
+
+ # If a pdf, load as a pymupdf document
+ if is_pdf(file_path):
+ print(f"File {file_name_with_ext} is a PDF")
+ pymupdf_doc = pymupdf.open(file_path)
+
+ converted_file_path = file_path
+
+ if prepare_images is True:
+ (
+ image_file_paths,
+ image_sizes_width,
+ image_sizes_height,
+ all_img_details,
+ ) = process_file_for_image_creation(
+ file_path,
+ prepare_for_review,
+ input_folder,
+ create_images=True,
+ page_min=page_min,
+ page_max=page_max,
+ progress=progress,
+ )
+ else:
+ (
+ image_file_paths,
+ image_sizes_width,
+ image_sizes_height,
+ all_img_details,
+ ) = process_file_for_image_creation(
+ file_path,
+ prepare_for_review,
+ input_folder,
+ create_images=False,
+ page_min=page_min,
+ page_max=page_max,
+ progress=progress,
+ )
+
+ page_sizes, original_cropboxes = create_page_size_objects(
+ pymupdf_doc,
+ image_sizes_width,
+ image_sizes_height,
+ image_file_paths,
+ page_min,
+ page_max,
+ )
+
+ # Create base version of the annotation object that doesn't have any annotations in it
+ if (not all_annotations_object) and (prepare_for_review is True):
+ all_annotations_object = list()
+
+ for image_path in image_file_paths:
+ annotation = dict()
+ annotation["image"] = image_path
+ annotation["boxes"] = list()
+
+ all_annotations_object.append(annotation)
+
+ # If we are loading redactions from the pdf, extract the redactions
+ if LOAD_REDACTION_ANNOTATIONS_FROM_PDF and prepare_for_review is True:
+
+ redactions_list, pymupdf_doc = extract_redactions(
+ pymupdf_doc, page_sizes
+ )
+ all_annotations_object = redactions_list
+
+ review_file_csv = convert_annotation_json_to_review_df(
+ all_annotations_object
+ )
+
+ elif is_pdf_or_image(file_path): # Alternatively, if it's an image
+ print(f"File {file_name_with_ext} is an image")
+ # Check if the file is an image type and the user selected text ocr option
+ if (
+ file_extension in [".jpg", ".jpeg", ".png"]
+ and text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION
+ ):
+ text_extract_method = LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+
+ # Convert image to a pymupdf document
+ pymupdf_doc = pymupdf.open() # Create a new empty document
+
+ img = Image.open(file_path) # Open the image file
+ rect = pymupdf.Rect(
+ 0, 0, img.width, img.height
+ ) # Create a rectangle for the image
+ pymupdf_page = pymupdf_doc.new_page(
+ width=img.width, height=img.height
+ ) # Add a new page
+ pymupdf_page.insert_image(
+ rect, filename=file_path
+ ) # Insert the image into the page
+ pymupdf_page = pymupdf_doc.load_page(0)
+
+ file_path_str = str(file_path)
+
+ image_file_paths, image_sizes_width, image_sizes_height, all_img_details = (
+ process_file_for_image_creation(
+ file_path_str,
+ prepare_for_review,
+ input_folder,
+ create_images=True,
+ progress=progress,
+ )
+ )
+
+ # Create a page_sizes_object
+ page_sizes, original_cropboxes = create_page_size_objects(
+ pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths
+ )
+
+ # Create base version of the annotation object for review (same as PDF branch)
+ if (not all_annotations_object) and (prepare_for_review is True):
+ all_annotations_object = list()
+ for image_path in image_file_paths:
+ annotation = dict()
+ annotation["image"] = image_path
+ annotation["boxes"] = list()
+ all_annotations_object.append(annotation)
+
+ converted_file_path = output_folder + file_name_with_ext
+
+ pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
+
+ # Loading in review files, ocr_outputs, or ocr_outputs_with_words
+ elif file_extension in [".csv"]:
+ if "_review_file" in file_path_without_ext:
+ review_file_csv = read_file(file_path)
+ all_annotations_object = convert_review_df_to_annotation_json(
+ review_file_csv, image_file_paths, page_sizes
+ )
+ json_from_csv = True
+ elif "_ocr_output" in file_path_without_ext:
+ all_line_level_ocr_results_df = read_file(file_path)
+
+ if "line" not in all_line_level_ocr_results_df.columns:
+ all_line_level_ocr_results_df["line"] = ""
+
+ json_from_csv = False
+ elif "_ocr_results_with_words" in file_path_without_ext:
+ all_page_line_level_ocr_results_with_words_df = read_file(file_path)
+ json_from_csv = False
+
+ # Convert word-level OCR results to line-level if line-level is empty
+ if all_line_level_ocr_results_df is None or (
+ isinstance(all_line_level_ocr_results_df, pd.DataFrame)
+ and all_line_level_ocr_results_df.empty
+ ):
+ all_line_level_ocr_results_df = (
+ word_level_ocr_df_to_line_level_ocr_df(
+ all_page_line_level_ocr_results_with_words_df
+ )
+ )
+ if "line" not in all_line_level_ocr_results_df.columns:
+ all_line_level_ocr_results_df["line"] = ""
+
+ # If the file is a .json (or a JSON was built from a review CSV): when preparing for review, treat it as an annotations object and overwrite the current one; otherwise treat it as a Textract or local OCR output file and copy it to the output folder
+
+ if (file_extension in [".json"]) | (json_from_csv is True):
+
+ if (file_extension in [".json"]) & (prepare_for_review is True):
+ if isinstance(file_path, str):
+ # Split the path into base directory and filename for security
+ file_path_obj = Path(file_path)
+ base_dir = file_path_obj.parent
+ filename = file_path_obj.name
+
+ json_content = secure_file_read(base_dir, filename)
+ all_annotations_object = json.loads(json_content)
+ else:
+ # Assuming file_path is a NamedString or similar
+ all_annotations_object = json.loads(
+ file_path
+ ) # Use loads for string content
+
+ # Save Textract file to folder
+ elif (
+ file_extension in [".json"]
+ ) and "_textract" in file_path_without_ext: # (prepare_for_review != True):
+ print("Saving Textract output")
+ # Copy it to the output folder so it can be used later.
+ # If the path already ends with _textract.json (e.g. _sig_textract.json), preserve the basename;
+ # otherwise append _textract.json. Use endswith instead of regex to avoid ReDoS (CodeQL py/polynomial-redos).
+ if file_path.endswith("_textract.json"):
+ # File already has a textract suffix, preserve it
+ output_textract_json_file_name = file_path_without_ext + ".json"
+ else:
+ # No textract suffix found, add default one
+ output_textract_json_file_name = (
+ file_path_without_ext + "_textract.json"
+ )
+
+ out_textract_path = secure_join(
+ output_folder, output_textract_json_file_name
+ )
+
+ # Use shutil to copy the file directly
+ shutil.copy2(file_path, out_textract_path) # Preserves metadata
+ textract_output_found = True
+ continue
+
+ elif (
+ file_extension in [".json"]
+ ) and "_ocr_results_with_words" in file_path_without_ext: # (prepare_for_review != True):
+ print("Saving local OCR output with words")
+ # Copy it to the output folder so it can be used later.
+ output_ocr_results_with_words_json_file_name = (
+ file_path_without_ext + ".json"
+ )
+
+ out_ocr_results_with_words_path = secure_join(
+ output_folder, output_ocr_results_with_words_json_file_name
+ )
+
+ # Use shutil to copy the file directly
+ shutil.copy2(
+ file_path, out_ocr_results_with_words_path
+ ) # Preserves metadata
+
+ if prepare_for_review is True:
+ print("Converting local OCR output with words to csv")
+ page_sizes_df = pd.DataFrame(page_sizes)
+ (
+ all_page_line_level_ocr_results_with_words,
+ is_missing,
+ log_files_output_paths,
+ ) = load_and_convert_ocr_results_with_words_json(
+ out_ocr_results_with_words_path,
+ log_files_output_paths,
+ page_sizes_df,
+ )
+ all_page_line_level_ocr_results_with_words_df = (
+ word_level_ocr_output_to_dataframe(
+ all_page_line_level_ocr_results_with_words
+ )
+ )
+
+ # Use mediabox for division when loading text-extraction output (PDF-point coords)
+ coords_in_pdf_points = file_path.endswith(
+ "_ocr_results_with_words_local_text.json"
+ )
+ all_page_line_level_ocr_results_with_words_df = (
+ divide_coordinates_by_page_sizes(
+ all_page_line_level_ocr_results_with_words_df,
+ page_sizes_df,
+ xmin="word_x0",
+ xmax="word_x1",
+ ymin="word_y0",
+ ymax="word_y1",
+ coordinates_in_pdf_points=coords_in_pdf_points,
+ )
+ )
+ all_page_line_level_ocr_results_with_words_df = (
+ divide_coordinates_by_page_sizes(
+ all_page_line_level_ocr_results_with_words_df,
+ page_sizes_df,
+ xmin="line_x0",
+ xmax="line_x1",
+ ymin="line_y0",
+ ymax="line_y1",
+ coordinates_in_pdf_points=coords_in_pdf_points,
+ )
+ )
+
+ if (
+ text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION
+ and file_path.endswith("_ocr_results_with_words_local_text.json")
+ ):
+ relevant_ocr_output_with_words_found = True
+ if (
+ text_extract_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and file_path.endswith("_ocr_results_with_words_local_ocr.json")
+ ):
+ relevant_ocr_output_with_words_found = True
+ if (
+ text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and file_path.endswith("_ocr_results_with_words_textract.json")
+ ):
+ relevant_ocr_output_with_words_found = True
+ continue
+
+ # If you have an annotations object from the above code
+ if all_annotations_object:
+
+ image_file_paths_pages = [
+ safe_extract_page_number_from_path(s)
+ for s in image_file_paths
+ if safe_extract_page_number_from_path(s) is not None
+ ]
+ image_file_paths_pages = [int(i) for i in image_file_paths_pages]
+
+ # If PDF pages have been converted to image files, replace the current image paths in the json to this.
+ if image_file_paths:
+ for i, image_file_path in enumerate(image_file_paths):
+
+ if i < len(all_annotations_object):
+ annotation = all_annotations_object[i]
+ else:
+ annotation = dict()
+ all_annotations_object.append(annotation)
+
+ try:
+ if not annotation:
+ annotation = {"image": "", "boxes": []}
+ annotation_page_number = (
+ safe_extract_page_number_from_path(image_file_path)
+ )
+ if annotation_page_number is None:
+ continue
+ else:
+ annotation_page_number = (
+ safe_extract_page_number_from_path(
+ annotation["image"]
+ )
+ )
+ if annotation_page_number is None:
+ continue
+ except Exception as e:
+ print("Extracting page number from image failed due to:", e)
+ annotation_page_number = 0
+
+ # Check if the annotation page number exists in the image file paths pages
+ if annotation_page_number in image_file_paths_pages:
+
+ # Set the correct image page directly since we know it's in the list
+ correct_image_page = annotation_page_number
+ annotation["image"] = image_file_paths[correct_image_page]
+ else:
+ print(
+ "Page", annotation_page_number, "image file not found."
+ )
+
+ all_annotations_object[i] = annotation
+
+ # Write the response to a JSON file in output folder
+ out_folder = output_folder + file_path_without_ext + ".json"
+ # with open(out_folder, 'w') as json_file:
+ # json.dump(all_annotations_object, json_file, separators=(",", ":"))
+ continue
+
+ # If it's a zip, it could be extract from a Textract bulk API call. Check it's this, and load in json if found
+ if file_extension in [".zip"]:
+
+ # Assume it's a Textract response object. Copy it to the output folder so it can be used later.
+ out_folder = secure_join(
+ output_folder, file_path_without_ext + "_textract.json"
+ )
+
+ # Use shutil to copy the file directly
+ # Open the ZIP file to check its contents
+ with zipfile.ZipFile(file_path, "r") as zip_ref:
+ json_files = [
+ f for f in zip_ref.namelist() if f.lower().endswith(".json")
+ ]
+
+ if len(json_files) == 1: # Ensure only one JSON file exists
+ json_filename = json_files[0]
+
+ # Extract the JSON file to the same directory as the ZIP file
+ extracted_path = secure_join(
+ os.path.dirname(file_path), json_filename
+ )
+ zip_ref.extract(json_filename, os.path.dirname(file_path))
+
+ # Move the extracted JSON to the intended output location
+ shutil.move(extracted_path, out_folder)
+
+ textract_output_found = True
+ else:
+ print(
+ f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}"
+ )
+
+ converted_file_paths.append(converted_file_path)
+ image_file_paths.extend(image_file_path)
+
+ toc = time.perf_counter()
+ out_time = f"File '{file_name_with_ext}' prepared in {toc - tic:0.1f} seconds."
+
+ print(out_time)
+
+ if not out_message:
+ out_message = list()
+
+ out_message.append(out_time)
+ combined_out_message = "\n".join(out_message).strip() if out_message else ""
+
+ if not page_sizes:
+ number_of_pages = 1
+ else:
+ number_of_pages = len(page_sizes)
+
+ if first_loop_state is True:
+ print(f"Finished loading in {file_path_number} file(s)")
+ gr.Info(f"Finished loading in {file_path_number} file(s)")
+
+ return (
+ combined_out_message,
+ converted_file_paths,
+ image_file_paths,
+ number_of_pages,
+ number_of_pages,
+ pymupdf_doc,
+ all_annotations_object,
+ review_file_csv,
+ original_cropboxes,
+ page_sizes,
+ textract_output_found,
+ all_img_details,
+ all_line_level_ocr_results_df,
+ relevant_ocr_output_with_words_found,
+ all_page_line_level_ocr_results_with_words_df,
+ )
+
+
+def load_and_convert_ocr_results_with_words_json(
+ ocr_results_with_words_json_file_path: str,
+ log_files_output_paths: str,
+ page_sizes_df: pd.DataFrame,
+):
+ """
+ Loads an OCR results-with-words JSON file, checks that it is in the format the app expects,
+ and returns the data together with a flag indicating whether it is missing or invalid.
+ """
+
+ if not os.path.exists(ocr_results_with_words_json_file_path):
+ print("No existing OCR results file found.")
+ return (
+ [],
+ True,
+ log_files_output_paths,
+ ) # Return empty dict and flag indicating missing file
+
+ print("Found existing OCR results json results file.")
+
+ # Track log files
+ if ocr_results_with_words_json_file_path not in log_files_output_paths:
+ log_files_output_paths.append(ocr_results_with_words_json_file_path)
+
+ try:
+ with open(
+ ocr_results_with_words_json_file_path, "r", encoding="utf-8"
+ ) as json_file:
+ ocr_results_with_words_data = json.load(json_file)
+ except json.JSONDecodeError:
+ print("Error: Failed to parse OCR results JSON file. Returning empty data.")
+ return [], True, log_files_output_paths # Indicate failure
+
+ # Check the data is a non-empty list in the format the app expects
+ if ocr_results_with_words_data and (
+ "page" in ocr_results_with_words_data[0]
+ and "results" in ocr_results_with_words_data[0]
+ ):
+ print("JSON already in the correct format for app. No changes needed.")
+ return (
+ ocr_results_with_words_data,
+ False,
+ log_files_output_paths,
+ ) # No conversion required
+
+ else:
+ print("Invalid OCR result JSON format: 'page' or 'results' key missing.")
+
+ return (
+ [],
+ True,
+ log_files_output_paths,
+ ) # Return empty data if JSON is not recognized
+
+
+def convert_text_pdf_to_img_pdf(
+ in_file_path: str,
+ out_text_file_path: List[str],
+ image_dpi: float = image_dpi,
+ output_folder: str = OUTPUT_FOLDER,
+ input_folder: str = INPUT_FOLDER,
+):
+ file_path_without_ext = get_file_name_without_type(in_file_path)
+
+ out_file_paths = out_text_file_path
+
+ # Convert annotated text pdf back to image to give genuine redactions
+ pdf_text_image_paths, image_sizes_width, image_sizes_height, all_img_details = (
+ process_file_for_image_creation(out_file_paths[0], input_folder=input_folder)
+ )
+ out_text_image_file_path = (
+ output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
+ )
+ pdf_text_image_paths[0].save(
+ out_text_image_file_path,
+ "PDF",
+ resolution=image_dpi,
+ save_all=True,
+ append_images=pdf_text_image_paths[1:],
+ )
+
+ out_file_paths = [out_text_image_file_path]
+
+ out_message = "PDF " + file_path_without_ext + " converted to image-based file."
+ print(out_message)
+
+ return out_message, out_file_paths
+
+
+def save_pdf_with_or_without_compression(
+ pymupdf_doc: object,
+ out_redacted_pdf_file_path,
+ COMPRESS_REDACTED_PDF: bool = COMPRESS_REDACTED_PDF,
+):
+ """
+ Save a pymupdf document with basic cleaning or with full compression options. Can be useful for low memory systems to do minimal cleaning to avoid crashing with large PDFs.
+ """
+ if COMPRESS_REDACTED_PDF is True:
+ try:
+ pymupdf_doc.save(
+ out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True
+ )
+ except Exception as e:
+ print(
+ f"Error saving PDF with compression: {e}, trying again without compression"
+ )
+ pymupdf_doc.save(out_redacted_pdf_file_path, clean=True)
+ else:
+ try:
+ pymupdf_doc.save(out_redacted_pdf_file_path, garbage=1, clean=True)
+ except Exception as e:
+ print(f"Error saving PDF without compression: {e}, trying again")
+ pymupdf_doc.save(out_redacted_pdf_file_path, clean=True)
+
+
+def join_values_within_threshold(df1: pd.DataFrame, df2: pd.DataFrame):
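+ """
+ Cross-join df1 and df2 and keep pairs whose xmin/xmax/ymin/ymax values all match
+ within a small absolute threshold, then merge the matches back onto df1.
+ """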
+ # Threshold for matching
+ threshold = 5
+
+ # Perform a cross join
+ df1["key"] = 1
+ df2["key"] = 1
+ merged = pd.merge(df1, df2, on="key").drop(columns=["key"])
+
+ # Apply conditions for all columns
+ conditions = (
+ (abs(merged["xmin_x"] - merged["xmin_y"]) <= threshold)
+ & (abs(merged["xmax_x"] - merged["xmax_y"]) <= threshold)
+ & (abs(merged["ymin_x"] - merged["ymin_y"]) <= threshold)
+ & (abs(merged["ymax_x"] - merged["ymax_y"]) <= threshold)
+ )
+
+ # Filter rows that satisfy all conditions
+ filtered = merged[conditions]
+
+ # Drop duplicates if needed (e.g., keep only the first match for each row in df1)
+ result = filtered.drop_duplicates(subset=["xmin_x", "xmax_x", "ymin_x", "ymax_x"])
+
+ # Merge back into the original DataFrame (if necessary)
+ final_df = pd.merge(
+ df1,
+ result,
+ left_on=["xmin", "xmax", "ymin", "ymax"],
+ right_on=["xmin_x", "xmax_x", "ymin_x", "ymax_x"],
+ how="left",
+ )
+
+ # Clean up extra columns
+ final_df = final_df.drop(columns=["key"])
+
+
+def _pick_one_item_per_image(image: str, items: List[dict]) -> dict:
+ """Choose one item per image (prefer non-empty boxes); safe to run in a thread."""
+ non_empty_boxes = [item for item in items if item.get("boxes")]
+ return non_empty_boxes[0] if non_empty_boxes else items[0]
+
+
+def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
+ """
+ Remove items from the annotator object where the same page exists twice.
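+ For each image, the first entry that has non-empty 'boxes' is kept; if none have
+ boxes, the first entry for that image is kept instead.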
+ """
+ image_groups = defaultdict(list)
+ for item in data:
+ image_groups[item["image"]].append(item)
+
+ if not image_groups:
+ return []
+
+ groups_list = list(image_groups.items())
+ max_workers = min(MAX_WORKERS, len(groups_list))
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ result = list(
+ executor.map(
+ lambda pair: _pick_one_item_per_image(pair[0], pair[1]),
+ groups_list,
+ )
+ )
+ return result
+
+
+def divide_coordinates_by_page_sizes_pl(
+ df: pl.DataFrame,
+ page_sizes_df: pd.DataFrame,
+ xmin: str = "xmin",
+ xmax: str = "xmax",
+ ymin: str = "ymin",
+ ymax: str = "ymax",
+ coordinates_in_pdf_points: bool = False,
+ pages_in_pdf_points: Optional[Set[int]] = None,
+) -> pl.DataFrame:
+ """
+ Polars-only coordinate division: absolute coords (>1) to relative (<=1).
+ Expects df to have numeric coord columns. Returns pl.DataFrame.
+ """
+ coord_cols = [xmin, xmax, ymin, ymax]
+ for col in coord_cols:
+ if col in df.columns:
+ df = df.with_columns(pl.col(col).cast(pl.Float64, strict=False))
+
+ # Clip to 0 and round
+ df = df.with_columns(
+ [
+ pl.col(c).clip(0, float("inf")).round(6)
+ for c in coord_cols
+ if c in df.columns
+ ]
+ )
+
+ # Identify absolute coordinates (any coord > 1 and not null)
+ is_absolute = (
+ (pl.col(xmin) > 1) & pl.col(xmin).is_not_nan()
+ | (pl.col(xmax) > 1) & pl.col(xmax).is_not_nan()
+ | (pl.col(ymin) > 1) & pl.col(ymin).is_not_nan()
+ | (pl.col(ymax) > 1) & pl.col(ymax).is_not_nan()
+ )
+ df_rel = df.filter(~is_absolute)
+ df_abs = df.filter(is_absolute)
+
+ if not df_abs.is_empty() and not page_sizes_df.empty:
+ merge_cols = [
+ "page",
+ "image_width",
+ "image_height",
+ "mediabox_width",
+ "mediabox_height",
+ ]
+ available = [c for c in merge_cols if c in page_sizes_df.columns]
+ if "page" in available:
+ ps = pl.from_pandas(page_sizes_df[available].copy())
+ for c in [
+ "page",
+ "image_width",
+ "image_height",
+ "mediabox_width",
+ "mediabox_height",
+ ]:
+ if c in ps.columns:
+ # Cast page to Int64 so join key matches df_abs; cast sizes to Float64
+ dtype = pl.Int64 if c == "page" else pl.Float64
+ ps = ps.with_columns(pl.col(c).cast(dtype, strict=False))
+ df_abs = df_abs.join(ps, on="page", how="left")
+
+ if "mediabox_width" in df_abs.columns and "mediabox_height" in df_abs.columns:
+ if coordinates_in_pdf_points:
+ df_abs = df_abs.with_columns(
+ pl.col("mediabox_width").alias("image_width"),
+ pl.col("mediabox_height").alias("image_height"),
+ )
+ elif pages_in_pdf_points is not None:
+ # Normalize to int set so 1-based page matches (e.g. 1.0 -> 1)
+ _pdf_pts = {int(p) for p in pages_in_pdf_points}
+ use_mediabox = (
+ pl.col("page").cast(pl.Int64, strict=False).is_in(list(_pdf_pts))
+ )
+ img_w = pl.col("mediabox_width")
+ img_h = pl.col("mediabox_height")
+ if "image_width" in df_abs.columns:
+ img_w = pl.col("image_width").fill_null(pl.col("mediabox_width"))
+ if "image_height" in df_abs.columns:
+ img_h = pl.col("image_height").fill_null(pl.col("mediabox_height"))
+ # For pages_in_pdf_points always use mediabox so text-path coords (PDF points) divide correctly
+ df_abs = df_abs.with_columns(
+ pl.when(use_mediabox)
+ .then(pl.col("mediabox_width"))
+ .otherwise(img_w)
+ .alias("image_width"),
+ pl.when(use_mediabox)
+ .then(pl.col("mediabox_height"))
+ .otherwise(img_h)
+ .alias("image_height"),
+ )
+ # If join missed (nulls), fall back to mediabox for those pages so we don't divide by image pixels
+ df_abs = df_abs.with_columns(
+ pl.when(use_mediabox & pl.col("image_width").is_null())
+ .then(pl.col("mediabox_width"))
+ .otherwise(pl.col("image_width"))
+ .alias("image_width"),
+ pl.when(use_mediabox & pl.col("image_height").is_null())
+ .then(pl.col("mediabox_height"))
+ .otherwise(pl.col("image_height"))
+ .alias("image_height"),
+ )
+ elif "image_width" not in df_abs.columns:
+ df_abs = df_abs.with_columns(
+ pl.col("mediabox_width").alias("image_width"),
+ pl.col("mediabox_height").alias("image_height"),
+ )
+ else:
+ df_abs = df_abs.with_columns(
+ pl.col("image_width")
+ .fill_null(pl.col("mediabox_width"))
+ .alias("image_width"),
+ pl.col("image_height")
+ .fill_null(pl.col("mediabox_height"))
+ .alias("image_height"),
+ )
+
+ if "image_width" in df_abs.columns and "image_height" in df_abs.columns:
+ df_abs = df_abs.with_columns(
+ (pl.col(xmin) / pl.col("image_width")).round(6).alias(xmin),
+ (pl.col(xmax) / pl.col("image_width")).round(6).alias(xmax),
+ (pl.col(ymin) / pl.col("image_height")).round(6).alias(ymin),
+ (pl.col(ymax) / pl.col("image_height")).round(6).alias(ymax),
+ )
+ df_abs = df_abs.with_columns(
+ [
+ pl.when(pl.col(c).is_in([float("inf"), float("-inf")]))
+ .then(pl.lit(None).cast(pl.Float64))
+ .otherwise(pl.col(c))
+ .alias(c)
+ for c in coord_cols
+ ]
+ )
+ else:
+ print(
+ "Skipping coordinate division due to missing or non-numeric dimension columns."
+ )
+
+ if df_rel.is_empty() and df_abs.is_empty():
+ print(
+ "Warning: Both relative and absolute splits resulted in empty DataFrames."
+ )
+ return df_rel
+ # Drop dimension columns from df_abs so concat matches df_rel schema (Polars requires same width)
+ for c in ["image_width", "image_height", "mediabox_width", "mediabox_height"]:
+ if c in df_abs.columns:
+ df_abs = df_abs.drop(c)
+ out = pl.concat([df_rel, df_abs])
+
+ if not out.is_empty():
+ out = out.sort(["page", ymin, xmin], nulls_last=True)
+
+ # Clamp to [0,1] while preserving box dimensions.
+ # Cap ymax at 1 - 1e-6 so no box spans the full bottom (avoids single-char words with ymax=1).
+ _ymax_cap = 1.0 - 1e-6
+ out = out.with_columns(
+ pl.col(ymin).alias("_ymin_orig"),
+ pl.col(ymax).alias("_ymax_orig"),
+ pl.col(xmin).alias("_xmin_orig"),
+ pl.col(xmax).alias("_xmax_orig"),
+ )
+ out = out.with_columns(
+ pl.col(ymin).clip(0, float("inf")).alias(ymin),
+ pl.col(xmin).clip(0, float("inf")).alias(xmin),
+ pl.col(xmax).clip(float("-inf"), 1).alias(xmax),
+ pl.col(ymax).clip(float("-inf"), _ymax_cap).alias(ymax),
+ )
+ # Preserve height/width when clamping
+ out = out.with_columns(
+ pl.when(pl.col("_ymax_orig") > 1)
+ .then(
+ (pl.col(ymin) + (pl.col("_ymax_orig") - pl.col("_ymin_orig"))).clip(
+ float("-inf"), _ymax_cap
+ )
+ )
+ .otherwise(pl.col(ymax))
+ .alias(ymax),
+ pl.when(pl.col("_xmax_orig") > 1)
+ .then(
+ (pl.col(xmin) + (pl.col("_xmax_orig") - pl.col("_xmin_orig"))).clip(
+ float("-inf"), 1
+ )
+ )
+ .otherwise(pl.col(xmax))
+ .alias(xmax),
+ )
+ out = out.with_columns(
+ pl.when(pl.col("_ymin_orig") < 0)
+ .then(
+ (pl.col(ymax) - (pl.col("_ymax_orig") - pl.col("_ymin_orig"))).clip(
+ 0, float("inf")
+ )
+ )
+ .otherwise(pl.col(ymin))
+ .alias(ymin),
+ pl.when(pl.col("_xmin_orig") < 0)
+ .then(
+ (pl.col(xmax) - (pl.col("_xmax_orig") - pl.col("_xmin_orig"))).clip(
+ 0, float("inf")
+ )
+ )
+ .otherwise(pl.col(xmin))
+ .alias(xmin),
+ )
+ out = out.drop(["_ymin_orig", "_ymax_orig", "_xmin_orig", "_xmax_orig"])
+ out = out.with_columns(
+ [pl.col(c).round(6) for c in coord_cols if c in out.columns]
+ )
+
+ return out
+
+
+def divide_coordinates_by_page_sizes(
+ review_file_df: pd.DataFrame,
+ page_sizes_df: pd.DataFrame,
+ xmin="xmin",
+ xmax="xmax",
+ ymin="ymin",
+ ymax="ymax",
+ coordinates_in_pdf_points: bool = False,
+ pages_in_pdf_points: Optional[Set[int]] = None,
+) -> pd.DataFrame:
+ """
+ Optimized function to convert absolute image coordinates (>1) to relative coordinates (<=1).
+
+ Identifies rows with absolute coordinates, merges page size information,
+ divides coordinates by dimensions, and combines with already-relative rows.
+
+ Args:
+ review_file_df: Input DataFrame with potentially mixed coordinate systems.
+ page_sizes_df: DataFrame with page dimensions ('page', 'image_width',
+ 'image_height', 'mediabox_width', 'mediabox_height').
+ xmin, xmax, ymin, ymax: Names of the coordinate columns.
+ coordinates_in_pdf_points: If True, coordinates are in PDF space (points);
+ use mediabox_width/mediabox_height for division regardless of
+ image_width/image_height (e.g. when called from redact_text_pdf).
+ pages_in_pdf_points: If set, page numbers (1-based) whose coordinates are in PDF
+ points; all other pages use image dimensions. Used when EFFICIENT_OCR mixes
+ text-extracted pages (PDF points) and OCR pages (image pixels). Ignored if
+ coordinates_in_pdf_points is True for the whole dataframe.
+
+ Returns:
+ DataFrame with coordinates converted to relative system, sorted.
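+
+ Example (illustrative values only):
+ If page_sizes_df has page=1, image_width=1000, image_height=2000, a row with
+ xmin=100, xmax=200, ymin=400, ymax=500 becomes xmin=0.1, xmax=0.2, ymin=0.2,
+ ymax=0.25. Rows that are already relative (<= 1) are left unchanged.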
+ """
+ if review_file_df.empty or xmin not in review_file_df.columns:
+ return review_file_df
+
+ coord_cols = [xmin, xmax, ymin, ymax]
+ cols_to_convert = coord_cols + ["page"]
+ for col in cols_to_convert:
+ if col not in review_file_df.columns:
+ if col == "page" or col in coord_cols:
+ print(
+ f"Warning: Required column '{col}' not found in review_file_df. Returning original DataFrame."
+ )
+ return review_file_df
+
+ temp_pd = review_file_df.copy()
+ for col in cols_to_convert:
+ if col in temp_pd.columns:
+ temp_pd[col] = pd.to_numeric(temp_pd[col], errors="coerce")
+ for col in temp_pd.columns:
+ if col not in cols_to_convert and temp_pd[col].dtype == object:
+ temp_pd[col] = temp_pd[col].astype(str)
+ df = pl.from_pandas(temp_pd)
+ out = divide_coordinates_by_page_sizes_pl(
+ df,
+ page_sizes_df,
+ xmin=xmin,
+ xmax=xmax,
+ ymin=ymin,
+ ymax=ymax,
+ coordinates_in_pdf_points=coordinates_in_pdf_points,
+ pages_in_pdf_points=pages_in_pdf_points,
+ )
+ result = out.to_pandas()
+ if "page" in result.columns and not result.empty:
+ result["page"] = pd.to_numeric(result["page"], errors="coerce")
+ result["page"] = result["page"].astype("Int64")
+ for c in coord_cols:
+ if c in result.columns:
+ result[c] = result[c].astype(float)
+ return result
+
+
+def multiply_coordinates_by_page_sizes(
+ review_file_df: pd.DataFrame,
+ page_sizes_df: pd.DataFrame,
+ xmin="xmin",
+ xmax="xmax",
+ ymin="ymin",
+ ymax="ymax",
+):
+ """
+ Optimized function to convert relative coordinates to absolute based on page sizes.
+
+ Separates relative (<=1) and absolute (>1) coordinates, merges page sizes
+ for relative coordinates, calculates absolute pixel values, and recombines.
+ Implemented with Polars for performance; returns pandas DataFrame.
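+
+ Example (illustrative values only):
+ With page=1, image_width=1000, image_height=2000 in page_sizes_df, a row with
+ xmin=0.1, xmax=0.2, ymin=0.2, ymax=0.25 becomes xmin=100, xmax=200, ymin=400,
+ ymax=500. Rows that are already absolute (> 1) are passed through unchanged.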
+ """
+ if review_file_df.empty or xmin not in review_file_df.columns:
+ return review_file_df # Return early if empty or key column missing
+
+ coord_cols = [xmin, xmax, ymin, ymax]
+ df = pl.from_pandas(review_file_df)
+
+ # Cast coordinates and page to numeric (single with_columns for less overhead)
+ cast_cols = [c for c in coord_cols + ["page"] if c in df.columns]
+ if cast_cols:
+ df = df.with_columns(
+ [pl.col(c).cast(pl.Float64, strict=False) for c in cast_cols]
+ )
+
+ # Identify relative coordinates (all <= 1 and not null)
+ is_relative = (
+ pl.col(xmin).le(1)
+ & pl.col(xmin).is_not_nan()
+ & pl.col(xmax).le(1)
+ & pl.col(xmax).is_not_nan()
+ & pl.col(ymin).le(1)
+ & pl.col(ymin).is_not_nan()
+ & pl.col(ymax).le(1)
+ & pl.col(ymax).is_not_nan()
+ )
+ df_abs = df.filter(~is_relative)
+ df_rel = df.filter(is_relative)
+
+ if df_rel.is_empty():
+ if not df_abs.is_empty() and {"page", xmin, ymin}.issubset(df_abs.columns):
+ df_abs = df_abs.sort(["page", xmin, ymin], nulls_last=True)
+ result_early = df_abs.to_pandas()
+ for c in coord_cols:
+ if c in result_early.columns:
+ result_early[c] = result_early[c].astype(float)
+ return result_early
+
+ # Join page sizes for relative rows
+ if (
+ not page_sizes_df.empty
+ and "image_width" in page_sizes_df.columns
+ and "image_height" in page_sizes_df.columns
+ ):
+ ps = pl.from_pandas(
+ page_sizes_df[["page", "image_width", "image_height"]].copy()
+ )
+ ps = ps.with_columns(
+ pl.col("page").cast(pl.Float64, strict=False),
+ pl.col("image_width").cast(pl.Float64, strict=False),
+ pl.col("image_height").cast(pl.Float64, strict=False),
+ )
+ df_rel = df_rel.join(ps, on="page", how="left")
+
+ # Multiply coordinates where page dimensions exist (skip if the size columns are
+ # missing, e.g. when page_sizes_df was empty or lacked image dimensions)
+ if "image_width" in df_rel.columns and "image_height" in df_rel.columns:
+ has_size = (
+ pl.col("image_width").is_not_nan() & pl.col("image_height").is_not_nan()
+ )
+ df_rel = df_rel.with_columns(
+ [
+ pl.when(has_size)
+ .then((pl.col(xmin) * pl.col("image_width")).round(6))
+ .otherwise(pl.col(xmin))
+ .alias(xmin),
+ pl.when(has_size)
+ .then((pl.col(xmax) * pl.col("image_width")).round(6))
+ .otherwise(pl.col(xmax))
+ .alias(xmax),
+ pl.when(has_size)
+ .then((pl.col(ymin) * pl.col("image_height")).round(6))
+ .otherwise(pl.col(ymin))
+ .alias(ymin),
+ pl.when(has_size)
+ .then((pl.col(ymax) * pl.col("image_height")).round(6))
+ .otherwise(pl.col(ymax))
+ .alias(ymax),
+ ]
+ )
+ drop_cols = [
+ c for c in ["image_width", "image_height"] if c in df_rel.columns
+ ]
+ if drop_cols:
+ df_rel = df_rel.drop(drop_cols)
+
+ out = pl.concat([df_abs, df_rel])
+ out = out.sort(["page", xmin, ymin], nulls_last=True)
+ out = out.with_columns(
+ [
+ pl.col(c).clip(0, float("inf")).round(6)
+ for c in coord_cols
+ if c in out.columns
+ ]
+ )
+ result = out.to_pandas()
+ for c in coord_cols:
+ if c in result.columns:
+ result[c] = result[c].astype(float)
+ return result
+
+
+def do_proximity_match_by_page_for_text(df1: pd.DataFrame, df2: pd.DataFrame):
+ """
+ Match text from one dataframe to another based on proximity matching of coordinates page by page.
+ """
+
+ if "text" not in df2.columns:
+ df2["text"] = ""
+ if "text" not in df1.columns:
+ df1["text"] = ""
+
+ # Create a unique key based on coordinates and label for exact merge
+ merge_keys = ["xmin", "ymin", "xmax", "ymax", "label", "page"]
+ df1["key"] = df1[merge_keys].astype(str).agg("_".join, axis=1)
+ df2["key"] = df2[merge_keys].astype(str).agg("_".join, axis=1)
+
+ # Attempt exact merge first
+ merged_df = df1.merge(
+ df2[["key", "text"]], on="key", how="left", suffixes=("", "_duplicate")
+ )
+
+ # If a match is found, keep that text; otherwise, keep the original df1 text
+ merged_df["text"] = np.where(
+ merged_df["text"].isna() | (merged_df["text"] == ""),
+ merged_df.pop("text_duplicate"),
+ merged_df["text"],
+ )
+
+ # Define tolerance for proximity matching
+ tolerance = 0.02
+
+ # Precompute KDTree for each page in df2
+ page_trees = dict()
+ for page in df2["page"].unique():
+ df2_page = df2[df2["page"] == page]
+ coords = df2_page[["xmin", "ymin", "xmax", "ymax"]].values
+ if np.all(np.isfinite(coords)) and len(coords) > 0:
+ page_trees[page] = (cKDTree(coords), df2_page)
+
+ # Perform proximity matching
+ for i, row in df1.iterrows():
+ page_number = row["page"]
+
+ if page_number in page_trees:
+ tree, df2_page = page_trees[page_number]
+
+ # Query KDTree for nearest neighbor
+ dist, idx = tree.query(
+ [row[["xmin", "ymin", "xmax", "ymax"]].values],
+ distance_upper_bound=tolerance,
+ )
+
+ if dist[0] < tolerance and idx[0] < len(df2_page):
+ merged_df.at[i, "text"] = df2_page.iloc[idx[0]]["text"]
+
+ # Drop the temporary key column
+ merged_df.drop(columns=["key"], inplace=True)
+
+ return merged_df
+
+
+def do_proximity_match_all_pages_for_text(
+ df1: pd.DataFrame, df2: pd.DataFrame, threshold: float = 0.03
+):
+ """
+ Match text from one dataframe to another based on proximity matching of coordinates across all pages.
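+
+ Boxes are first matched exactly on (coordinates, label, page); any rows still
+ missing text are then matched to the nearest df2 box within 'threshold'
+ (relative coordinate units) using a KD-tree.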
+ """
+
+ if "text" not in df2.columns:
+ df2["text"] = ""
+ if "text" not in df1.columns:
+ df1["text"] = ""
+
+ for col in ["xmin", "ymin", "xmax", "ymax"]:
+ df1[col] = pd.to_numeric(df1[col], errors="coerce")
+
+ for col in ["xmin", "ymin", "xmax", "ymax"]:
+ df2[col] = pd.to_numeric(df2[col], errors="coerce")
+
+ # Create a unique key based on coordinates and label for exact merge
+ merge_keys = ["xmin", "ymin", "xmax", "ymax", "label", "page"]
+ df1["key"] = df1[merge_keys].astype(str).agg("_".join, axis=1)
+ df2["key"] = df2[merge_keys].astype(str).agg("_".join, axis=1)
+
+ # Attempt exact merge first, renaming df2['text'] to avoid suffixes
+ merged_df = df1.merge(
+ df2[["key", "text"]], on="key", how="left", suffixes=("", "_duplicate")
+ )
+
+ # If a match is found, keep that text; otherwise, keep the original df1 text
+ merged_df["text"] = np.where(
+ merged_df["text"].isna() | (merged_df["text"] == ""),
+ merged_df.pop("text_duplicate"),
+ merged_df["text"],
+ )
+
+ # Handle missing matches using a proximity-based approach
+ # Convert coordinates to numpy arrays for KDTree lookup
+
+ query_coords = np.array(df1[["xmin", "ymin", "xmax", "ymax"]].values, dtype=float)
+
+ # Check for NaN or infinite values in query_coords and filter them out, keeping the
+ # original row positions so matched text is written back to the correct rows
+ finite_mask = np.isfinite(query_coords).all(axis=1)
+ query_row_positions = np.where(finite_mask)[0]
+ if not finite_mask.all():
+ query_coords = query_coords[finite_mask]
+
+ # Proceed only if query_coords is not empty
+ if query_coords.size > 0:
+ # Ensure df2 is filtered for finite values before creating the KDTree
+ finite_mask_df2 = np.isfinite(df2[["xmin", "ymin", "xmax", "ymax"]].values).all(
+ axis=1
+ )
+ df2_finite = df2[finite_mask_df2]
+
+ # Create the KDTree with the filtered data
+ tree = cKDTree(df2_finite[["xmin", "ymin", "xmax", "ymax"]].values)
+
+ # Find nearest neighbors in df2 within the given tolerance (a fraction of the page size)
+ tolerance = threshold
+ distances, indices = tree.query(query_coords, distance_upper_bound=tolerance)
+
+ # Assign text values where matches are found, mapping back to the original row positions
+ for i, (dist, idx) in enumerate(zip(distances, indices)):
+ if dist < tolerance and idx < len(df2_finite):
+ merged_df.at[query_row_positions[i], "text"] = df2_finite.iloc[idx]["text"]
+
+ # Drop the temporary key column
+ merged_df.drop(columns=["key"], inplace=True)
+
+ return merged_df
+
+
+def _extract_page_number(image_path: Any) -> int:
+ """Helper function to safely extract page number."""
+ if not isinstance(image_path, str):
+ return 1
+ match = IMAGE_NUM_REGEX.search(image_path)
+ if match:
+ try:
+ return int(match.group(1)) + 1
+ except (ValueError, TypeError):
+ return 1
+ return 1
+
+
+def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
+ """
+ Convert annotation list to DataFrame using Polars for performance.
+ Returns a pandas DataFrame with columns image, page, label, color, xmin, xmax, ymin, ymax, text, id.
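+
+ Example (illustrative only):
+ [{"image": ".../file_0.png", "boxes": [{"label": "NAME", "xmin": 0.1,
+ "xmax": 0.2, "ymin": 0.3, "ymax": 0.4, "text": "John"}]}]
+ produces one row per box with the image path, a page number derived from the
+ image filename (or the box's own 'page' value if present), label, colour,
+ coordinates and text. Zero-area or coordinate-less boxes are skipped.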
+ """
+ if not all_annotations:
+ print("No annotations found, returning empty dataframe")
+ return pd.DataFrame(
+ columns=[
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "text",
+ "id",
+ ]
+ )
+
+ records = []
+ for anno in all_annotations:
+ image = anno.get("image")
+ page_from_image = _extract_page_number(image)
+ boxes = anno.get("boxes")
+ if not isinstance(boxes, list):
+ boxes = [boxes] if isinstance(boxes, dict) else []
+ # Do not add a placeholder box when boxes is empty; that created blank annotations
+ # in review_file_state when changing page or saving.
+ for box in boxes:
+ if isinstance(box, dict):
+ # Skip blank/zero-area boxes (e.g. from image_annotator with 0,0,0,0 or None).
+ def _num(v):
+ if v is None:
+ return None
+ try:
+ return float(v)
+ except (TypeError, ValueError):
+ return None
+
+ xmin, ymin, xmax, ymax = (
+ _num(box.get("xmin")),
+ _num(box.get("ymin")),
+ _num(box.get("xmax")),
+ _num(box.get("ymax")),
+ )
+ if xmin is None and ymin is None and xmax is None and ymax is None:
+ continue
+ if (xmin or 0) == (xmax or 0) and (ymin or 0) == (ymax or 0):
+ continue
+ if (xmin or 0) >= (xmax or 0) or (ymin or 0) >= (ymax or 0):
+ continue
+
+ # Use per-box page when present (e.g. text-path with empty image so all don't become page 1).
+ # Reject 0 or negative (UI/state use 1-based pages); fall back to page_from_image.
+ box_page = box.get("page")
+ if box_page is not None:
+ try:
+ p = int(float(box_page))
+ page = p if p >= 1 else page_from_image
+ except (TypeError, ValueError):
+ page = page_from_image
+ else:
+ page = page_from_image
+ row = {"image": image, "page": page}
+ for k, v in box.items():
+ if k != "page" and k != "image":
+ # Normalise colour to list so Polars gets a consistent schema
+ # (some boxes have color as list [r,g,b], tuple, or string)
+ if k == "color" and v is not None:
+ if isinstance(v, (list, tuple)) and len(v) >= 3:
+ v = [int(float(x)) for x in v[:3]]
+ elif isinstance(v, str):
+ s = v.strip("()").replace(" ", "")
+ # e.g. "(128,128,128)" or "128,128,128"
+ parts = s.split(",")
+ if len(parts) >= 3:
+ v = [int(float(p)) for p in parts[:3]]
+ elif s.startswith("#") and len(s) in (4, 7):
+ # Hex #rgb or #rrggbb (from gradio_image_annotation label_colors)
+ hex_s = s[1:]
+ if len(hex_s) == 3:
+ v = [
+ int(hex_s[i : i + 1] * 2, 16)
+ for i in (0, 1, 2)
+ ]
+ else:
+ v = [
+ int(hex_s[i : i + 2], 16) for i in (0, 2, 4)
+ ]
+ else:
+ v = [0, 0, 0]
+ else:
+ v = [0, 0, 0]
+ elif k == "color" and v is None:
+ v = [0, 0, 0]
+ if k == "color":
+ # Store as string "(r, g, b)" so column survives Polars/pandas
+ # round-trip (list columns can be lost or corrupted)
+ v = (
+ f"({int(v[0])}, {int(v[1])}, {int(v[2])})"
+ if isinstance(v, (list, tuple)) and len(v) >= 3
+ else "(0, 0, 0)"
+ )
+ row[k] = v
+ if "color" not in row:
+ row["color"] = "(0, 0, 0)"
+ records.append(row)
+
+ if not records:
+ return pd.DataFrame(
+ columns=[
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "text",
+ "id",
+ ]
+ )
+
+ df = pl.from_dicts(records)
+ essential_box_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]
+ for col in essential_box_cols:
+ if col not in df.columns:
+ df = df.with_columns(pl.lit(None).alias(col))
+ # Drop rows where all of the essential box fields are null (matches pandas dropna(..., how="all"))
+ null_mask = pl.all_horizontal(
+ [pl.col(c).is_null() for c in essential_box_cols if c in df.columns]
+ )
+ df = df.filter(~null_mask)
+ base_cols = ["image"]
+ extra_box_cols = sorted(
+ [c for c in df.columns if c not in base_cols and c not in essential_box_cols]
+ )
+ final_col_order = base_cols + essential_box_cols + extra_box_cols
+ final_col_order = [c for c in final_col_order if c in df.columns]
+ df = df.select(final_col_order)
+ return df.to_pandas()
+
+
+def create_annotation_dicts_from_annotation_df(
+ all_image_annotations_df: pd.DataFrame, page_sizes: List[Dict[str, Any]]
+) -> List[Dict[str, Any]]:
+ """
+ Convert annotation DataFrame back to list of dicts using dictionary lookup.
+ Ensures all images from page_sizes are present without duplicates.
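+
+ Example (illustrative only):
+ A row with image=".../file_0.png", xmin=0.1, ymin=0.3, xmax=0.2, ymax=0.4,
+ label="NAME" becomes {"image": ".../file_0.png", "boxes": [{"xmin": 0.1,
+ "ymin": 0.3, "xmax": 0.2, "ymax": 0.4, "label": "NAME", ...}]}.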
+ """
+ # 1. Create a dictionary keyed by image path for efficient lookup & update
+ # Initialize with all images from page_sizes. Use .get for safety.
+ image_dict: Dict[str, Dict[str, Any]] = dict()
+ for item in page_sizes:
+ image_path = item.get("image_path")
+ if image_path: # Only process if image_path exists and is not None/empty
+ image_dict[image_path] = {"image": image_path, "boxes": []}
+
+ # Check if the DataFrame is empty or lacks necessary columns
+ if (
+ all_image_annotations_df.empty
+ or "image" not in all_image_annotations_df.columns
+ ):
+ # print("Warning: Annotation DataFrame is empty or missing 'image' column.")
+ return list(image_dict.values()) # Return based on page_sizes only
+
+ # 2. Define columns to extract for boxes and check availability
+ # Make sure these columns actually exist in the DataFrame
+ box_cols = ["xmin", "ymin", "xmax", "ymax", "color", "label", "text", "id"]
+ available_cols = [
+ col for col in box_cols if col in all_image_annotations_df.columns
+ ]
+
+ if "text" in all_image_annotations_df.columns:
+ all_image_annotations_df["text"] = all_image_annotations_df["text"].fillna("")
+ # all_image_annotations_df.loc[all_image_annotations_df['text'].isnull(), 'text'] = ''
+
+ if not available_cols:
+ print(
+ f"Warning: None of the expected box columns ({box_cols}) found in DataFrame."
+ )
+ return list(image_dict.values()) # Return based on page_sizes only
+
+ # 3. Group the DataFrame by image and update the dictionary
+ coord_cols = ["xmin", "ymin", "xmax", "ymax"]
+ valid_box_df = all_image_annotations_df.dropna(
+ subset=[col for col in coord_cols if col in available_cols]
+ ).copy()
+
+ if valid_box_df.empty:
+ print(
+ "Warning: No valid annotation rows found in DataFrame after dropping NA coordinates."
+ )
+ return list(image_dict.values())
+
+ # Ensure every image path in the dataframe has an entry (e.g. EFFICIENT_OCR text-path
+ # pages may use a different path in annotations than in page_sizes, so boxes would be dropped).
+ for image_path in valid_box_df["image"].unique():
+ if image_path and image_path not in image_dict:
+ image_dict[image_path] = {"image": image_path, "boxes": []}
+
+ # Build list of (image_path, group) for all images in the dataframe
+ group_items = [
+ (image_path, group)
+ for image_path, group in valid_box_df.groupby(
+ "image", observed=True, sort=False
+ )
+ ]
+
+ if group_items:
+ max_workers = min(MAX_WORKERS, len(group_items))
+
+ def _boxes_for_group(item):
+ _image_path, _group = item
+ boxes = _group[available_cols].to_dict(orient="records")
+ return (_image_path, boxes)
+
+ try:
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ for image_path, boxes in executor.map(_boxes_for_group, group_items):
+ image_dict[image_path]["boxes"] = boxes
+ except KeyError:
+ print("Error: Issue grouping DataFrame by 'image'.")
+ return list(image_dict.values())
+
+ return list(image_dict.values())
+
+
+def convert_annotation_json_to_review_df(
+ all_annotations: List[dict],
+ redaction_decision_output: pd.DataFrame = pd.DataFrame(),
+ page_sizes: List[dict] = list(),
+ do_proximity_match: bool = True,
+ prebuilt_df: Optional[pd.DataFrame] = None,
+) -> pd.DataFrame:
+ """
+ Convert the annotation json data to a dataframe format.
+ Add on any text from the initial review_file dataframe by joining based on 'id' if available
+ in both sources, otherwise falling back to joining on pages/co-ordinates (if option selected).
+
+ Refactored for improved efficiency, prioritizing ID-based join and conditionally applying
+ coordinate division and proximity matching.
+
+ When prebuilt_df is provided (e.g. from chunked parallel build), it is used as the initial
+ DataFrame and the annotation-to-DataFrame conversion is skipped.
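+
+ Example (illustrative only; 'decision_df' and 'page_sizes' are assumed to come
+ from an earlier preparation/redaction step):
+ review_df = convert_annotation_json_to_review_df(
+ all_annotations, redaction_decision_output=decision_df, page_sizes=page_sizes
+ )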
+ """
+
+ # 1. Convert annotations to DataFrame (or use prebuilt from chunked parallel build)
+ if prebuilt_df is not None:
+ review_file_df = prebuilt_df.copy()
+ else:
+ review_file_df = convert_annotation_data_to_dataframe(
+ all_annotations if all_annotations else []
+ )
+
+ # Only keep rows in review_df where there are coordinates (assuming xmin is representative)
+ # Use .notna() for robustness with potential None or NaN values
+ review_file_df.dropna(
+ subset=["xmin", "ymin", "xmax", "ymax"], how="any", inplace=True
+ )
+
+ # Drop blank/zero-area annotations (e.g. image_annotator sometimes sends 0,0,0,0 boxes)
+ if not review_file_df.empty and all(
+ c in review_file_df.columns for c in ["xmin", "ymin", "xmax", "ymax"]
+ ):
+ xmin, ymin, xmax, ymax = (
+ pd.to_numeric(review_file_df["xmin"], errors="coerce"),
+ pd.to_numeric(review_file_df["ymin"], errors="coerce"),
+ pd.to_numeric(review_file_df["xmax"], errors="coerce"),
+ pd.to_numeric(review_file_df["ymax"], errors="coerce"),
+ )
+ zero_area = (xmin >= xmax) | (ymin >= ymax)
+ review_file_df = review_file_df.loc[~zero_area]
+
+ # Exit early if the initial conversion results in an empty DataFrame
+ if review_file_df.empty:
+ # Define standard columns for an empty return DataFrame
+ # Ensure 'id' is included if it was potentially expected based on input structure
+ # We don't know the columns from convert_annotation_data_to_dataframe without seeing it,
+ # but let's assume a standard set and add 'id' if it appeared.
+ standard_cols = [
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "text",
+ ]
+ if "id" in review_file_df.columns:
+ standard_cols.append("id")
+ return pd.DataFrame(columns=standard_cols)
+
+ # Ensure 'id' column exists for logic flow, even if empty
+ if "id" not in review_file_df.columns:
+ review_file_df["id"] = ""
+ # Do the same for redaction_decision_output if it's not empty
+ if (
+ not redaction_decision_output.empty
+ and "id" not in redaction_decision_output.columns
+ ):
+ redaction_decision_output["id"] = ""
+
+ # 2. Process page sizes if provided - needed potentially for coordinate division later
+ # Process this once upfront if the data is available
+ page_sizes_df = pd.DataFrame() # Initialize as empty
+ if page_sizes:
+ page_sizes_df = pd.DataFrame(page_sizes)
+ if not page_sizes_df.empty:
+ # Safely convert page column to numeric and then int
+ page_sizes_df["page"] = pd.to_numeric(
+ page_sizes_df["page"], errors="coerce"
+ )
+ page_sizes_df.dropna(subset=["page"], inplace=True)
+ if not page_sizes_df.empty: # Check again after dropping NaNs
+ page_sizes_df["page"] = page_sizes_df["page"].astype(int)
+ else:
+ print(
+ "Warning: Page sizes DataFrame became empty after processing, coordinate division will be skipped."
+ )
+
+ # 3. Join additional data from redaction_decision_output if provided
+ text_added_successfully = False # Flag to track if text was added by any method
+
+ if not redaction_decision_output.empty:
+ # --- Attempt to join data based on 'id' column first ---
+
+ # Check if 'id' columns are present and have non-null values in *both* dataframes
+ id_col_exists_in_review = (
+ "id" in review_file_df.columns
+ and not review_file_df["id"].isnull().all()
+ and not (review_file_df["id"] == "").all()
+ )
+ id_col_exists_in_redaction = (
+ "id" in redaction_decision_output.columns
+ and not redaction_decision_output["id"].isnull().all()
+ and not (redaction_decision_output["id"] == "").all()
+ )
+
+ if id_col_exists_in_review and id_col_exists_in_redaction:
+ # print("Attempting to join data based on 'id' column.")
+ try:
+ # Ensure 'id' columns are of string type for robust merging
+ review_file_df["id"] = review_file_df["id"].astype(str)
+ # Make a copy if needed, but try to avoid if redaction_decision_output isn't modified later
+ # Let's use a copy for safety as in the original code
+ redaction_copy = redaction_decision_output.copy()
+ redaction_copy["id"] = redaction_copy["id"].astype(str)
+
+ # Select columns to merge from redaction output. Prioritize 'text'.
+ cols_to_merge = ["id"]
+ if "text" in redaction_copy.columns:
+ cols_to_merge.append("text")
+ else:
+ print(
+ "Warning: 'text' column not found in redaction_decision_output. Cannot merge text using 'id'."
+ )
+
+ # Perform a left merge to keep all annotations and add matching text
+ # Use a suffix for the text column from the right DataFrame
+ original_text_col_exists = "text" in review_file_df.columns
+ merge_suffix = "_redaction" if original_text_col_exists else ""
+
+ merged_df = pd.merge(
+ review_file_df,
+ redaction_copy[cols_to_merge],
+ on="id",
+ how="left",
+ suffixes=("", merge_suffix),
+ )
+
+ # Update the 'text' column if a new one was brought in
+ if "text" + merge_suffix in merged_df.columns:
+ redaction_text_col = "text" + merge_suffix
+ if original_text_col_exists:
+ # Combine: Use text from redaction where available, otherwise keep original
+ merged_df["text"] = merged_df[redaction_text_col].combine_first(
+ merged_df["text"]
+ )
+ # Drop the temporary column
+ merged_df = merged_df.drop(columns=[redaction_text_col])
+ else:
+ # Redaction output had text, but review_file_df didn't. Rename the new column.
+ merged_df = merged_df.rename(
+ columns={redaction_text_col: "text"}
+ )
+
+ text_added_successfully = (
+ True # Indicate text was potentially added
+ )
+
+ review_file_df = merged_df # Update the main DataFrame
+
+ # print("Successfully attempted to join data using 'id'.") # Note: Text might not have been in redaction data
+
+ except Exception as e:
+ print(
+ f"Error during 'id'-based merge: {e}. Checking for proximity match fallback."
+ )
+ # Fall through to proximity match logic below
+
+        # --- Fallback to proximity match if the ID join wasn't possible/successful and it is enabled ---
+        # Note: if id_col_exists_in_review or id_col_exists_in_redaction was False,
+        # the block above was skipped and we fall through to here.
+        # If an error occurred in the try block, text_added_successfully remains False,
+        # so the proximity match below is still attempted.
+
+ # Only attempt proximity match if text wasn't added by ID join and proximity is requested
+ if not text_added_successfully and do_proximity_match:
+ # print("Attempting proximity match to add text data.")
+
+ # Ensure 'page' columns are numeric before coordinate division and proximity match
+ # (Assuming divide_coordinates_by_page_sizes and do_proximity_match_all_pages_for_text need this)
+ if "page" in review_file_df.columns:
+ review_file_df["page"] = (
+ pd.to_numeric(review_file_df["page"], errors="coerce")
+ .fillna(-1)
+ .astype(int)
+ ) # Use -1 for NaN pages
+ review_file_df = review_file_df[
+ review_file_df["page"] != -1
+ ] # Drop rows where page conversion failed
+ if (
+ not redaction_decision_output.empty
+ and "page" in redaction_decision_output.columns
+ ):
+ redaction_decision_output["page"] = (
+ pd.to_numeric(redaction_decision_output["page"], errors="coerce")
+ .fillna(-1)
+ .astype(int)
+ )
+ redaction_decision_output = redaction_decision_output[
+ redaction_decision_output["page"] != -1
+ ]
+
+ # Perform coordinate division IF page_sizes were processed and DataFrame is not empty
+ if not page_sizes_df.empty:
+ # Apply coordinate division *before* proximity match
+ review_file_df = divide_coordinates_by_page_sizes(
+ review_file_df, page_sizes_df
+ )
+ if not redaction_decision_output.empty:
+ redaction_decision_output = divide_coordinates_by_page_sizes(
+ redaction_decision_output, page_sizes_df
+ )
+
+ # Now perform the proximity match
+ # Note: Potential DataFrame copies happen inside do_proximity_match based on its implementation
+ if not redaction_decision_output.empty:
+ try:
+ review_file_df = do_proximity_match_all_pages_for_text(
+ df1=review_file_df, # Pass directly, avoid caller copy if possible by modifying function signature
+ df2=redaction_decision_output, # Pass directly
+ )
+ # Assuming do_proximity_match_all_pages_for_text adds/updates the 'text' column
+ if "text" in review_file_df.columns:
+ text_added_successfully = True
+ # print("Proximity match completed.")
+ except Exception as e:
+ print(
+ f"Error during proximity match: {e}. Text data may not be added."
+ )
+
+ elif not text_added_successfully and not do_proximity_match:
+ print(
+ "Skipping joining text data (ID join not possible/failed, proximity match disabled)."
+ )
+
+ # 4. Ensure required columns exist and are ordered
+ # Define base required columns. 'id' and 'text' are conditionally added.
+ required_columns_base = [
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ ]
+ final_columns = required_columns_base[:] # Start with base columns
+
+ # Add 'id' and 'text' if they exist in the DataFrame at this point
+ if "id" in review_file_df.columns:
+ final_columns.append("id")
+ if "text" in review_file_df.columns:
+ final_columns.append("text") # Add text column if it was created/merged
+
+    # Add any missing required columns with a default value (a blank string)
+    for col in final_columns:
+        if col not in review_file_df.columns:
+            # A blank string is used as the default for simplicity; coordinate columns
+            # were already checked by an earlier dropna, so NaN handling is not needed here.
+            review_file_df[col] = ""
+
+ # Select and order the final set of columns
+ # Ensure all selected columns actually exist after adding defaults
+ review_file_df = review_file_df[
+ [col for col in final_columns if col in review_file_df.columns]
+ ]
+
+ # 5. Final processing and sorting
+ # Convert colours from list to tuple if necessary - apply is okay here unless lists are vast
+ if "color" in review_file_df.columns:
+ # Check if the column actually contains lists before applying lambda
+ if review_file_df["color"].apply(lambda x: isinstance(x, list)).any():
+ review_file_df.loc[:, "color"] = review_file_df.loc[:, "color"].apply(
+ lambda x: tuple(x) if isinstance(x, list) else x
+ )
+
+ # Sort the results
+ # Ensure sort columns exist before sorting
+ sort_columns = ["page", "ymin", "xmin", "label"]
+ valid_sort_columns = [col for col in sort_columns if col in review_file_df.columns]
+ if valid_sort_columns and not review_file_df.empty: # Only sort non-empty df
+ # Convert potential numeric sort columns to appropriate types if necessary
+ # (e.g., 'page', 'ymin', 'xmin') to ensure correct sorting.
+ # dropna(subset=[...], inplace=True) earlier should handle NaNs in coords.
+ # page conversion already done before proximity match.
+ try:
+ review_file_df = review_file_df.sort_values(valid_sort_columns)
+ except TypeError as e:
+ print(
+ f"Warning: Could not sort DataFrame due to type error in sort columns: {e}"
+ )
+ # Proceed without sorting
+
+ base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]
+
+ for col in base_cols:
+ if col not in review_file_df.columns:
+ review_file_df[col] = pd.NA
+
+ review_file_df = review_file_df.dropna(subset=base_cols, how="all")
+
+ return review_file_df
+
+
+def fill_missing_ids_in_list(data_list: list) -> list:
+ """
+ Generates unique alphanumeric IDs for dictionaries in a list where the 'id' is
+ missing, blank, or not a 12-character string.
+
+ Args:
+ data_list (list): A list of dictionaries, each potentially with an 'id' key.
+
+ Returns:
+ list: The input list with missing/invalid IDs filled.
+ Note: The function modifies the input list in place.
+ """
+
+ # --- Input Validation ---
+ if not isinstance(data_list, list):
+ raise TypeError("Input 'data_list' must be a list.")
+
+ if not data_list:
+ return data_list # Return empty list as-is
+
+ id_length = 12
+ character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
+
+ # --- Get Existing IDs to Ensure Uniqueness ---
+ # Collect all valid existing IDs first
+ existing_ids = set()
+ for item in data_list:
+ if not isinstance(item, dict):
+ continue # Skip non-dictionary items
+ item_id = item.get("id")
+ if isinstance(item_id, str) and len(item_id) == id_length:
+ existing_ids.add(item_id)
+
+ # --- Identify and Fill Items Needing IDs ---
+ generated_ids_set = set() # Keep track of IDs generated *in this run*
+ num_filled = 0
+
+ for item in data_list:
+ if not isinstance(item, dict):
+ continue # Skip non-dictionary items
+
+ item_id = item.get("id")
+
+ # Check if ID needs to be generated
+ # Needs ID if: key is missing, value is None, value is not a string,
+ # value is an empty string after stripping whitespace, or value is a string
+ # but not of the correct length.
+ needs_new_id = (
+ item_id is None
+ or not isinstance(item_id, str)
+ or item_id.strip() == ""
+ or len(item_id) != id_length
+ )
+
+ if needs_new_id:
+ # Generate a unique ID
+ attempts = 0
+ while True:
+ candidate_id = "".join(random.choices(character_set, k=id_length))
+ # Check against *all* existing valid IDs and *newly* generated ones in this run
+ if (
+ candidate_id not in existing_ids
+ and candidate_id not in generated_ids_set
+ ):
+ generated_ids_set.add(candidate_id)
+ item["id"] = (
+ candidate_id # Assign the new ID directly to the item dict
+ )
+ num_filled += 1
+ break # Found a unique ID
+ attempts += 1
+ # Safety break for unlikely infinite loop (though highly improbable with 12 chars)
+ if attempts > len(data_list) * 100 + 1000:
+ raise RuntimeError(
+ f"Failed to generate a unique ID after {attempts} attempts. Check ID length or existing IDs."
+ )
+
+    # Optional debug output:
+    # if num_filled > 0:
+    #     print(f"Successfully filled {num_filled} missing or invalid IDs.")
+    # else:
+    #     print("No missing or invalid IDs found.")
+
+ # The input list 'data_list' has been modified in place
+ return data_list
+
+
+def fill_missing_box_ids(data_input: dict) -> dict:
+ """
+ Generates unique alphanumeric IDs for bounding boxes in an input dictionary
+ where the 'id' is missing, blank, or not a 12-character string.
+
+ Args:
+ data_input (dict): The input dictionary containing 'image' and 'boxes' keys.
+ 'boxes' should be a list of dictionaries, each potentially
+ with an 'id' key.
+
+ Returns:
+ dict: The input dictionary with missing/invalid box IDs filled.
+ Note: The function modifies the input dictionary in place.
+ """
+
+ # --- Input Validation ---
+ if not isinstance(data_input, dict):
+ raise TypeError("Input 'data_input' must be a dictionary.")
+
+    boxes = data_input  # a single box dictionary is processed directly
+ id_length = 12
+ character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
+
+ # --- Get Existing IDs to Ensure Uniqueness ---
+ # Collect all valid existing IDs first
+ existing_ids = set()
+    # Check if 'id' exists, is a string, and is the correct length
+ box_id = boxes.get("id")
+ if isinstance(box_id, str) and len(box_id) == id_length:
+ existing_ids.add(box_id)
+
+ # --- Identify and Fill Rows Needing IDs ---
+ generated_ids_set = set() # Keep track of IDs generated *in this run*
+ num_filled = 0
+
+ box_id = boxes.get("id")
+
+ # Check if ID needs to be generated
+ # Needs ID if: key is missing, value is None, value is not a string,
+ # value is an empty string after stripping whitespace, or value is a string
+ # but not of the correct length.
+ needs_new_id = (
+ box_id is None
+ or not isinstance(box_id, str)
+ or box_id.strip() == ""
+ or len(box_id) != id_length
+ )
+
+ if needs_new_id:
+ # Generate a unique ID
+ attempts = 0
+ while True:
+ candidate_id = "".join(random.choices(character_set, k=id_length))
+ # Check against *all* existing valid IDs and *newly* generated ones in this run
+ if (
+ candidate_id not in existing_ids
+ and candidate_id not in generated_ids_set
+ ):
+ generated_ids_set.add(candidate_id)
+ boxes["id"] = candidate_id # Assign the new ID directly to the box dict
+ num_filled += 1
+ break # Found a unique ID
+ attempts += 1
+ # Safety break for unlikely infinite loop (though highly improbable with 12 chars)
+ if attempts > len(boxes) * 100 + 1000:
+ raise RuntimeError(
+ f"Failed to generate a unique ID after {attempts} attempts. Check ID length or existing IDs."
+ )
+
+    # Optional debug output:
+    # if num_filled > 0:
+    #     print(f"Successfully filled {num_filled} missing or invalid box IDs.")
+    # else:
+    #     print("No missing or invalid box IDs found.")
+
+ # The input dictionary 'data_input' has been modified in place
+ return data_input
+
+
+def fill_missing_box_ids_each_box(data_input: Dict) -> Dict:
+ """
+ Generates unique alphanumeric IDs for bounding boxes in a list
+ where the 'id' is missing, blank, or not a 12-character string.
+
+ Args:
+ data_input (Dict): The input dictionary containing 'image' and 'boxes' keys.
+ 'boxes' should be a list of dictionaries, each potentially
+ with an 'id' key.
+
+ Returns:
+ Dict: The input dictionary with missing/invalid box IDs filled.
+ Note: The function modifies the input dictionary in place.
+ """
+ # --- Input Validation ---
+ if not isinstance(data_input, dict):
+ raise TypeError("Input 'data_input' must be a dictionary.")
+ if "boxes" not in data_input or not isinstance(data_input.get("boxes"), list):
+ # If there are no boxes, there's nothing to do.
+ return data_input
+
+ boxes_list = data_input["boxes"]
+ id_length = 12
+ character_set = string.ascii_letters + string.digits
+
+ # --- 1. Get ALL Existing IDs to Ensure Uniqueness ---
+ # Collect all valid existing IDs from the entire list first.
+ existing_ids = set()
+ for box in boxes_list:
+ if isinstance(box, dict):
+ box_id = box.get("id")
+ if isinstance(box_id, str) and len(box_id) == id_length:
+ existing_ids.add(box_id)
+
+ # --- 2. Iterate and Fill IDs for each box ---
+ generated_ids_this_run = set() # Keep track of IDs generated in this run
+ num_filled = 0
+
+ for box in boxes_list:
+ if not isinstance(box, dict):
+ continue # Skip items in the list that are not dictionaries
+
+ box_id = box.get("id")
+
+ # Check if this specific box needs a new ID
+ needs_new_id = (
+ box_id is None
+ or not isinstance(box_id, str)
+ or box_id.strip() == ""
+ or len(box_id) != id_length
+ )
+
+ if needs_new_id:
+ # Generate a truly unique ID
+ while True:
+ candidate_id = "".join(random.choices(character_set, k=id_length))
+ # Check against original IDs and newly generated IDs
+ if (
+ candidate_id not in existing_ids
+ and candidate_id not in generated_ids_this_run
+ ):
+ generated_ids_this_run.add(candidate_id)
+ box["id"] = candidate_id # Assign the ID to the individual box
+ num_filled += 1
+ break # Move to the next box
+
+    # Optional debug output:
+    # if num_filled > 0:
+    #     print(f"Successfully filled {num_filled} missing or invalid box IDs.")
+
+ # The input dictionary 'data_input' has been modified in place
+ return data_input
+
+
+def fill_missing_ids(
+ df: pd.DataFrame, column_name: str = "id", length: int = 12
+) -> pd.DataFrame:
+ """
+ Optimized: Generates unique alphanumeric IDs for rows in a DataFrame column
+ where the value is missing (NaN, None) or an empty/whitespace string.
+
+ Args:
+ df (pd.DataFrame): The input Pandas DataFrame.
+ column_name (str): The name of the column to check and fill (defaults to 'id').
+ This column will be added if it doesn't exist.
+ length (int): The desired length of the generated IDs (defaults to 12).
+
+ Returns:
+ pd.DataFrame: The DataFrame with missing/empty IDs filled in the specified column.
+ Note: The function modifies the DataFrame directly (in-place).
+ """
+
+ # --- Input Validation ---
+ if not isinstance(df, pd.DataFrame):
+ raise TypeError("Input 'df' must be a Pandas DataFrame.")
+ if not isinstance(column_name, str) or not column_name:
+ raise ValueError("'column_name' must be a non-empty string.")
+ if not isinstance(length, int) or length <= 0:
+ raise ValueError("'length' must be a positive integer.")
+
+ # --- Ensure Column Exists ---
+ original_dtype = None
+ if column_name not in df.columns:
+ # print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
+ # Initialize with None (which Pandas often treats as NaN but allows object dtype)
+ df[column_name] = None
+ # Set original_dtype to object so it likely becomes string later
+ original_dtype = object
+ else:
+ original_dtype = df[column_name].dtype
+
+ # --- Identify Rows Needing IDs ---
+ # 1. Check for actual null values (NaN, None, NaT)
+ is_null = df[column_name].isna()
+
+ # 2. Check for empty or whitespace-only strings AFTER converting potential values to string
+ # Only apply string checks on rows that are *not* null to avoid errors/warnings
+ # Fill NaN temporarily for string operations, then check length or equality
+ is_empty_str = pd.Series(False, index=df.index) # Default to False
+ if not is_null.all(): # Only check strings if there are non-null values
+ temp_str_col = df.loc[~is_null, column_name].astype(str).str.strip()
+ is_empty_str.loc[~is_null] = temp_str_col == ""
+
+ # Combine the conditions
+ is_missing_or_empty = is_null | is_empty_str
+
+ rows_to_fill_index = df.index[is_missing_or_empty]
+ num_needed = len(rows_to_fill_index)
+
+    if num_needed == 0:
+        # Nothing to fill; return the DataFrame unchanged (the column dtype is left as-is).
+        # print(f"No missing or empty values found requiring IDs in column '{column_name}'.")
+        return df
+
+ # print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")
+
+ # --- Get Existing IDs to Ensure Uniqueness ---
+ # Consider only rows that are *not* missing/empty
+ valid_rows = df.loc[~is_missing_or_empty, column_name]
+ # Drop any remaining nulls (shouldn't be any based on mask, but belts and braces)
+ valid_rows = valid_rows.dropna()
+    # Convert to string (astype(str) handles mixed types in object columns), strip whitespace, and collect as a set
+    existing_ids = set(valid_rows.astype(str).str.strip())
+
+ # Remove empty string from existing IDs if it's there after stripping
+ existing_ids.discard("")
+
+ # --- Generate Unique IDs ---
+ character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
+ generated_ids_set = set() # Keep track of IDs generated *in this run*
+ new_ids_list = list() # Store the generated IDs in order
+
+ max_possible_ids = len(character_set) ** length
+ if num_needed > max_possible_ids:
+ raise ValueError(
+ f"Cannot generate {num_needed} unique IDs with length {length}. Maximum possible is {max_possible_ids}."
+ )
+
+ # Pre-calculate safety break limit
+ max_attempts_per_id = max(1000, num_needed * 10) # Adjust multiplier as needed
+
+ # print(f"Generating {num_needed} unique IDs of length {length}...")
+ for i in range(num_needed):
+ attempts = 0
+ while True:
+ candidate_id = "".join(random.choices(character_set, k=length))
+ # Check against *all* known existing IDs and *newly* generated ones
+ if (
+ candidate_id not in existing_ids
+ and candidate_id not in generated_ids_set
+ ):
+ generated_ids_set.add(candidate_id)
+ new_ids_list.append(candidate_id)
+ break # Found a unique ID
+ attempts += 1
+ if attempts > max_attempts_per_id: # Safety break
+ raise RuntimeError(
+ f"Failed to generate a unique ID after {attempts} attempts. Check length, character set, or density of existing IDs."
+ )
+
+ # Optional progress update
+ # if (i + 1) % 1000 == 0:
+ # print(f"Generated {i+1}/{num_needed} IDs...")
+
+ # --- Assign New IDs ---
+ # Use the previously identified index to assign the new IDs correctly
+ # Assigning string IDs might change the column's dtype to 'object'
+ if not pd.api.types.is_object_dtype(
+ original_dtype
+ ) and not pd.api.types.is_string_dtype(original_dtype):
+ df["id"] = df["id"].astype(str, errors="ignore")
+ # warnings.warn(f"Column '{column_name}' dtype might change from '{original_dtype}' to 'object' due to string ID assignment.", UserWarning)
+
+ df.loc[rows_to_fill_index, column_name] = new_ids_list
+ # print(
+ # f"Successfully assigned {len(new_ids_list)} new unique IDs to column '{column_name}'."
+ # )
+
+ return df
+
+
+def convert_review_df_to_annotation_json(
+ review_file_df: pd.DataFrame,
+ image_paths: List[str], # List of image file paths
+ page_sizes: List[
+ Dict
+ ], # List of dicts like [{'page': 1, 'image_path': '...', 'image_width': W, 'image_height': H}, ...]
+ xmin="xmin",
+ xmax="xmax",
+ ymin="ymin",
+ ymax="ymax", # Coordinate column names
+) -> List[Dict]:
+ """
+ Optimized function to convert review DataFrame to Gradio Annotation JSON format.
+
+ Ensures absolute coordinates, handles missing IDs, deduplicates based on key fields,
+ selects final columns, and structures data per image/page based on page_sizes.
+
+ Args:
+ review_file_df: Input DataFrame with annotation data.
+ image_paths: List of image file paths (Note: currently unused if page_sizes provides paths).
+ page_sizes: REQUIRED list of dictionaries, each containing 'page',
+ 'image_path', 'image_width', and 'image_height'. Defines
+ output structure and dimensions for coordinate conversion.
+ xmin, xmax, ymin, ymax: Names of the coordinate columns.
+
+ Returns:
+ List of dictionaries suitable for Gradio Annotation output, one dict per image/page.
+ """
+ base_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id", "label"]
+
+ for col in base_cols:
+ if col not in review_file_df.columns:
+ review_file_df[col] = pd.NA
+
+ review_file_df = review_file_df.dropna(
+ subset=["xmin", "xmax", "ymin", "ymax", "text", "id", "label"], how="all"
+ )
+
+ if not page_sizes:
+ raise ValueError("page_sizes argument is required and cannot be empty.")
+
+ # --- Prepare Page Sizes DataFrame ---
+ try:
+ page_sizes_df = pd.DataFrame(page_sizes)
+ required_ps_cols = {"page", "image_path", "image_width", "image_height"}
+ if not required_ps_cols.issubset(page_sizes_df.columns):
+ missing = required_ps_cols - set(page_sizes_df.columns)
+ raise ValueError(f"page_sizes is missing required keys: {missing}")
+ # Convert page sizes columns to appropriate numeric types early
+ page_sizes_df["page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
+ page_sizes_df["image_width"] = pd.to_numeric(
+ page_sizes_df["image_width"], errors="coerce"
+ )
+ page_sizes_df["image_height"] = pd.to_numeric(
+ page_sizes_df["image_height"], errors="coerce"
+ )
+ # Use nullable Int64 for page number consistency
+ page_sizes_df["page"] = page_sizes_df["page"].astype("Int64")
+
+ except Exception as e:
+ raise ValueError(f"Error processing page_sizes: {e}") from e
+
+ # Handle empty input DataFrame gracefully
+ if review_file_df.empty:
+ print(
+ "Input review_file_df is empty. Proceeding to generate JSON structure with empty boxes."
+ )
+ # Ensure essential columns exist even if empty for later steps
+ for col in [xmin, xmax, ymin, ymax, "page", "label", "color", "id", "text"]:
+ if col not in review_file_df.columns:
+ review_file_df[col] = pd.NA
+ else:
+ # --- Coordinate Conversion (if needed) ---
+ coord_cols_to_check = [
+ c for c in [xmin, xmax, ymin, ymax] if c in review_file_df.columns
+ ]
+ needs_multiplication = False
+ if coord_cols_to_check:
+ temp_df_numeric = review_file_df[coord_cols_to_check].apply(
+ pd.to_numeric, errors="coerce"
+ )
+ if (
+ temp_df_numeric.le(1).any().any()
+ ): # Check if any numeric coord <= 1 exists
+ needs_multiplication = True
+
+ if needs_multiplication:
+ # print("Relative coordinates detected or suspected, running multiplication...")
+ review_file_df = multiply_coordinates_by_page_sizes(
+ review_file_df.copy(), # Pass a copy to avoid modifying original outside function
+ page_sizes_df,
+ xmin,
+ xmax,
+ ymin,
+ ymax,
+ )
+ else:
+ # print("No relative coordinates detected or required columns missing, skipping multiplication.")
+ # Still ensure essential coordinate/page columns are numeric if they exist
+ cols_to_convert = [
+ c
+ for c in [xmin, xmax, ymin, ymax, "page"]
+ if c in review_file_df.columns
+ ]
+ for col in cols_to_convert:
+ review_file_df[col] = pd.to_numeric(
+ review_file_df[col], errors="coerce"
+ )
+
+ # Handle potential case where multiplication returns an empty DF
+ if review_file_df.empty:
+ print("DataFrame became empty after coordinate processing.")
+ # Re-add essential columns if they were lost
+ for col in [xmin, xmax, ymin, ymax, "page", "label", "color", "id", "text"]:
+ if col not in review_file_df.columns:
+ review_file_df[col] = pd.NA
+
+ # --- Fill Missing IDs ---
+ review_file_df = fill_missing_ids(review_file_df.copy()) # Pass a copy
+
+ # --- Deduplicate Based on Key Fields ---
+ base_dedupe_cols = ["page", xmin, ymin, xmax, ymax, "label", "id"]
+ # Identify which deduplication columns actually exist in the DataFrame
+ cols_for_dedupe = [
+ col for col in base_dedupe_cols if col in review_file_df.columns
+ ]
+ # Add 'image' column for deduplication IF it exists (matches original logic intent)
+ if "image" in review_file_df.columns:
+ cols_for_dedupe.append("image")
+
+ # Ensure placeholder columns exist if they are needed for deduplication
+ # (e.g., 'label', 'id' should be present after fill_missing_ids)
+ for col in ["label", "id"]:
+ if col in cols_for_dedupe and col not in review_file_df.columns:
+ # This might indicate an issue in fill_missing_ids or prior steps
+ print(
+ f"Warning: Column '{col}' needed for dedupe but not found. Adding NA."
+ )
+ review_file_df[col] = "" # Add default empty string
+
+ if cols_for_dedupe: # Only attempt dedupe if we have columns to check
+ # print(f"Deduplicating based on columns: {cols_for_dedupe}")
+ # Convert relevant columns to string before dedupe to avoid type issues with mixed data (optional, depends on data)
+ # for col in cols_for_dedupe:
+ # review_file_df[col] = review_file_df[col].astype(str)
+ review_file_df = review_file_df.drop_duplicates(subset=cols_for_dedupe)
+ else:
+ print("Skipping deduplication: No valid columns found to deduplicate by.")
+
+ # --- Select and Prepare Final Output Columns ---
+ required_final_cols = [
+ "page",
+ "label",
+ "color",
+ xmin,
+ ymin,
+ xmax,
+ ymax,
+ "id",
+ "text",
+ ]
+ # Identify which of the desired final columns exist in the (now potentially deduplicated) DataFrame
+ available_final_cols = [
+ col for col in required_final_cols if col in review_file_df.columns
+ ]
+
+ # Ensure essential output columns exist, adding defaults if missing AFTER deduplication
+ for col in required_final_cols:
+ if col not in review_file_df.columns:
+ print(f"Adding missing final column '{col}' with default value.")
+ if col in ["label", "id", "text"]:
+ review_file_df[col] = "" # Default empty string
+ elif col == "color":
+ review_file_df[col] = None # Default None or a default color tuple
+ else: # page, coordinates
+ review_file_df[col] = pd.NA # Default NA for numeric/page
+ available_final_cols.append(col) # Add to list of available columns
+
+ # Select only the final desired columns in the correct order
+ review_file_df = review_file_df[available_final_cols]
+
+ # --- Final Formatting ---
+ if not review_file_df.empty:
+ # Convert list colors to tuples (important for some downstream uses)
+ if "color" in review_file_df.columns:
+ is_list = review_file_df["color"].apply(lambda x: isinstance(x, list))
+ if is_list.any():
+ review_file_df.loc[is_list, "color"] = review_file_df.loc[
+ is_list, "color"
+ ].apply(tuple)
+ # Ensure page column is nullable integer type for reliable grouping
+ if "page" in review_file_df.columns:
+ review_file_df["page"] = review_file_df["page"].astype("Int64")
+
+ # --- Group Annotations by Page ---
+ output_cols_for_boxes = [
+ col
+ for col in ["label", "color", xmin, ymin, xmax, ymax, "id", "text"]
+ if col in review_file_df.columns
+ ]
+
+ # Ensure coordinate columns are native Python floats (not np.float64) for JSON/dict
+ for c in [xmin, xmax, ymin, ymax]:
+ if c in review_file_df.columns:
+ review_file_df[c] = review_file_df[c].apply(
+ lambda x: float(x) if pd.notna(x) else x
+ )
+
+ if "page" in review_file_df.columns:
+ # Build page -> list of box dicts once (avoids iterrows + get_group per page)
+ page_to_boxes = {}
+ for page_num, group in review_file_df.groupby("page"):
+ if pd.notna(page_num):
+ page_to_boxes[page_num] = (
+ group[output_cols_for_boxes]
+ .replace({np.nan: None})
+ .to_dict(orient="records")
+ )
+ else:
+ print("Error: 'page' column missing, cannot group annotations.")
+ page_to_boxes = {}
+
+ # --- Build JSON Structure ---
+ # Iterate page_sizes by column (no iterrows); lookup boxes by page
+ json_data = [
+ {
+ "image": pdf_image_path,
+ "boxes": page_to_boxes.get(page_num, []) if pd.notna(page_num) else [],
+ }
+ for page_num, pdf_image_path in zip(
+ page_sizes_df["page"], page_sizes_df["image_path"]
+ )
+ ]
+
+ return json_data
diff --git a/tools/file_redaction.py b/tools/file_redaction.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb111c52c5e8119118172f2e0e48b4ea48450c8f
--- /dev/null
+++ b/tools/file_redaction.py
@@ -0,0 +1,12497 @@
+import copy
+import io
+import json
+import os
+import re
+import statistics
+import time
+from collections import defaultdict # For efficient grouping
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import boto3
+import cv2
+import gradio as gr
+import numpy as np
+import pandas as pd
+import pymupdf
+from gradio import Progress
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import (
+ LTAnno,
+ LTTextContainer,
+ LTTextLine,
+ LTTextLineHorizontal,
+)
+from pikepdf import Dictionary, Name, Pdf
+from PIL import Image, ImageDraw, ImageFile, ImageFont
+from presidio_analyzer import AnalyzerEngine
+from pymupdf import Document, Page, Rect
+from tqdm import tqdm
+
+from tools.aws_textract import (
+ analyse_page_with_textract,
+ convert_page_question_answer_to_custom_image_recognizer_results,
+ convert_question_answer_to_dataframe,
+ json_to_ocrresult,
+ load_and_convert_textract_json,
+)
+from tools.config import (
+ APPLY_REDACTIONS_GRAPHICS,
+ APPLY_REDACTIONS_IMAGES,
+ APPLY_REDACTIONS_TEXT,
+ AWS_ACCESS_KEY,
+ AWS_LLM_PII_OPTION,
+ AWS_PII_OPTION,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION,
+ BEDROCK_VLM_TEXT_EXTRACT_OPTION,
+ CLOUD_LLM_PII_MODEL_CHOICE,
+ CLOUD_VLM_MODEL_CHOICE,
+ CUSTOM_BOX_COLOUR,
+ CUSTOM_ENTITIES,
+ CUSTOM_VLM_BACKEND,
+ CUSTOM_VLM_MIN_CONFIDENCE,
+ DEFAULT_LANGUAGE,
+ DEFAULT_LOCAL_OCR_MODEL,
+ EFFICIENT_OCR,
+ EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION,
+ EFFICIENT_OCR_MIN_WORDS,
+ GEMINI_VLM_TEXT_EXTRACT_OPTION,
+ HYBRID_TEXTRACT_BEDROCK_VLM,
+ HYBRID_TEXTRACT_BEDROCK_VLM_CONFIDENCE_THRESHOLD,
+ HYBRID_TEXTRACT_BEDROCK_VLM_PADDING,
+ IMAGES_DPI,
+ INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES,
+ INFERENCE_SERVER_API_URL,
+ INFERENCE_SERVER_LLM_PII_MODEL_CHOICE,
+ INFERENCE_SERVER_PII_OPTION,
+ INPUT_FOLDER,
+ LOAD_TRUNCATED_IMAGES,
+ LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+ LOCAL_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ MAX_DOC_PAGES,
+ MAX_IMAGE_PIXELS,
+ MAX_SIMULTANEOUS_FILES,
+ MAX_TIME_VALUE,
+ MAX_WORKERS,
+ MERGE_BOUNDING_BOXES,
+ MERGE_SMALL_REDACTIONS,
+ NO_REDACTION_PII_OPTION,
+ OCR_FIRST_PASS_MAX_WORKERS,
+ OUTPUT_FOLDER,
+ OVERWRITE_EXISTING_OCR_RESULTS,
+ PADDLE_MAX_WORKERS,
+ PAGE_BREAK_VALUE,
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
+ RETURN_PDF_FOR_REVIEW,
+ RETURN_REDACTED_PDF,
+ RUN_AWS_FUNCTIONS,
+ SAVE_PAGE_OCR_VISUALISATIONS,
+ SELECTABLE_TEXT_EXTRACT_OPTION,
+ TESSERACT_MAX_WORKERS,
+ TEXTRACT_TEXT_EXTRACT_OPTION,
+ USE_GUI_BOX_COLOURS_FOR_OUTPUTS,
+ aws_comprehend_language_choices,
+ textract_language_choices,
+)
+from tools.custom_image_analyser_engine import (
+ CustomImageAnalyzerEngine,
+ CustomImageRecognizerResult,
+ OCRResult,
+ _bedrock_page_ocr_predict,
+ _inference_server_page_ocr_predict,
+ _process_textract_page_with_hybrid_bedrock_vlm,
+ _vlm_page_ocr_predict,
+ combine_ocr_results,
+ recreate_page_line_level_ocr_results_with_page,
+ run_page_text_redaction,
+)
+from tools.file_conversion import (
+ convert_annotation_data_to_dataframe,
+ convert_annotation_json_to_review_df,
+ convert_pymupdf_to_image_coords,
+ create_annotation_dicts_from_annotation_df,
+ divide_coordinates_by_page_sizes,
+ fill_missing_box_ids,
+ fill_missing_ids,
+ is_pdf,
+ is_pdf_or_image,
+ load_and_convert_ocr_results_with_words_json,
+ prepare_image_or_pdf,
+ prepare_images_for_pages,
+ process_single_page_for_image_conversion,
+ remove_duplicate_images_with_blank_boxes,
+ save_pdf_with_or_without_compression,
+ word_level_ocr_output_to_dataframe,
+)
+from tools.helper_functions import (
+ clean_unicode_text,
+ get_file_name_without_type,
+ get_ocr_visualisation_font_path,
+ get_textract_file_suffix,
+)
+from tools.load_spacy_model_custom_recognisers import (
+ CustomWordFuzzyRecognizer,
+ create_nlp_analyser,
+ custom_word_list_recogniser,
+ download_tesseract_lang_pack,
+ load_spacy_model,
+ nlp_analyser,
+ score_threshold,
+)
+from tools.secure_path_utils import (
+ secure_file_write,
+ validate_folder_containment,
+ validate_path_containment,
+)
+
+# Extract numbers before 'seconds' using secure regex
+from tools.secure_regex_utils import safe_extract_numbers_with_seconds
+
+ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES
+if not MAX_IMAGE_PIXELS:
+ Image.MAX_IMAGE_PIXELS = None
+else:
+ Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
+image_dpi = float(IMAGES_DPI)
+
+custom_entities = CUSTOM_ENTITIES
+
+
+def bounding_boxes_overlap(box1, box2):
+ """Check if two bounding boxes overlap."""
+ return (
+ box1[0] < box2[2]
+ and box2[0] < box1[2]
+ and box1[1] < box2[3]
+ and box2[1] < box1[3]
+ )
+
+
+def sum_numbers_before_seconds(string: str):
+ """Extracts numbers that precede the word 'seconds' from a string and adds them up.
+
+ Args:
+ string: The input string.
+
+ Returns:
+ The sum of all numbers before 'seconds' in the string.
+ """
+
+ numbers = safe_extract_numbers_with_seconds(string)
+
+ # Sum up the extracted numbers
+ sum_of_numbers = round(sum(numbers), 1)
+
+ return sum_of_numbers
+
+
+def reverse_y_coords(df: pd.DataFrame, column: str):
+    """Invert a y-coordinate column assumed to be normalised to the 0-1 range (returns 1 - y, rounded to 6 decimal places)."""
+    df[column] = 1 - df[column].astype(float)
+
+ df[column] = df[column].round(6)
+
+ return df[column]
+
+
+def _merge_one_page_results(page: int, items: list) -> dict:
+ """Merge results for a single page; safe to run in a thread."""
+ merged = {"page": page, "results": {}}
+ for item in items:
+ merged["results"].update(item.get("results", {}))
+ return merged
+
+
+def merge_page_results(data: list):
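+    """
+    Group OCR result items by page and merge each page's 'results' dictionaries, returning a list sorted by page.
+
+    Illustrative example (assumes MAX_WORKERS >= 1):
+        merge_page_results([{"page": 1, "results": {"a": 1}}, {"page": 1, "results": {"b": 2}}])
+        -> [{"page": 1, "results": {"a": 1, "b": 2}}]
+    """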
+ if not data:
+ return []
+ # Group items by page
+ by_page = defaultdict(list)
+ for item in data:
+ page = item["page"]
+ by_page[page].append(item)
+ # Merge each page's items in parallel
+ pages = list(by_page.keys())
+ n = len(pages)
+ max_workers = min(MAX_WORKERS, n)
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ merged_list = list(
+ executor.map(
+ lambda p: _merge_one_page_results(p, by_page[p]),
+ pages,
+ )
+ )
+ return sorted(merged_list, key=lambda x: x["page"])
+
+
+def _word_count_by_page_from_ocr_results_with_words(
+ ocr_with_words: List[dict], page_numbers_1based: List[int]
+) -> Dict[int, int]:
+ """
+ Compute word count per page from the flat list of line-level OCR results with words.
+ Used by EFFICIENT_OCR to classify pages as above/below efficient_ocr_min_words threshold.
+
+ Args:
+ ocr_with_words: List of dicts with "page" and "results" (line_key -> {words: [...]}).
+ page_numbers_1based: 1-based page numbers to include in the returned dict.
+
+ Returns:
+ Dict mapping page number (1-based) to word count.
+ """
+ by_page = defaultdict(int)
+ for item in ocr_with_words or []:
+ if not item:
+ continue
+ p = item.get("page")
+ if p is None:
+ continue
+ page_num = int(p) if isinstance(p, str) else p
+ for line_data in item.get("results", {}).values():
+ words = line_data.get("words", [])
+ by_page[page_num] += len(words) if isinstance(words, list) else 0
+ return {p: by_page.get(p, 0) for p in page_numbers_1based}
+
+
+def _page_has_significant_embedded_image(
+ page: Page,
+ min_coverage_fraction: float,
+) -> bool:
+ """
+ True if any embedded image on the page covers at least min_coverage_fraction of the
+ MediaBox area (single placement). Uses PyMuPDF get_images / get_image_rects.
+ """
+ if min_coverage_fraction <= 0:
+ return False
+ mediabox = page.mediabox
+ page_area = float(abs(mediabox.width) * abs(mediabox.height))
+ if page_area <= 0:
+ return False
+
+ for img_item in page.get_images(full=True):
+ try:
+ rects = page.get_image_rects(img_item)
+ except Exception:
+ try:
+ xref = img_item[0] if img_item else None
+ rects = page.get_image_rects(xref) if xref is not None else []
+ except Exception:
+ continue
+ for r in rects:
+ if isinstance(r, tuple):
+ r = r[0]
+ try:
+ rect_area = float(abs(r.width) * abs(r.height))
+ except Exception:
+ continue
+ if rect_area / page_area >= min_coverage_fraction:
+ return True
+ return False
+
+
+def _efficient_ocr_pages_with_significant_embedded_images(
+ file_path: str,
+ pages_1based: List[int],
+ min_coverage_fraction: float,
+ pymupdf_doc: Optional[Document] = None,
+) -> set:
+ """
+ 1-based page numbers that should use the OCR path due to a significant embedded image,
+ regardless of selectable word count.
+ """
+ if min_coverage_fraction <= 0 or not pages_1based:
+ return set()
+
+ doc: Optional[Document] = None
+ close_doc = False
+ if isinstance(pymupdf_doc, Document) and getattr(pymupdf_doc, "page_count", 0) > 0:
+ doc = pymupdf_doc
+ else:
+ try:
+ doc = pymupdf.open(file_path)
+ close_doc = True
+ except Exception:
+ return set()
+
+ out = set()
+ try:
+ for p1 in pages_1based:
+ p0 = int(p1) - 1
+ if p0 < 0 or p0 >= doc.page_count:
+ continue
+ page = doc.load_page(p0)
+ if _page_has_significant_embedded_image(page, min_coverage_fraction):
+ out.add(int(p1))
+ return out
+ finally:
+ if close_doc and doc is not None:
+ doc.close()
+
+
+def _build_full_merged_pdf_image_paths_for_vlm(
+ file_path: str,
+ num_pages: int,
+ pages_needing_ocr_1based: List[int],
+ pages_with_text_1based: List[int],
+ pdf_image_file_paths: List[str],
+ all_pages_1based: List[int],
+ input_folder: str,
+ pymupdf_doc: Document,
+ page_sizes: List[dict],
+ progress,
+) -> Tuple[List[str], List[dict]]:
+ """
+ Merge OCR-page images with text-route page images so every page has an image path
+ (full-document pass). Used for CUSTOM_VLM_SIGNATURE under EFFICIENT_OCR when some
+ pages used the selectable-text path.
+ """
+ if pages_needing_ocr_1based:
+ if pages_with_text_1based:
+ text_page_image_paths, page_sizes = prepare_images_for_pages(
+ file_path,
+ pages_with_text_1based,
+ input_folder,
+ pymupdf_doc,
+ page_sizes,
+ progress,
+ )
+ return [
+ (pdf_image_file_paths[i] if i < len(pdf_image_file_paths) else "")
+ or (text_page_image_paths[i] if i < len(text_page_image_paths) else "")
+ for i in range(num_pages)
+ ], page_sizes
+ return pdf_image_file_paths, page_sizes
+ full_paths, page_sizes = prepare_images_for_pages(
+ file_path,
+ all_pages_1based,
+ input_folder,
+ pymupdf_doc,
+ page_sizes,
+ progress,
+ )
+ return full_paths, page_sizes
+
+
+def add_page_range_suffix_to_file_path(
+ file_path: str,
+ page_min: int,
+ current_loop_page: int,
+ number_of_pages: int,
+ page_max: int = None,
+) -> str:
+ """
+ Add page range suffix to file path if redaction didn't complete all pages.
+
+ Args:
+ file_path: The original file path
+        page_min: The minimum page number to start redaction from, as supplied by the UI (0 means "first page", 1+ means that 1-indexed page)
+ current_loop_page: The number of pages processed (0-indexed count)
+ number_of_pages: Total number of pages in the document (1-indexed)
+ page_max: The maximum page number to end redaction at (1-indexed)
+
+ Returns:
+ File path with page range suffix if partial processing, otherwise original path
+ """
+
+ # if page_min == 0 and page_max == 0:
+ # return file_path
+
+ # If we processed all pages, don't add suffix
+ if current_loop_page >= number_of_pages:
+ return file_path
+
+ # Calculate the page range that was actually processed (for display in filename)
+ # page_min from UI: 0 means "first page", 1+ means that page. Never show 0 in suffix (not a real page number).
+ start_page = page_min if page_min >= 1 else 1
+
+ # Calculate end_page: page_min is 1-indexed (UI), current_loop_page is count of pages processed
+ # Last page processed (1-indexed) = start + count - 1
+ last_page_processed_1_indexed = page_min + current_loop_page - 1
+ if page_min < 1:
+ last_page_processed_1_indexed = (
+ current_loop_page # started from "first page" (0)
+ )
+ end_page = (
+ min(page_max, last_page_processed_1_indexed)
+ if page_max and page_max > 0
+ else last_page_processed_1_indexed
+ )
+ # Never show 0 in suffix (not a real page number)
+ if end_page < 1:
+ end_page = 1
+ if end_page < start_page:
+ end_page = start_page
+
+ # Add suffix before file extension
+ if "." in file_path:
+ name, ext = file_path.rsplit(".", 1)
+ return f"{name}_{start_page}_{end_page}.{ext}"
+ else:
+ return f"{file_path}_{start_page}_{end_page}"
+
+
+def _parse_vlm_person_signature_result(ocr_result, entity_type: str, text_label: str):
+ """Parse VLM OCR result dict into list of CustomImageRecognizerResult. Shared across backends.
+
+ Rows are expected to use canonical labels in the parallel ``text`` list (``[FACE]`` /
+ ``[SIGNATURE]``) as produced by ``_parse_vlm_page_ocr_response`` and local/inference
+ page OCR paths; the engine coerces person/signature JSON so mis-keyed text fields
+ do not drop valid boxes.
+ """
+ boxes = []
+ if isinstance(ocr_result, tuple) and len(ocr_result) >= 1:
+ ocr_result = ocr_result[0]
+ if not isinstance(ocr_result, dict):
+ return boxes
+ texts = ocr_result.get("text", [])
+ lefts = ocr_result.get("left", [])
+ tops = ocr_result.get("top", [])
+ widths = ocr_result.get("width", [])
+ heights = ocr_result.get("height", [])
+ confs = ocr_result.get("conf", [])
+ for idx, text in enumerate(texts):
+ if text != text_label:
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf_raw = float(confs[idx]) if idx < len(confs) else 0.0
+ except Exception:
+ continue
+ conf = conf_raw / 100.0 if conf_raw > 1.0 else conf_raw
+ conf = max(0.0, min(1.0, conf))
+ if conf < CUSTOM_VLM_MIN_CONFIDENCE:
+ continue
+ boxes.append(
+ CustomImageRecognizerResult(
+ entity_type,
+ 0,
+ 0,
+ conf,
+ left,
+ top,
+ width,
+ height,
+ text_label,
+ )
+ )
+ return boxes
+
+
+def _gui_tqdm_subphase(
+ tqdm_bar, gradio_progress, phase: str, page_str: str, total_str: str
+):
+ """Show face/signature (or other) sub-phases in tqdm postfix and Gradio progress."""
+ detail = f"{phase} · page {page_str}/{total_str}"
+ try:
+ if tqdm_bar is not None and hasattr(tqdm_bar, "set_postfix_str"):
+ tqdm_bar.set_postfix_str(detail, refresh=True)
+ except Exception:
+ pass
+ try:
+ if gradio_progress is not None:
+ frac = 0.5
+ if tqdm_bar is not None:
+ total = getattr(tqdm_bar, "total", None) or 0
+ n = getattr(tqdm_bar, "n", 0)
+ if total and total > 0:
+ frac = min(0.999, max(0.0, (float(n) + 0.05) / float(total)))
+ gradio_progress(frac, desc=detail)
+ except Exception:
+ pass
+
+
+def _run_vlm_only_pass_one_page(
+ args: tuple,
+) -> tuple:
+ """
+ Worker for one page: run VLM person/signature detection using the backend set in
+ CUSTOM_VLM_BACKEND (transformers_vlm, inference_vlm, or bedrock_vlm). Returns
+ (page_no, image_path, page_width, page_height, vlm_boxes, input_tokens, output_tokens, model_name).
+ """
+ (
+ page_no,
+ image_path,
+ file_name,
+ run_person,
+ run_signature,
+ normalised_coords_range,
+ output_folder,
+ bedrock_runtime,
+ custom_vlm_backend,
+ inference_server_vlm_model,
+ ) = args
+ reported_page_number = page_no + 1
+ vlm_boxes = []
+ ti, to, name = 0, 0, ""
+
+ try:
+ image = Image.open(image_path)
+ except Exception:
+ return (page_no, image_path, 0, 0, [], 0, 0, "")
+
+ page_width, page_height = image.size
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+
+ def add_tokens(result_tuple):
+ nonlocal ti, to, name
+ if isinstance(result_tuple, tuple) and len(result_tuple) == 4:
+ _, pi, po, pname = result_tuple
+ ti, to, name = ti + pi, to + po, pname or name
+
+ if custom_vlm_backend == "bedrock_vlm" and bedrock_runtime:
+ if run_person:
+ try:
+ people_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_people_only=True,
+ model_choice=CLOUD_VLM_MODEL_CHOICE or None,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+ add_tokens(people_ocr_result)
+ vlm_boxes.extend(
+ _parse_vlm_person_signature_result(
+ people_ocr_result, "CUSTOM_VLM_FACES", "[FACE]"
+ )
+ )
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM person detection failed on page {reported_page_number}: {e}"
+ )
+ if run_signature:
+ try:
+ sig_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_signatures_only=True,
+ model_choice=CLOUD_VLM_MODEL_CHOICE or None,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+ add_tokens(sig_ocr_result)
+ vlm_boxes.extend(
+ _parse_vlm_person_signature_result(
+ sig_ocr_result, "CUSTOM_VLM_SIGNATURE", "[SIGNATURE]"
+ )
+ )
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM signature detection failed on page {reported_page_number}: {e}"
+ )
+ elif custom_vlm_backend == "inference_vlm":
+ model_name = inference_server_vlm_model or None
+ if run_person:
+ try:
+ people_ocr_result = _inference_server_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_people_only=True,
+ detect_signatures_only=False,
+ model_name=model_name,
+ page_index_0=page_no,
+ )
+ add_tokens(people_ocr_result)
+ vlm_boxes.extend(
+ _parse_vlm_person_signature_result(
+ people_ocr_result, "CUSTOM_VLM_FACES", "[FACE]"
+ )
+ )
+ except Exception as e:
+ print(
+ f"Warning: Inference VLM person detection failed on page {reported_page_number}: {e}"
+ )
+ if run_signature:
+ try:
+ sig_ocr_result = _inference_server_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_people_only=False,
+ detect_signatures_only=True,
+ model_name=model_name,
+ page_index_0=page_no,
+ )
+ add_tokens(sig_ocr_result)
+ vlm_boxes.extend(
+ _parse_vlm_person_signature_result(
+ sig_ocr_result, "CUSTOM_VLM_SIGNATURE", "[SIGNATURE]"
+ )
+ )
+ except Exception as e:
+ print(
+ f"Warning: Inference VLM signature detection failed on page {reported_page_number}: {e}"
+ )
+ elif custom_vlm_backend == "transformers_vlm":
+ if run_person:
+ try:
+ people_ocr_result = _vlm_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_people_only=True,
+ detect_signatures_only=False,
+ page_index_0=page_no,
+ )
+ add_tokens(people_ocr_result)
+ vlm_boxes.extend(
+ _parse_vlm_person_signature_result(
+ people_ocr_result, "CUSTOM_VLM_FACES", "[FACE]"
+ )
+ )
+ except Exception as e:
+ print(
+ f"Warning: Transformers VLM person detection failed on page {reported_page_number}: {e}"
+ )
+ if run_signature:
+ try:
+ sig_ocr_result = _vlm_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_people_only=False,
+ detect_signatures_only=True,
+ page_index_0=page_no,
+ )
+ add_tokens(sig_ocr_result)
+ vlm_boxes.extend(
+ _parse_vlm_person_signature_result(
+ sig_ocr_result, "CUSTOM_VLM_SIGNATURE", "[SIGNATURE]"
+ )
+ )
+ except Exception as e:
+ print(
+ f"Warning: Transformers VLM signature detection failed on page {reported_page_number}: {e}"
+ )
+
+ return (page_no, image_path, page_width, page_height, vlm_boxes, ti, to, name)
+
+
+def run_custom_vlm_only_pass(
+ file_path: str,
+ pdf_image_file_paths: list,
+ pymupdf_doc,
+ page_sizes_df: pd.DataFrame,
+ chosen_redact_entities: list,
+ bedrock_runtime,
+ output_folder: str,
+ input_folder: str,
+ number_of_pages: int,
+ page_min: int = 0,
+ page_max: int = 0,
+ progress=None,
+ inference_server_vlm_model: str = "",
+) -> tuple:
+ """
+ Run only the CUSTOM_VLM (face, signature) detection on page images and apply
+ those redactions to the existing pages in pymupdf_doc. Used when text extraction
+ is selectable text but user selected CUSTOM_VLM_* entities (so text comes from
+ PDF, VLM detection from images).
+ Backend is chosen by config CUSTOM_VLM_BACKEND: 'transformers_vlm', 'inference_vlm', or 'bedrock_vlm'.
+ Only ``bedrock_vlm`` runs page VLM calls in parallel (ThreadPoolExecutor); ``inference_vlm`` and
+ ``transformers_vlm`` process pages sequentially.
+ Returns (vlm_total_input_tokens, vlm_total_output_tokens, vlm_model_name,
+ vlm_annotations_list, vlm_decision_rows) for merging into annotations_all_pages
+ and all_pages_decision_process_table.
+ """
+ vlm_total_input_tokens = 0
+ vlm_total_output_tokens = 0
+ vlm_model_name = ""
+ vlm_annotations_list = [] # list of {"image": path, "boxes": [dict, ...]}
+ vlm_decision_rows = [] # list of dicts for decision process table
+
+ file_name = get_file_name_without_type(file_path)
+ normalised_coords_range = 999
+ custom_vlm_backend = CUSTOM_VLM_BACKEND
+
+ run_person = "CUSTOM_VLM_FACES" in (chosen_redact_entities or [])
+ run_signature = "CUSTOM_VLM_SIGNATURE" in (chosen_redact_entities or [])
+
+ # Skip if chosen backend is not available
+ if custom_vlm_backend == "bedrock_vlm" and not bedrock_runtime:
+ return (
+ vlm_total_input_tokens,
+ vlm_total_output_tokens,
+ vlm_model_name,
+ vlm_annotations_list,
+ vlm_decision_rows,
+ )
+
+ start_page_0 = (page_min - 1) if page_min > 0 else 0
+ end_page_0 = min(
+ number_of_pages,
+ (page_max if page_max > 0 else number_of_pages),
+ )
+
+ # Build list of (page_no, image_path, ...) for pages we can process
+ page_args = []
+ for page_no in range(start_page_0, end_page_0):
+ if page_no >= len(pdf_image_file_paths):
+ break
+ image_path = pdf_image_file_paths[page_no]
+ if not image_path or not os.path.exists(image_path):
+ continue
+ page_args.append(
+ (
+ page_no,
+ image_path,
+ file_name,
+ run_person,
+ run_signature,
+ normalised_coords_range,
+ output_folder,
+ bedrock_runtime,
+ custom_vlm_backend,
+ inference_server_vlm_model or "",
+ )
+ )
+
+ # Only Bedrock runs page VLM calls in parallel (separate cloud requests). Inference server and
+ # local Transformers VLM run sequentially to avoid overloading the server or local GPU/memory.
+ if not page_args:
+ return (
+ vlm_total_input_tokens,
+ vlm_total_output_tokens,
+ vlm_model_name,
+ vlm_annotations_list,
+ vlm_decision_rows,
+ )
+
+ _vlm_sub = []
+ if run_person:
+ _vlm_sub.append("face")
+ if run_signature:
+ _vlm_sub.append("signature")
+ _vlm_phase_label = (
+ " & ".join(_vlm_sub).title() + " detection"
+ if _vlm_sub
+ else "CUSTOM_VLM detection"
+ )
+
+ _n_vlm_pages = len(page_args)
+
+ def _vlm_inference_progress_frac(completed: int) -> float:
+ """First half of the VLM sub-progress band (parallel API calls)."""
+ if _n_vlm_pages <= 0:
+ return 0.79
+ return min(0.79, 0.68 + 0.11 * float(completed) / float(_n_vlm_pages))
+
+ def _vlm_apply_progress_frac(i_done: int) -> float:
+ """Second half of the VLM sub-progress band (sequential pymupdf apply)."""
+ if _n_vlm_pages <= 0:
+ return 0.92
+ return min(0.92, 0.79 + 0.11 * float(i_done + 1) / float(_n_vlm_pages))
+
+ _use_parallel_vlm = custom_vlm_backend == "bedrock_vlm"
+
+ if _use_parallel_vlm:
+ futures_map = {}
+ max_workers = min(MAX_WORKERS, _n_vlm_pages)
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ for arg in page_args:
+ fut = executor.submit(_run_vlm_only_pass_one_page, arg)
+ futures_map[fut] = arg[0]
+ results_by_page: Dict[int, tuple] = {}
+ _completed = 0
+ for fut in as_completed(futures_map):
+ page_no = futures_map[fut]
+ try:
+ res = fut.result()
+ except Exception as e:
+ print(
+ f"Warning: CUSTOM_VLM worker failed on page {page_no + 1}: {e}"
+ )
+ _failed_path = next(
+ (a[1] for a in page_args if a[0] == page_no), ""
+ )
+ res = (page_no, _failed_path, 0, 0, [], 0, 0, "")
+ results_by_page[page_no] = res
+ _completed += 1
+ if progress is not None:
+ try:
+ progress(
+ _vlm_inference_progress_frac(_completed),
+ desc=(
+ f"{_vlm_phase_label} · VLM page {page_no + 1}/"
+ f"{number_of_pages} ({_completed}/{_n_vlm_pages} done)"
+ ),
+ )
+ except Exception:
+ pass
+ page_results = [results_by_page[arg[0]] for arg in page_args]
+ else:
+ page_results = []
+ for _completed, arg in enumerate(page_args, start=1):
+ page_no = arg[0]
+ try:
+ res = _run_vlm_only_pass_one_page(arg)
+ except Exception as e:
+ print(f"Warning: CUSTOM_VLM failed on page {page_no + 1}: {e}")
+ _failed_path = arg[1] if len(arg) > 1 else ""
+ res = (page_no, _failed_path, 0, 0, [], 0, 0, "")
+ page_results.append(res)
+ if progress is not None:
+ try:
+ progress(
+ _vlm_inference_progress_frac(_completed),
+ desc=(
+ f"{_vlm_phase_label} · VLM page {page_no + 1}/"
+ f"{number_of_pages} ({_completed}/{_n_vlm_pages} done)"
+ ),
+ )
+ except Exception:
+ pass
+
+ # Apply redactions and build annotations in page order (pymupdf is not thread-safe)
+ for _vlm_i, res in enumerate(page_results):
+ page_no, image_path, page_width, page_height, vlm_boxes, ti, to, name = res
+ try:
+ if progress is not None and _n_vlm_pages:
+ progress(
+ _vlm_apply_progress_frac(_vlm_i),
+ desc=(
+ f"{_vlm_phase_label} · applying redactions "
+ f"page {page_no + 1}/{number_of_pages}"
+ ),
+ )
+ except Exception:
+ pass
+ vlm_total_input_tokens += ti
+ vlm_total_output_tokens += to
+ if name and not vlm_model_name:
+ vlm_model_name = name
+
+ if not vlm_boxes:
+ continue
+
+ pymupdf_page = pymupdf_doc.load_page(page_no)
+ image_dimensions_override = {
+ "image_width": page_width,
+ "image_height": page_height,
+ }
+ redact_result = redact_page_with_pymupdf(
+ pymupdf_page,
+ {"boxes": vlm_boxes},
+ image_path,
+ page_sizes_df=page_sizes_df,
+ input_folder=input_folder,
+ image_dimensions_override=image_dimensions_override,
+ )
+ # In dual-output mode, capture the final redacted page copy so _redacted.pdf
+ # merge includes CUSTOM_VLM pages (signature/face post-pass as well).
+ try:
+ if (
+ isinstance(redact_result, tuple)
+ and len(redact_result) >= 1
+ and isinstance(redact_result[0], tuple)
+ and len(redact_result[0]) == 2
+ ):
+ _review_page, _applied_redaction_page = redact_result[0]
+ if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
+ redact_image_pdf._applied_redaction_pages = list()
+ redact_image_pdf._applied_redaction_pages.append(
+ (_applied_redaction_page, page_no)
+ )
+ except Exception:
+ pass
+
+ # Build annotations and decision rows for review/outputs
+ reported_page_number = page_no + 1
+ w, h = max(1, page_width), max(1, page_height)
+ if (
+ CUSTOM_BOX_COLOUR
+ and isinstance(CUSTOM_BOX_COLOUR, (tuple, list))
+ and len(CUSTOM_BOX_COLOUR) >= 3
+ ):
+ vlm_box_color = tuple(int(CUSTOM_BOX_COLOUR[i]) for i in range(3))
+ else:
+ vlm_box_color = (0, 0, 0)
+ all_image_annotations_boxes = []
+ for box in vlm_boxes:
+ try:
+ xmin = box.left / w
+ ymin = box.top / h
+ xmax = (box.left + box.width) / w
+ ymax = (box.top + box.height) / h
+ label = getattr(box, "entity_type", "Redaction")
+ text = getattr(box, "text", "") or ""
+ img_annotation_box = {
+ "xmin": xmin,
+ "ymin": ymin,
+ "xmax": xmax,
+ "ymax": ymax,
+ "label": label,
+ "text": text,
+ "color": vlm_box_color,
+ }
+ filled = fill_missing_box_ids(img_annotation_box)
+ all_image_annotations_boxes.append(filled)
+ vlm_decision_rows.append(
+ {
+ "image_path": image_path,
+ "page": reported_page_number,
+ "label": label,
+ "xmin": xmin,
+ "xmax": xmax,
+ "ymin": ymin,
+ "ymax": ymax,
+ "boundingBox": [xmin, ymin, xmax, ymax],
+ "text": text,
+ "start": getattr(box, "start", 0),
+ "end": getattr(box, "end", 0),
+ "score": getattr(box, "score", 0.0),
+ "id": filled.get("id", ""),
+ }
+ )
+ except AttributeError:
+ continue
+ if all_image_annotations_boxes:
+ vlm_annotations_list.append(
+ {
+ "image": image_path,
+ "boxes": all_image_annotations_boxes,
+ # 0-based index for aligning placeholders with pymupdf / page_sizes
+ "page_index_0": page_no,
+ "page": reported_page_number,
+ }
+ )
+
+ return (
+ vlm_total_input_tokens,
+ vlm_total_output_tokens,
+ vlm_model_name,
+ vlm_annotations_list,
+ vlm_decision_rows,
+ )
+
+
+def choose_and_run_redactor(
+ file_paths: List[str],
+ prepared_pdf_file_paths: List[str],
+ pdf_image_file_paths: List[str],
+ chosen_redact_entities: List[str],
+ chosen_redact_comprehend_entities: List[str],
+ chosen_llm_entities: List[str] = None,
+ text_extraction_method: str = None,
+ in_allow_list: List[str] = list(),
+ in_deny_list: List[str] = list(),
+ redact_whole_page_list: List[str] = list(),
+ latest_file_completed: int = 0,
+ combined_out_message: List = list(),
+ out_file_paths: List = list(),
+ log_files_output_paths: List = list(),
+ first_loop_state: bool = False,
+ page_min: int = 0,
+ page_max: int = 0,
+ estimated_time_taken_state: float = 0.0,
+ handwrite_signature_checkbox: List[str] = list(["Extract handwriting"]),
+ all_request_metadata_str: str = "",
+ annotations_all_pages: List[dict] = list(),
+ all_page_line_level_ocr_results_df: pd.DataFrame = None,
+ all_pages_decision_process_table: pd.DataFrame = None,
+ pymupdf_doc=list(),
+ current_loop_page: int = 0,
+ page_break_return: bool = False,
+ pii_identification_method: str = "Local",
+ comprehend_query_number: int = 0,
+ max_fuzzy_spelling_mistakes_num: int = 1,
+ match_fuzzy_whole_phrase_bool: bool = True,
+ aws_access_key_textbox: str = "",
+ aws_secret_key_textbox: str = "",
+ annotate_max_pages: int = 1,
+ review_file_state: pd.DataFrame = list(),
+ output_folder: str = OUTPUT_FOLDER,
+ document_cropboxes: List = list(),
+ page_sizes: List[dict] = list(),
+ textract_output_found: bool = False,
+ text_extraction_only: bool = False,
+ duplication_file_path_outputs: list = list(),
+ review_file_path: str = "",
+ input_folder: str = INPUT_FOLDER,
+ total_textract_query_number: int = 0,
+ ocr_file_path: str = "",
+ all_page_line_level_ocr_results: list[dict] = list(),
+ all_page_line_level_ocr_results_with_words: list[dict] = list(),
+ all_page_line_level_ocr_results_with_words_df: pd.DataFrame = None,
+ chosen_local_ocr_model: str = DEFAULT_LOCAL_OCR_MODEL,
+ language: str = DEFAULT_LANGUAGE,
+ ocr_review_files: list = list(),
+ custom_llm_instructions: str = "",
+ inference_server_vlm_model: str = "",
+ efficient_ocr: bool = EFFICIENT_OCR,
+ efficient_ocr_min_words: Union[int, float, None] = EFFICIENT_OCR_MIN_WORDS,
+ efficient_ocr_min_image_coverage_fraction: Optional[float] = None,
+ hybrid_textract_bedrock_vlm: bool = HYBRID_TEXTRACT_BEDROCK_VLM,
+ overwrite_existing_ocr_results: bool = OVERWRITE_EXISTING_OCR_RESULTS,
+ llm_model_name="",
+ llm_total_input_tokens=0,
+ llm_total_output_tokens=0,
+ vlm_model_name="",
+ vlm_total_input_tokens=0,
+ vlm_total_output_tokens=0,
+ save_page_ocr_visualisations: bool = SAVE_PAGE_OCR_VISUALISATIONS,
+ ocr_first_pass_max_workers: Optional[int] = None,
+ prepare_images: bool = True,
+ RETURN_REDACTED_PDF: bool = RETURN_REDACTED_PDF,
+ RETURN_PDF_FOR_REVIEW: bool = RETURN_PDF_FOR_REVIEW,
+ progress=gr.Progress(track_tqdm=True),
+):
+ """
+ This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
+
+ - file_paths (List[str]): A list of paths to the files to be redacted.
+ - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
+ - pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
+ - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
+ - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
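+ - chosen_llm_entities (List[str], optional): A list of entity types to redact using LLM-based detection. Defaults to chosen_redact_comprehend_entities when not provided.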
+ - text_extraction_method (str): The method to use to extract text from documents.
+ - in_allow_list (List[str], optional): A list of allowed terms for redaction. Defaults to an empty list. Can also be entered as a string path to a CSV file, or as a single-column pandas DataFrame.
+ - in_deny_list (List[str], optional): A list of denied terms for redaction. Defaults to an empty list. Can also be entered as a string path to a CSV file, or as a single-column pandas DataFrame.
+ - redact_whole_page_list (List[str], optional): A list of whole page numbers for redaction. Defaults to an empty list. Can also be entered as a string path to a CSV file, or as a single-column pandas DataFrame.
+ - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
+ - combined_out_message (list, optional): A list to store output messages. Defaults to an empty list.
+ - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
+ - log_files_output_paths (list, optional): A list to store paths to the log files. Defaults to an empty list.
+ - first_loop_state (bool, optional): A flag indicating if this is the first iteration. Defaults to False.
+ - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0 (first page).
+ - page_max (int, optional): The maximum page number to end redaction at. Defaults to 0 (last page).
+ - estimated_time_taken_state (float, optional): The estimated time taken for the redaction process. Defaults to 0.0.
+ - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting"].
+ - all_request_metadata_str (str, optional): A string containing all request metadata. Defaults to an empty string.
+ - annotations_all_pages (List[dict], optional): A list of dictionaries containing all image annotations. Defaults to an empty list.
+ - all_page_line_level_ocr_results_df (pd.DataFrame, optional): A DataFrame containing all line-level OCR results. Defaults to None, which is treated as an empty DataFrame.
+ - all_pages_decision_process_table (pd.DataFrame, optional): A DataFrame containing all decision process tables. Defaults to None, which is treated as an empty DataFrame.
+ - pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
+ - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
+ - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
+ - pii_identification_method (str, optional): The method used to identify personal information. Either 'Local' (spaCy model via Microsoft Presidio), 'AWS Comprehend' (AWS Comprehend API), one of the LLM-based options, or no redaction.
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
+ - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
+ - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
+ - annotate_max_pages (int, optional): Maximum page value for the annotation object.
+ - review_file_state (pd.DataFrame, optional): Output review file dataframe.
+ - output_folder (str, optional): Output folder for results.
+ - document_cropboxes (List, optional): List of document cropboxes for the PDF.
+ - page_sizes (List[dict], optional): List of dictionaries of PDF page sizes in PDF or image format.
+ - textract_output_found (bool, optional): Boolean that is True when a Textract OCR output for the file has been found.
+ - text_extraction_only (bool, optional): Boolean to determine if function should only extract text from the document, and not redact.
+ - duplication_file_path_outputs (list, optional): List to allow for export to the duplication function page.
+ - review_file_path (str, optional): The latest review file path created by the app.
+ - input_folder (str, optional): The custom input path, if provided.
+ - total_textract_query_number (int, optional): The number of textract queries up until this point.
+ - ocr_file_path (str, optional): The latest ocr file path created by the app.
+ - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
+ - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
+ - all_page_line_level_ocr_results_with_words_df (pd.DataFrame, optional): All word level text on the page with bounding boxes as a DataFrame.
+ - chosen_local_ocr_model (str): The local OCR model used for images. Defaults to the value of DEFAULT_LOCAL_OCR_MODEL; choices include "tesseract", "paddle" (PaddleOCR), "hybrid-paddle" (combining both), and other hybrid/VLM/inference-server options.
+ - language (str, optional): The language of the text in the files, also used for AWS Comprehend calls. Defaults to English.
+ - ocr_review_files (list, optional): A list of OCR review files to be used for the redaction process. Defaults to an empty list.
+ - custom_llm_instructions (str, optional): Custom instructions for LLM-based entity detection. Defaults to an empty string.
+ - inference_server_vlm_model (str, optional): The name of the inference server VLM model to use for OCR. Defaults to an empty string.
+ - efficient_ocr (bool, optional): Boolean to determine whether to use efficient OCR (try selectable text extraction first, and fall back to OCR per page).
+ - efficient_ocr_min_words (int, optional): The minimum number of selectable words a page must contain for efficient OCR to skip OCR on that page.
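+ - efficient_ocr_min_image_coverage_fraction (float, optional): The minimum fraction of page area that embedded images must cover for efficient OCR to route a page to OCR regardless of its word count. Defaults to EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION when not provided.
+ - hybrid_textract_bedrock_vlm (bool, optional): Boolean to determine whether to also connect to AWS Bedrock for VLM-based detection when AWS Textract is the text extraction method. Defaults to HYBRID_TEXTRACT_BEDROCK_VLM.
+ - overwrite_existing_ocr_results (bool, optional): Boolean to determine whether existing saved OCR results should be regenerated rather than reused. Defaults to OVERWRITE_EXISTING_OCR_RESULTS.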
+ - llm_model_name (str, optional): The name of the LLM model to use for the redaction process. Defaults to an empty string.
+ - llm_total_input_tokens (int, optional): The total number of input tokens for the LLM model. Defaults to 0.
+ - llm_total_output_tokens (int, optional): The total number of output tokens for the LLM model. Defaults to 0.
+ - vlm_model_name (str, optional): The name of the VLM model to use for the redaction process. Defaults to an empty string.
+ - vlm_total_input_tokens (int, optional): The total number of input tokens for the VLM model. Defaults to 0.
+ - vlm_total_output_tokens (int, optional): The total number of output tokens for the VLM model. Defaults to 0.
+ - save_page_ocr_visualisations (bool, optional): Boolean to determine whether to save page OCR visualisations. Defaults to SAVE_PAGE_OCR_VISUALISATIONS.
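+ - ocr_first_pass_max_workers (int, optional): Optional maximum number of parallel workers for the first OCR pass, if provided.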
+ - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
+ - RETURN_REDACTED_PDF (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
+ - RETURN_PDF_FOR_REVIEW (bool, optional): Boolean to determine whether to return a review PDF at the end of the redaction process.
+ - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
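+
+ Example (a minimal illustrative sketch; the file path and entity choices below are placeholders, not required values):
+
+ outputs = choose_and_run_redactor(
+ file_paths=["example_data/example.pdf"],
+ prepared_pdf_file_paths=["example_data/example.pdf"],
+ pdf_image_file_paths=[],
+ chosen_redact_entities=["PERSON"],
+ chosen_redact_comprehend_entities=["NAME"],
+ text_extraction_method="Local text",
+ )
+ combined_out_message = outputs[0] # the first element of the returned tuple is the combined output message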
+ """
+
+ tic = time.perf_counter()
+
+ out_message = ""
+ pdf_file_name_with_ext = ""
+ pdf_file_name_without_ext = ""
+ page_break_return = False
+ blank_request_metadata = list()
+ custom_recogniser_word_list_flat = list()
+ # Ensure all_request_metadata_str is a string (handle case where list might be passed)
+ if isinstance(all_request_metadata_str, list):
+ all_request_metadata_str = (
+ "\n".join(str(item) for item in all_request_metadata_str)
+ if all_request_metadata_str
+ else ""
+ )
+ all_textract_request_metadata = (
+ all_request_metadata_str.split("\n") if all_request_metadata_str else []
+ )
+
+ task_textbox = "redact"
+ selection_element_results_list_df = pd.DataFrame()
+ form_key_value_results_list_df = pd.DataFrame()
+ out_review_pdf_file_path = ""
+ out_redacted_pdf_file_path = ""
+ if not ocr_review_files:
+ ocr_review_files = list()
+ current_loop_page = 0
+
+ # When EFFICIENT_OCR uses simple text extraction, OCR results are in PDF points; when
+ # OCR path runs, they are in image pixels. Used so divide_coordinates_by_page_sizes
+ # uses mediabox (not image) dimensions when appropriate.
+ ocr_results_use_pdf_points = None
+ # When EFFICIENT_OCR runs both paths, page numbers (1-based) that used text extraction
+ # (so their coordinates are in PDF points). Passed as pages_in_pdf_points for per-page division.
+ pages_with_text_extraction_1based = None
+
+ efficient_ocr_min_words = (
+ int(efficient_ocr_min_words)
+ if efficient_ocr_min_words is not None
+ else EFFICIENT_OCR_MIN_WORDS
+ )
+ if efficient_ocr_min_image_coverage_fraction is None:
+ efficient_ocr_min_image_coverage_fraction = float(
+ EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION
+ )
+ else:
+ efficient_ocr_min_image_coverage_fraction = float(
+ efficient_ocr_min_image_coverage_fraction
+ )
+
+ # CLI mode may supply text extraction method names in a different format; normalise them here
+ if text_extraction_method == "AWS Textract":
+ text_extraction_method = TEXTRACT_TEXT_EXTRACT_OPTION
+ if text_extraction_method == "Local OCR":
+ text_extraction_method = LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ print("Performing local OCR with" + chosen_local_ocr_model + " model.")
+ if text_extraction_method == "Local text":
+ text_extraction_method = SELECTABLE_TEXT_EXTRACT_OPTION
+
+ # Default chosen_llm_entities to chosen_redact_comprehend_entities if not provided
+ if chosen_llm_entities is None:
+ chosen_llm_entities = chosen_redact_comprehend_entities
+
+ # Auto-include CUSTOM_FUZZY in local entity list when fuzzy matching is enabled
+ chosen_redact_entities = list(chosen_redact_entities or [])
+ if (max_fuzzy_spelling_mistakes_num or 0) > 0:
+ if "CUSTOM_FUZZY" not in chosen_redact_entities:
+ chosen_redact_entities.append("CUSTOM_FUZZY")
+ if "CUSTOM_FUZZY" not in chosen_redact_comprehend_entities:
+ chosen_redact_comprehend_entities.append("CUSTOM_FUZZY")
+ if "CUSTOM_FUZZY" not in chosen_llm_entities:
+ chosen_llm_entities.append("CUSTOM_FUZZY")
+
+ # Any entity starting with CUSTOM_VLM (e.g. CUSTOM_VLM_FACES, CUSTOM_VLM_SIGNATURE) requires
+ # image-based analysis; when user selected simple text extraction we still run image path for these.
+ _has_any_custom_vlm_entity = any(
+ str(e).startswith("CUSTOM_VLM")
+ for lst in (
+ chosen_redact_entities,
+ chosen_redact_comprehend_entities or [],
+ chosen_llm_entities or [],
+ )
+ for e in lst
+ )
+
+ if pii_identification_method == "None":
+ pii_identification_method = NO_REDACTION_PII_OPTION
+
+ # Normalise the output folder path separator (handles Windows backslash paths)
+ if not output_folder.endswith(("/", os.sep)):
+ output_folder = output_folder + "/"
+
+ # Use provided language or default
+ language = language or DEFAULT_LANGUAGE
+
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ if language not in textract_language_choices:
+ out_message = f"Language '{language}' is not supported by AWS Textract. Please select a different language."
+ raise Warning(out_message)
+ elif pii_identification_method == AWS_PII_OPTION:
+ if language not in aws_comprehend_language_choices:
+ out_message = f"Language '{language}' is not supported by AWS Comprehend. Please select a different language."
+ raise Warning(out_message)
+
+ if all_page_line_level_ocr_results_with_words_df is None:
+ all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
+
+ # Create copies of out_file_path objects to avoid overwriting each other on append actions
+ out_file_paths = out_file_paths.copy()
+ log_files_output_paths = log_files_output_paths.copy()
+
+ # Ensure all_pages_decision_process_table is in correct format for downstream processes
+ if isinstance(all_pages_decision_process_table, list):
+ if not all_pages_decision_process_table:
+ all_pages_decision_process_table = pd.DataFrame(
+ columns=[
+ "image_path",
+ "page",
+ "label",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "boundingBox",
+ "text",
+ "start",
+ "end",
+ "score",
+ "id",
+ ]
+ )
+ elif isinstance(all_pages_decision_process_table, pd.DataFrame):
+ if all_pages_decision_process_table.empty:
+ all_pages_decision_process_table = pd.DataFrame(
+ columns=[
+ "image_path",
+ "page",
+ "label",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "boundingBox",
+ "text",
+ "start",
+ "end",
+ "score",
+ "id",
+ ]
+ )
+
+ # If this is the first time around, set variables to 0/blank
+ if first_loop_state is True:
+ # print("First_loop_state is True")
+ latest_file_completed = 0
+ current_loop_page = 0
+ out_file_paths = list()
+ log_files_output_paths = list()
+ estimated_time_taken_state = 0
+ comprehend_query_number = 0
+ total_textract_query_number = 0
+
+ # Initialize VLM and LLM token tracking variables
+ llm_model_name = ""
+ llm_total_input_tokens = 0
+ llm_total_output_tokens = 0
+ vlm_model_name = ""
+ vlm_total_input_tokens = 0
+ vlm_total_output_tokens = 0
+ # elif current_loop_page == 0:
+ # comprehend_query_number = 0
+ # Do not reset total_textract_query_number here: EFFICIENT_OCR and other paths
+ # accumulate Textract count during the run; resetting would overwrite the
+ # value when the run completes and report 0 to the UI.
+ # If not the first time around, and the current page loop has been set to a huge number (been through all pages), reset current page to 0
+ # elif (first_loop_state is False) & (current_loop_page == 999):
+ # current_loop_page = 0
+ # total_textract_query_number = 0
+ # comprehend_query_number = 0
+
+ if not file_paths:
+ raise Exception("No files to redact")
+
+ if prepared_pdf_file_paths:
+ review_out_file_paths = [prepared_pdf_file_paths[0]]
+ else:
+ review_out_file_paths = list()
+
+ # Choose the correct file to prepare
+ # Normalize so we support both path strings and Gradio file objects (dict with "name" or object with .name)
+ if isinstance(file_paths, str):
+ file_paths_list = [os.path.abspath(file_paths)]
+ elif isinstance(file_paths, dict):
+ file_paths = file_paths["name"]
+ file_paths_list = [os.path.abspath(file_paths)]
+ else:
+ # List from Gradio: can be list of path strings or list of file objects (dict / object with .name)
+ def _file_item_to_path(item):
+ if isinstance(item, str):
+ return item
+ if isinstance(item, dict):
+ return item.get("name") or item.get("path") or ""
+ return getattr(item, "name", None) or getattr(item, "path", None) or ""
+
+ file_paths_list = [_file_item_to_path(f) for f in file_paths if f is not None]
+ # Resolve to absolute paths so paths work consistently in Docker (cwd may differ)
+ file_paths_list = [
+ os.path.abspath(p) for p in file_paths_list if p and str(p).strip()
+ ]
+
+ if len(file_paths_list) > MAX_SIMULTANEOUS_FILES:
+ out_message = f"Number of files to redact is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+ print(out_message)
+ raise Exception(out_message)
+
+ valid_extensions = {".pdf", ".jpg", ".jpeg", ".png"}
+ # Filter file_paths_list to include only files with valid extensions (currently only one file is redacted at a time)
+ filtered_files = [
+ file
+ for file in file_paths_list
+ if os.path.splitext(file)[1].lower() in valid_extensions
+ ]
+
+ # Check if any files were found and assign to file_paths_list
+ file_paths_list = filtered_files if filtered_files else []
+
+ # print("Latest file completed:", latest_file_completed)
+
+ # If latest_file_completed is used, get the specific file
+ if not isinstance(file_paths, (str, dict)):
+ file_paths_loop = (
+ [file_paths_list[int(latest_file_completed)]]
+ if len(file_paths_list) > latest_file_completed
+ else []
+ )
+ else:
+ file_paths_loop = file_paths_list
+
+ latest_file_completed = int(latest_file_completed)
+
+ if isinstance(file_paths, str):
+ number_of_files = 1
+ else:
+ number_of_files = len(file_paths_list)
+
+ # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
+ if latest_file_completed >= number_of_files:
+
+ print("Completed last file, performing final checks")
+ progress(0.95, "Completed last file, performing final checks")
+ current_loop_page = 0
+
+ if isinstance(combined_out_message, list):
+ combined_out_message = "\n".join(combined_out_message)
+
+ _sep = "\n" if combined_out_message else ""
+ if isinstance(out_message, list) and out_message:
+ combined_out_message = combined_out_message + _sep + "\n".join(out_message)
+ elif out_message:
+ combined_out_message = combined_out_message + _sep + out_message
+
+ from tools.secure_regex_utils import safe_remove_leading_newlines
+
+ combined_out_message = safe_remove_leading_newlines(combined_out_message)
+
+ end_message = "\n\nPlease review and modify the suggested redaction outputs on the 'Review redactions' tab of the app (you can find this under the introduction text at the top of the page)."
+
+ if end_message not in combined_out_message:
+ combined_out_message = combined_out_message + end_message
+
+ # Only send across review file if redaction has been done
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
+
+ if len(review_out_file_paths) == 1:
+ if review_file_path:
+ review_out_file_paths.append(review_file_path)
+
+ # Use document page count when available (pymupdf_doc is set by state from last run).
+ # Default to 1 only when we have no valid document (e.g. initial state pymupdf_doc=list(),
+ # or no files were processed in this session).
+ number_of_pages = 1
+ if pymupdf_doc is not None and not isinstance(pymupdf_doc, list):
+ if hasattr(pymupdf_doc, "page_count"):
+ _cnt = pymupdf_doc.page_count
+ if _cnt and _cnt > 0:
+ number_of_pages = _cnt
+ if number_of_pages and number_of_pages > 0:
+ if total_textract_query_number > number_of_pages:
+ total_textract_query_number = number_of_pages
+ annotate_max_pages = number_of_pages
+ annotate_max_pages_bottom = number_of_pages
+
+ # Use the detected page count for usage logging in the UI; fall back to 1 when no valid document was found
+ total_pages_for_ui = number_of_pages if number_of_pages else 1
+
+ print("number_of_pages:", number_of_pages)
+
+ sum_numbers_before_seconds(combined_out_message)
+
+ print(combined_out_message)
+ gr.Info(combined_out_message)
+
+ page_break_return = True
+
+ return (
+ combined_out_message,
+ out_file_paths,
+ out_file_paths,
+ latest_file_completed,
+ log_files_output_paths,
+ log_files_output_paths,
+ estimated_time_taken_state,
+ all_request_metadata_str,
+ pymupdf_doc,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ all_page_line_level_ocr_results_df,
+ all_pages_decision_process_table,
+ comprehend_query_number,
+ review_out_file_paths,
+ annotate_max_pages,
+ annotate_max_pages,
+ prepared_pdf_file_paths,
+ pdf_image_file_paths,
+ review_file_state,
+ page_sizes,
+ duplication_file_path_outputs,
+ duplication_file_path_outputs, # Write ocr_file_path to in_duplicate_pages
+ duplication_file_path_outputs, # Write ocr_file_path to in_summarisation_ocr_files
+ review_file_path,
+ total_textract_query_number,
+ ocr_file_path,
+ (
+ all_page_line_level_ocr_results
+ if all_page_line_level_ocr_results
+ else gr.update()
+ ),
+ (
+ all_page_line_level_ocr_results_with_words
+ if all_page_line_level_ocr_results_with_words
+ else gr.update()
+ ),
+ (
+ all_page_line_level_ocr_results_with_words_df
+ if (
+ isinstance(
+ all_page_line_level_ocr_results_with_words_df, pd.DataFrame
+ )
+ and not all_page_line_level_ocr_results_with_words_df.empty
+ )
+ else gr.update()
+ ),
+ review_file_state,
+ task_textbox,
+ ocr_review_files,
+ vlm_model_name,
+ vlm_total_input_tokens,
+ vlm_total_output_tokens,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ total_pages_for_ui,
+ )
+ else:
+ # ocr_review_files will be replaced by latest file output
+ ocr_review_files = list()
+
+ # if first_loop_state == False:
+ # Prepare documents and images as required if they don't already exist
+ prepare_images_flag = None # Determines whether to call prepare_image_or_pdf
+
+ # When Textract + Extract signatures/forms/tables (not Face detection alone), we run all
+ # pages through redact_image_pdf (skip EFFICIENT_OCR). Face detection keeps EFFICIENT_OCR;
+ # CUSTOM_VLM_FACES then runs only on OCR-classified pages (see EFFICIENT_OCR VLM block).
+ _textract_needs_full_analysis_global = (
+ text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and any(
+ opt in (handwrite_signature_checkbox or [])
+ for opt in (
+ "Extract signatures",
+ "Extract forms",
+ "Extract tables",
+ )
+ )
+ )
+ # After EFFICIENT_OCR, run CUSTOM_VLM passes when selected or Textract + Face detection.
+ # Routing: CUSTOM_VLM_SIGNATURE = full-document images; CUSTOM_VLM_FACES = OCR pages only.
+ _run_vlm_pass_after_all_pages = _has_any_custom_vlm_entity or (
+ text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and "Face detection" in (handwrite_signature_checkbox or [])
+ )
+ # When user selected simple text extraction but any CUSTOM_VLM_* entity (face, signature, etc.),
+ # run all pages through redact_image_pdf so VLM-based detection can take place.
+ _custom_vlm_requires_image_global = (
+ text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION
+ and _has_any_custom_vlm_entity
+ )
+ if (
+ efficient_ocr
+ and file_paths_loop
+ and not _textract_needs_full_analysis_global
+ and not _custom_vlm_requires_image_global
+ ):
+ # Defer image load: only create images for pages that need OCR, after word check
+ first_file = (
+ file_paths_loop[0]
+ if isinstance(file_paths_loop[0], str)
+ else getattr(file_paths_loop[0], "name", "")
+ )
+ if first_file and is_pdf(first_file):
+ print(
+ "EFFICIENT_OCR enabled: skipping initial image load; images will be created only for pages that need OCR."
+ )
+ prepare_images_flag = False
+
+ if (
+ prepare_images_flag is None
+ and textract_output_found
+ and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ ):
+ print("Existing Textract outputs found, not preparing images or documents.")
+ prepare_images_flag = False
+ # return # No need to call `prepare_image_or_pdf`, exit early
+
+ elif (
+ prepare_images_flag is None
+ and text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION
+ and not _custom_vlm_requires_image_global
+ ):
+ print("Running text extraction analysis, not preparing images.")
+ prepare_images_flag = False
+
+ elif prepare_images_flag is None and prepare_images and not pdf_image_file_paths:
+ print("Prepared PDF images not found, loading from file")
+ prepare_images_flag = True
+
+ elif prepare_images_flag is None and not prepare_images:
+ print("Not loading images for file")
+ prepare_images_flag = False
+
+ elif prepare_images_flag is None:
+ print("Loading images for file")
+ prepare_images_flag = True
+
+ # Call prepare_image_or_pdf only if needed
+ if prepare_images_flag is not None:
+ (
+ out_message,
+ prepared_pdf_file_paths,
+ pdf_image_file_paths,
+ annotate_max_pages,
+ annotate_max_pages_bottom,
+ pymupdf_doc,
+ annotations_all_pages,
+ review_file_state,
+ document_cropboxes,
+ page_sizes,
+ textract_output_found,
+ all_img_details_state,
+ placeholder_ocr_results_df,
+ local_ocr_output_found_checkbox,
+ all_page_line_level_ocr_results_with_words_df,
+ ) = prepare_image_or_pdf(
+ file_paths_loop,
+ text_extraction_method,
+ all_page_line_level_ocr_results_df,
+ all_page_line_level_ocr_results_with_words_df,
+ 0,
+ out_message,
+ True,
+ annotate_max_pages,
+ annotations_all_pages,
+ prepare_for_review=False,
+ in_fully_redacted_list=redact_whole_page_list,
+ output_folder=output_folder,
+ prepare_images=prepare_images_flag,
+ page_sizes=page_sizes,
+ pymupdf_doc=pymupdf_doc,
+ input_folder=input_folder,
+ page_min=page_min,
+ page_max=page_max,
+ )
+
+ page_sizes_df = pd.DataFrame(page_sizes)
+
+ if page_sizes_df.empty:
+ page_sizes_df = pd.DataFrame(
+ columns=[
+ "page",
+ "image_path",
+ "image_width",
+ "image_height",
+ "mediabox_width",
+ "mediabox_height",
+ "cropbox_width",
+ "cropbox_height",
+ "original_cropbox",
+ ]
+ )
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(
+ pd.to_numeric, errors="coerce"
+ )
+
+ page_sizes = page_sizes_df.to_dict(orient="records")
+
+ number_of_pages = pymupdf_doc.page_count
+
+ # page_max 0 means "last page of document"
+ if page_min == 0 and page_max == 0:
+ number_of_pages_to_process = number_of_pages
+ elif page_max == 0:
+ number_of_pages_to_process = (number_of_pages - page_min) + 1
+ else:
+ number_of_pages_to_process = (page_max - page_min) + 1
+
+ if number_of_pages_to_process > MAX_DOC_PAGES:
+ out_message = f"Number of pages to process is greater than {MAX_DOC_PAGES}. Please submit a smaller document."
+ print(out_message)
+ raise Exception(out_message)
+
+ # If we have reached the last page, return message and outputs
+ if current_loop_page >= number_of_pages_to_process:
+ print("Reached last page of document to process")
+
+ if total_textract_query_number > number_of_pages:
+ total_textract_query_number = number_of_pages
+
+ # Reset current loop page to 0
+ current_loop_page = 0
+
+ if out_message:
+ combined_out_message = combined_out_message + "\n" + out_message
+
+ # Only send across review file if redaction has been done
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
+ # If only pdf currently in review outputs, add on the latest review file
+ if len(review_out_file_paths) == 1:
+ if review_file_path:
+ review_out_file_paths.append(review_file_path)
+
+ page_break_return = False
+
+ return (
+ combined_out_message,
+ out_file_paths,
+ out_file_paths,
+ latest_file_completed,
+ log_files_output_paths,
+ log_files_output_paths,
+ estimated_time_taken_state,
+ all_request_metadata_str,
+ pymupdf_doc,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ all_page_line_level_ocr_results_df,
+ all_pages_decision_process_table,
+ comprehend_query_number,
+ review_out_file_paths,
+ annotate_max_pages,
+ annotate_max_pages,
+ prepared_pdf_file_paths,
+ pdf_image_file_paths,
+ review_file_state,
+ page_sizes,
+ duplication_file_path_outputs,
+ duplication_file_path_outputs,
+ duplication_file_path_outputs, # Write ocr_file_path to in_summarisation_ocr_files
+ review_file_path,
+ total_textract_query_number,
+ ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ all_page_line_level_ocr_results_with_words_df,
+ review_file_state,
+ task_textbox,
+ ocr_review_files,
+ vlm_model_name,
+ vlm_total_input_tokens,
+ vlm_total_output_tokens,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ number_of_pages,
+ )
+
+ ### Load/create allow list, deny list, and whole page redaction list
+ # If string, assume file path
+ if isinstance(in_allow_list, str):
+ if in_allow_list:
+ in_allow_list = pd.read_csv(in_allow_list, header=None)
+ # Handle both DataFrame (legacy) and list (new Dropdown format)
+ if isinstance(in_allow_list, pd.DataFrame):
+ if not in_allow_list.empty:
+ in_allow_list_flat = in_allow_list.iloc[:, 0].tolist()
+ else:
+ in_allow_list_flat = list()
+ elif isinstance(in_allow_list, list):
+ # Dropdown component returns a list directly
+ in_allow_list_flat = (
+ [str(item) for item in in_allow_list if item] if in_allow_list else list()
+ )
+ else:
+ in_allow_list_flat = list()
+
+ ### Load/create deny list
+ # If string, assume file path
+ if isinstance(in_deny_list, str):
+ if in_deny_list:
+ in_deny_list = pd.read_csv(in_deny_list, header=None)
+
+ # Handle both DataFrame (legacy) and list (new Dropdown format)
+ if isinstance(in_deny_list, pd.DataFrame):
+ if not in_deny_list.empty:
+ custom_recogniser_word_list_flat = in_deny_list.iloc[:, 0].tolist()
+ else:
+ custom_recogniser_word_list_flat = list()
+ # Sort the strings in order from the longest string to the shortest
+ custom_recogniser_word_list_flat = sorted(
+ custom_recogniser_word_list_flat, key=len, reverse=True
+ )
+ elif isinstance(in_deny_list, list):
+ # Dropdown component returns a list directly
+ custom_recogniser_word_list_flat = (
+ [str(item) for item in in_deny_list if item] if in_deny_list else list()
+ )
+ # Sort the strings in order from the longest string to the shortest
+ custom_recogniser_word_list_flat = sorted(
+ custom_recogniser_word_list_flat, key=len, reverse=True
+ )
+ else:
+ custom_recogniser_word_list_flat = list()
+
+ ### Load/create whole page redaction list
+ # If string, assume file path
+ if isinstance(redact_whole_page_list, str):
+ if redact_whole_page_list:
+ redact_whole_page_list = pd.read_csv(redact_whole_page_list, header=None)
+ # Handle both DataFrame (legacy) and list (new Dropdown format)
+ if isinstance(redact_whole_page_list, pd.DataFrame):
+ if not redact_whole_page_list.empty:
+ try:
+ redact_whole_page_list_flat = (
+ redact_whole_page_list.iloc[:, 0].astype(int).tolist()
+ )
+ except Exception as e:
+ print(
+ "Could not convert whole page redaction data to number list due to:",
+ e,
+ )
+ redact_whole_page_list_flat = redact_whole_page_list.iloc[:, 0].tolist()
+ else:
+ redact_whole_page_list_flat = list()
+ elif isinstance(redact_whole_page_list, list):
+ # Dropdown component returns a list directly
+ if redact_whole_page_list:
+ try:
+ # Try to convert to integers for page numbers
+ redact_whole_page_list_flat = [
+ int(item) for item in redact_whole_page_list if item
+ ]
+ except (ValueError, TypeError) as e:
+ print(
+ "Could not convert whole page redaction data to number list due to:",
+ e,
+ )
+ # Fall back to string list if conversion fails
+ redact_whole_page_list_flat = [
+ str(item) for item in redact_whole_page_list if item
+ ]
+ else:
+ redact_whole_page_list_flat = list()
+ else:
+ redact_whole_page_list_flat = list()
+
+ ### Set up clients for the chosen PII identification method
+
+ # Connect to AWS services directly only if the RUN_AWS_FUNCTIONS environment variable is True; otherwise access keys must be supplied via environment variables or the textbox inputs.
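+ # Credential precedence: SSO connection (when prioritised), then user-entered keys, then SSO, then environment variable keys.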
+ if pii_identification_method == AWS_PII_OPTION:
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Comprehend via existing SSO connection")
+ comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Comprehend using AWS access key and secret keys from user input."
+ )
+ comprehend_client = boto3.client(
+ "comprehend",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=AWS_REGION,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Comprehend via existing SSO connection")
+ comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print("Getting Comprehend credentials from environment variables")
+ comprehend_client = boto3.client(
+ "comprehend",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=AWS_REGION,
+ )
+ else:
+ comprehend_client = ""
+ out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
+ print(out_message)
+ raise Exception(out_message)
+ else:
+ comprehend_client = ""
+
+ # Try to connect to AWS Bedrock Runtime Client if using LLM-based PII detection
+ if pii_identification_method == AWS_LLM_PII_OPTION or (
+ text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and hybrid_textract_bedrock_vlm
+ ):
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Bedrock using AWS access key and secret keys from user input."
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=AWS_REGION,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print("Getting Bedrock credentials from environment variables")
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=AWS_REGION,
+ )
+ else:
+ bedrock_runtime = None
+ out_message = "Cannot connect to AWS Bedrock service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
+ print(out_message)
+ raise Exception(out_message)
+ elif pii_identification_method == INFERENCE_SERVER_PII_OPTION:
+ # For inference server, we don't need bedrock_runtime
+ bedrock_runtime = None
+ print("Using inference server for PII detection")
+ elif pii_identification_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION:
+ # For local transformers LLM, we don't need bedrock_runtime
+ bedrock_runtime = None
+ print("Using local transformers LLM for PII detection")
+ else:
+ bedrock_runtime = None
+
+ # If using AWS Comprehend and CUSTOM_VLM_FACES or CUSTOM_VLM_SIGNATURE is selected,
+ # ensure bedrock_runtime is available only when CUSTOM_VLM_BACKEND is bedrock_vlm.
+ if (
+ pii_identification_method == AWS_PII_OPTION
+ and bedrock_runtime is None
+ and CUSTOM_VLM_BACKEND == "bedrock_vlm"
+ and (
+ "CUSTOM_VLM_FACES" in chosen_redact_comprehend_entities
+ or "CUSTOM_VLM_SIGNATURE" in chosen_redact_comprehend_entities
+ )
+ ):
+ print(
+ "CUSTOM_VLM_FACES or CUSTOM_VLM_SIGNATURE selected with AWS Comprehend. Connecting to Bedrock for additional detection."
+ )
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Bedrock via existing SSO connection for VLM detection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Bedrock using AWS access key and secret keys from user input for VLM detection."
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=AWS_REGION,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Bedrock via existing SSO connection for VLM detection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print(
+ "Getting Bedrock credentials from environment variables for VLM detection"
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=AWS_REGION,
+ )
+ else:
+ out_message = "Cannot connect to AWS Bedrock service for VLM detection. Please provide access keys under Textract settings on the Redaction settings tab."
+ print(out_message)
+ raise Exception(out_message)
+
+ # Try to connect to AWS Textract Client if using that text extraction method
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Textract via existing SSO connection")
+ textract_client = boto3.client("textract", region_name=AWS_REGION)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Textract using AWS access key and secret keys from user input."
+ )
+ textract_client = boto3.client(
+ "textract",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=AWS_REGION,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Textract via existing SSO connection")
+ textract_client = boto3.client("textract", region_name=AWS_REGION)
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print("Getting Textract credentials from environment variables.")
+ textract_client = boto3.client(
+ "textract",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=AWS_REGION,
+ )
+ elif textract_output_found is True:
+ print(
+ "Existing Textract data found for file, no need to connect to AWS Textract"
+ )
+ textract_client = boto3.client("textract", region_name=AWS_REGION)
+ else:
+ textract_client = ""
+ out_message = "Cannot connect to AWS Textract service."
+ print(out_message)
+ raise Exception(out_message)
+ else:
+ textract_client = ""
+
+ # Try to connect to cloud VLM clients if using cloud VLM OCR
+ gemini_client = None
+ gemini_config = None
+ azure_openai_client = None
+
+ if text_extraction_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION:
+ # Use the same bedrock_runtime that may have been created for LLM PII detection
+ if bedrock_runtime is None:
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Bedrock via existing SSO connection for VLM OCR")
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime", region_name=AWS_REGION
+ )
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Bedrock using AWS access key and secret keys from user input for VLM OCR."
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=AWS_REGION,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Bedrock via existing SSO connection for VLM OCR")
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime", region_name=AWS_REGION
+ )
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print(
+ "Getting Bedrock credentials from environment variables for VLM OCR"
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=AWS_REGION,
+ )
+ else:
+ bedrock_runtime = None
+ out_message = "Cannot connect to AWS Bedrock service for VLM OCR. Please provide access keys under Textract settings on the Redaction settings tab."
+ print(out_message)
+ raise Exception(out_message)
+
+ # If using image-based extraction (Textract, Bedrock VLM, etc.) or simple text + any CUSTOM_VLM_*
+ # entity, ensure bedrock_runtime is available only when actually needed: Face detection
+ # (Textract) uses Bedrock; CUSTOM_VLM_* uses Bedrock only when CUSTOM_VLM_BACKEND is bedrock_vlm.
+ _image_based_extraction = text_extraction_method in (
+ TEXTRACT_TEXT_EXTRACT_OPTION,
+ BEDROCK_VLM_TEXT_EXTRACT_OPTION,
+ LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+ GEMINI_VLM_TEXT_EXTRACT_OPTION,
+ AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION,
+ )
+ _needs_bedrock_for_vlm = "Face detection" in (
+ handwrite_signature_checkbox or []
+ ) or (_has_any_custom_vlm_entity and CUSTOM_VLM_BACKEND == "bedrock_vlm")
+ if (
+ (_image_based_extraction or _custom_vlm_requires_image_global)
+ and bedrock_runtime is None
+ and _needs_bedrock_for_vlm
+ ):
+ print(
+ "CUSTOM_VLM entity (face/signature/etc.) or Face detection enabled. Connecting to Bedrock for additional detection."
+ )
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Bedrock via existing SSO connection for VLM detection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Bedrock using AWS access key and secret keys from user input for VLM detection."
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=AWS_REGION,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Bedrock via existing SSO connection for VLM detection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print(
+ "Getting Bedrock credentials from environment variables for VLM detection"
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=AWS_REGION,
+ )
+ else:
+ out_message = "Cannot connect to AWS Bedrock service for VLM detection. Please provide access keys under Textract settings on the Redaction settings tab."
+ print(out_message)
+ raise Exception(out_message)
+
+ elif text_extraction_method == GEMINI_VLM_TEXT_EXTRACT_OPTION:
+ from tools.llm_funcs import construct_gemini_generative_model
+
+ try:
+ gemini_client, gemini_config = construct_gemini_generative_model(
+ in_api_key="", # Will use environment variable
+ temperature=0.0, # Use low temperature for OCR
+ model_choice=CLOUD_VLM_MODEL_CHOICE,
+ system_prompt="", # No system prompt needed for OCR
+ max_tokens=4096, # Reasonable default for OCR
+ )
+ print("Connected to Google Gemini for VLM OCR")
+ except Exception as e:
+ out_message = f"Cannot connect to Google Gemini service for VLM OCR: {e}. Please ensure GEMINI_API_KEY is set."
+ print(out_message)
+ raise Exception(out_message)
+
+ elif text_extraction_method == AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION:
+ from tools.llm_funcs import construct_azure_client
+
+ try:
+ azure_openai_client, _ = construct_azure_client(
+ in_api_key="", # Will use environment variable
+ endpoint="", # Will use environment variable
+ )
+ print("Connected to Azure/OpenAI for VLM OCR")
+ except Exception as e:
+ out_message = f"Cannot connect to Azure/OpenAI service for VLM OCR: {e}. Please ensure AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT are set."
+ print(out_message)
+ raise Exception(out_message)
+
+ ### Language check - check if selected language packs exist
+ try:
+ if (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and chosen_local_ocr_model == "tesseract"
+ ):
+ if language != "en":
+ progress(
+ 0.1, desc=f"Downloading Tesseract language pack for {language}"
+ )
+ download_tesseract_lang_pack(language)
+
+ if language != "en":
+ progress(0.1, desc=f"Loading SpaCy model for {language}")
+ load_spacy_model(language)
+
+ except Exception as e:
+ print(f"Error downloading language packs for {language}: {e}")
+ raise Exception(f"Error downloading language packs for {language}: {e}")
+
+ # Check if output_folder exists, create it if it doesn't
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
+ progress(0.2, desc="Extracting text and redacting document")
+
+ all_pages_decision_process_table = pd.DataFrame(
+ columns=[
+ "image_path",
+ "page",
+ "label",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "boundingBox",
+ "text",
+ "start",
+ "end",
+ "score",
+ "id",
+ ]
+ )
+ all_page_line_level_ocr_results_df = pd.DataFrame(
+ columns=["page", "text", "left", "top", "width", "height", "line", "conf"]
+ )
+
+ # Run through file loop, redact each file at a time
+ for file in file_paths_loop:
+
+ # Get a string file path
+ if isinstance(file, str):
+ file_path = file
+ else:
+ file_path = file.name
+
+ if file_path:
+ pdf_file_name_without_ext = get_file_name_without_type(file_path)
+ pdf_file_name_with_ext = os.path.basename(file_path)
+ efficient_ocr_text_pages = None  # set when efficient_ocr used; for combined_out_message
+ efficient_ocr_ocr_pages = None
+
+ is_a_pdf = is_pdf(file_path) is True
+ if (
+ is_a_pdf is False
+ and text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION
+ ):
+ # If user has not submitted a pdf, assume it's an image
+ print(
+ "File is not a PDF, assuming that image analysis needs to be used."
+ )
+ text_extraction_method = LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ else:
+ out_message = "No file selected"
+ print(out_message)
+ raise Exception(out_message)
+
+ # Output file paths names
+ orig_pdf_file_path = output_folder + pdf_file_name_without_ext
+
+ # Load all_page_line_level_ocr_results_with_words from a saved file if one exists and it has not already been loaded
+
+ if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
+ file_ending = "local_text"
+ elif text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION:
+ file_ending = "local_ocr"
+ elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ file_ending = "textract"
+ elif text_extraction_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION:
+ file_ending = "bedrock_vlm"
+ elif text_extraction_method == GEMINI_VLM_TEXT_EXTRACT_OPTION:
+ file_ending = "gemini_vlm"
+ elif text_extraction_method == AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION:
+ file_ending = "azure_openai_vlm"
+ else:
+ print(
+ "No valid text extraction method found. Defaulting to local text extraction."
+ )
+ text_extraction_method = SELECTABLE_TEXT_EXTRACT_OPTION
+ file_ending = "local_text"
+
+ all_page_line_level_ocr_results_with_words_json_file_path = (
+ output_folder
+ + pdf_file_name_without_ext
+ + "_ocr_results_with_words_"
+ + file_ending
+ + ".json"
+ )
+
+ if not all_page_line_level_ocr_results_with_words:
+ if (
+ not overwrite_existing_ocr_results
+ and local_ocr_output_found_checkbox is True
+ and os.path.exists(
+ all_page_line_level_ocr_results_with_words_json_file_path
+ )
+ ):
+ (
+ all_page_line_level_ocr_results_with_words,
+ is_missing,
+ log_files_output_paths,
+ ) = load_and_convert_ocr_results_with_words_json(
+ all_page_line_level_ocr_results_with_words_json_file_path,
+ log_files_output_paths,
+ page_sizes_df,
+ )
+ # original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
+
+ # Remove any existing review_file paths from the review file outputs
+ # EFFICIENT_OCR: two-step process per page - try selectable text first, OCR only if needed.
+ # When text extraction is selectable-text only, skip EFFICIENT_OCR checks and run only redact_text_pdf.
+ # When Textract is selected with Extract signatures/forms/tables (not Face alone), skip
+ # EFFICIENT_OCR so all pages go through redact_image_pdf (full Textract analysis).
+ if (
+ efficient_ocr
+ and is_pdf(file_path)
+ and text_extraction_method != SELECTABLE_TEXT_EXTRACT_OPTION
+ and not _textract_needs_full_analysis_global
+ ):
+ print(
+ "Redacting file "
+ + pdf_file_name_with_ext
+ + " using efficient OCR (text extraction first, OCR fallback per page)"
+ )
+ # OCR method for pages that have no selectable text
+ ocr_fallback_method = (
+ text_extraction_method
+ if text_extraction_method
+ in (
+ LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+ TEXTRACT_TEXT_EXTRACT_OPTION,
+ BEDROCK_VLM_TEXT_EXTRACT_OPTION,
+ GEMINI_VLM_TEXT_EXTRACT_OPTION,
+ AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION,
+ )
+ else LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ )
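+ # Build the 1-based list of pages in scope for this run (page_min/page_max of 0 mean first/last page)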
+ start_page_0 = (page_min - 1) if page_min > 0 else 0
+ end_page_0 = start_page_0 + number_of_pages_to_process
+ all_pages_1based = list(range(start_page_0 + 1, end_page_0 + 1))
+
+ # Try to reuse existing OCR results (from a previous run) for the
+ # Efficient OCR word-count classification pass, so we do not
+ # re-extract text for all pages in subsequent sessions.
+ use_cached_word_counts = False
+ all_page_line_level_ocr_results_with_words_first = []
+ extraction_results = None
+
+ if not overwrite_existing_ocr_results:
+ if ocr_fallback_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION:
+ cached_ocr_path = (
+ output_folder
+ + pdf_file_name_without_ext
+ + "_ocr_results_with_words_local_ocr.json"
+ )
+ if os.path.exists(cached_ocr_path):
+ print(
+ "EFFICIENT_OCR: Using existing local OCR results for word-count classification."
+ )
+ (
+ all_page_line_level_ocr_results_with_words_first,
+ _,
+ log_files_output_paths,
+ ) = load_and_convert_ocr_results_with_words_json(
+ cached_ocr_path,
+ log_files_output_paths,
+ page_sizes_df,
+ )
+ use_cached_word_counts = bool(
+ all_page_line_level_ocr_results_with_words_first
+ )
+ elif ocr_fallback_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ # Use OCR-results-with-words file (same format as local OCR), not raw
+ # Textract JSON, so _word_count_by_page_from_ocr_results_with_words gets
+ # a list of dicts with "page" and "results".
+ cached_ocr_path = (
+ output_folder
+ + pdf_file_name_without_ext
+ + "_ocr_results_with_words_textract.json"
+ )
+ if os.path.exists(cached_ocr_path):
+ print(
+ "EFFICIENT_OCR: Using existing Textract OCR results for word-count classification."
+ )
+ (
+ all_page_line_level_ocr_results_with_words_first,
+ _,
+ log_files_output_paths,
+ ) = load_and_convert_ocr_results_with_words_json(
+ cached_ocr_path,
+ log_files_output_paths,
+ page_sizes_df,
+ )
+ use_cached_word_counts = bool(
+ all_page_line_level_ocr_results_with_words_first
+ )
+
+ if use_cached_word_counts:
+ page_word_counts = _word_count_by_page_from_ocr_results_with_words(
+ all_page_line_level_ocr_results_with_words_first,
+ all_pages_1based,
+ )
+ else:
+ # EFFICIENT_OCR: use redact_text_pdf (extraction only) on all pages to get word counts;
+ # no separate check loop—extraction doubles as the check. Then classify pages by threshold.
+ progress(
+ 0.4,
+ desc="Efficient OCR: Extracting text on all pages (word-count check)",
+ )
+ (
+ pymupdf_doc,
+ _,
+ _,
+ annotations_all_pages,
+ _,
+ page_break_return,
+ comprehend_query_number,
+ all_page_line_level_ocr_results_with_words_first,
+ llm_model_name_text,
+ llm_total_input_tokens_text,
+ llm_total_output_tokens_text,
+ extraction_results,
+ ) = redact_text_pdf(
+ file_path,
+ language,
+ chosen_redact_entities,
+ chosen_redact_comprehend_entities,
+ in_allow_list_flat,
+ page_min,
+ page_max if page_max > 0 else number_of_pages,
+ 0,
+ page_break_return,
+ annotations_all_pages,
+ all_page_line_level_ocr_results_df,
+ all_pages_decision_process_table,
+ pymupdf_doc,
+ list(), # empty: first pass only used for word-count classification
+ pii_identification_method,
+ comprehend_query_number,
+ comprehend_client,
+ custom_recogniser_word_list_flat,
+ redact_whole_page_list_flat,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ page_sizes_df,
+ document_cropboxes,
+ True, # text_extraction_only
+ output_folder=output_folder,
+ input_folder=input_folder,
+ bedrock_runtime=bedrock_runtime,
+ model_choice=CLOUD_LLM_PII_MODEL_CHOICE,
+ custom_llm_instructions=custom_llm_instructions,
+ chosen_llm_entities=chosen_llm_entities,
+ efficient_ocr=efficient_ocr,
+ pages_to_process=all_pages_1based,
+ efficient_ocr_extraction_pass=True,
+ )
+ llm_total_input_tokens += llm_total_input_tokens_text
+ llm_total_output_tokens += llm_total_output_tokens_text
+ if llm_model_name_text and not llm_model_name:
+ llm_model_name = llm_model_name_text
+
+ page_word_counts = _word_count_by_page_from_ocr_results_with_words(
+ all_page_line_level_ocr_results_with_words_first, all_pages_1based
+ )
+ if efficient_ocr_min_image_coverage_fraction > 0:
+ progress(
+ 0.42,
+ desc="Efficient OCR: scanning pages for significant embedded images",
+ )
+ pages_flagged_for_image_ocr = (
+ _efficient_ocr_pages_with_significant_embedded_images(
+ file_path,
+ all_pages_1based,
+ efficient_ocr_min_image_coverage_fraction,
+ pymupdf_doc,
+ )
+ )
+ if pages_flagged_for_image_ocr:
+ print(
+ "EFFICIENT_OCR: "
+ + str(len(pages_flagged_for_image_ocr))
+ + " page(s) routed to OCR due to embedded images (coverage >= "
+ + f"{efficient_ocr_min_image_coverage_fraction:.1%}"
+ + " of page area)."
+ )
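+ # Classify pages: enough selectable words and no significant embedded images -> text path; otherwise -> OCR path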
+ pages_with_text_1based = [
+ p
+ for p in all_pages_1based
+ if page_word_counts.get(p, 0) >= efficient_ocr_min_words
+ and p not in pages_flagged_for_image_ocr
+ ]
+ pages_needing_ocr_1based = [
+ p
+ for p in all_pages_1based
+ if page_word_counts.get(p, 0) < efficient_ocr_min_words
+ or p in pages_flagged_for_image_ocr
+ ]
+ efficient_ocr_text_pages = len(pages_with_text_1based)
+ efficient_ocr_ocr_pages = len(pages_needing_ocr_1based)
+
+ # Hold text-path outputs for merge after OCR when we have both paths
+ _text_path_decision_table = None
+ _text_path_annotations = None
+
+ if pages_with_text_1based:
+ progress(
+ 0.5,
+ desc="Processing pages with selectable text extraction (no OCR)",
+ )
+
+ print(
+ f"EFFICIENT_OCR: Processing {len(pages_with_text_1based)} page(s) with selectable text extraction (no OCR)."
+ )
+ pages_with_text_set = set(pages_with_text_1based)
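+ # Keep only first-pass extraction results for text-path pages (r[0] is the 0-based page index)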
+ if extraction_results is not None:
+ pre_extracted_for_text = [
+ r
+ for r in extraction_results
+ if (r[0] + 1) in pages_with_text_set
+ ]
+ else:
+ pre_extracted_for_text = None
+ (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ all_page_line_level_ocr_results_df,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ comprehend_query_number,
+ all_page_line_level_ocr_results_with_words,
+ llm_model_name_text,
+ llm_total_input_tokens_text,
+ llm_total_output_tokens_text,
+ ) = redact_text_pdf(
+ file_path,
+ language,
+ chosen_redact_entities,
+ chosen_redact_comprehend_entities,
+ in_allow_list_flat,
+ page_min,
+ page_max if page_max > 0 else number_of_pages,
+ 0,
+ page_break_return,
+ annotations_all_pages,
+ all_page_line_level_ocr_results_df,
+ all_pages_decision_process_table,
+ pymupdf_doc,
+ list(), # empty so only text-page data is accumulated (OCR pages filled by redact_image_pdf)
+ pii_identification_method,
+ comprehend_query_number,
+ comprehend_client,
+ custom_recogniser_word_list_flat,
+ redact_whole_page_list_flat,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ page_sizes_df,
+ document_cropboxes,
+ text_extraction_only,
+ output_folder=output_folder,
+ input_folder=input_folder,
+ bedrock_runtime=bedrock_runtime,
+ model_choice=CLOUD_LLM_PII_MODEL_CHOICE,
+ custom_llm_instructions=custom_llm_instructions,
+ chosen_llm_entities=chosen_llm_entities,
+ efficient_ocr=efficient_ocr,
+ pages_to_process=pages_with_text_1based,
+ pre_extracted_results=pre_extracted_for_text,
+ )
+ llm_total_input_tokens += llm_total_input_tokens_text
+ llm_total_output_tokens += llm_total_output_tokens_text
+ if llm_model_name_text and not llm_model_name:
+ llm_model_name = llm_model_name_text
+
+ # Save text-path outputs so we can merge them after OCR (ensures review file includes text-path redactions)
+ if pages_needing_ocr_1based:
+ _text_path_decision_table = all_pages_decision_process_table.copy()
+ _text_path_annotations = list(annotations_all_pages)
+
+ if pages_needing_ocr_1based:
+ progress(0.55, desc="Creating images for pages that need OCR")
+ pdf_image_file_paths, page_sizes = prepare_images_for_pages(
+ file_path,
+ pages_needing_ocr_1based,
+ input_folder,
+ pymupdf_doc,
+ page_sizes,
+ progress,
+ )
+ page_sizes_df = pd.DataFrame(page_sizes)
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(
+ pd.to_numeric, errors="coerce"
+ )
+ progress(0.6, desc="Processing pages with OCR")
+ if ocr_fallback_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION:
+ _local_ocr_display = {
+ "tesseract": "Tesseract (local OCR)",
+ "paddle": "PaddleOCR (local)",
+ "hybrid-paddle": "Hybrid Paddle (local)",
+ "hybrid-vlm": "Hybrid VLM (local)",
+ "hybrid-paddle-vlm": "Hybrid Paddle+VLM (local)",
+ "hybrid-paddle-inference-server": "Hybrid Paddle + inference server",
+ "vlm": "Local VLM",
+ "inference-server": "Inference server VLM OCR",
+ }
+ ocr_method_label = _local_ocr_display.get(
+ chosen_local_ocr_model or "tesseract",
+ f"Local OCR ({chosen_local_ocr_model})",
+ )
+ else:
+ _cloud_ocr_display = {
+ TEXTRACT_TEXT_EXTRACT_OPTION: "AWS Textract",
+ BEDROCK_VLM_TEXT_EXTRACT_OPTION: "Bedrock VLM",
+ GEMINI_VLM_TEXT_EXTRACT_OPTION: "Gemini VLM",
+ AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION: "Azure/OpenAI VLM",
+ }
+ ocr_method_label = _cloud_ocr_display.get(
+ ocr_fallback_method, str(ocr_fallback_method)
+ )
+ print(
+ f"EFFICIENT_OCR: Processing {len(pages_needing_ocr_1based)} page(s) with OCR ({ocr_method_label})."
+ )
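+ # Defer inline custom VLM detection when a dedicated post-pass will run and its backend is usable,
+ # so face/signature detection runs once in the post-pass below rather than inline per page.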
+ _defer_inline_vlm_detection_for_post_pass = (
+ _run_vlm_pass_after_all_pages
+ and (
+ CUSTOM_VLM_BACKEND != "bedrock_vlm"
+ or bedrock_runtime is not None
+ )
+ )
+ # When both text and OCR paths run, pass an empty decision table so the image path
+ # only returns OCR rows; we then merge with the saved text-path table (no overwrite).
+ _decision_table_for_ocr = (
+ pd.DataFrame()
+ if (
+ pages_with_text_1based and _text_path_decision_table is not None
+ )
+ else all_pages_decision_process_table
+ )
+ (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ log_files_output_paths,
+ new_textract_request_metadata,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ all_page_line_level_ocr_results_df,
+ comprehend_query_number,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ selection_element_results_list_df,
+ form_key_value_results_list_df,
+ out_file_paths,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ vlm_model_name_page,
+ vlm_total_input_tokens_page,
+ vlm_total_output_tokens_page,
+ ) = redact_image_pdf(
+ file_path,
+ pdf_image_file_paths,
+ language,
+ chosen_redact_entities,
+ chosen_redact_comprehend_entities,
+ in_allow_list_flat,
+ chosen_llm_entities,
+ page_min,
+ page_max if page_max > 0 else number_of_pages,
+ ocr_fallback_method,
+ handwrite_signature_checkbox,
+ blank_request_metadata,
+ 0,
+ page_break_return,
+ annotations_all_pages,
+ all_page_line_level_ocr_results_df,
+ _decision_table_for_ocr,
+ pymupdf_doc,
+ pii_identification_method,
+ comprehend_query_number,
+ comprehend_client,
+ bedrock_runtime,
+ textract_client,
+ gemini_client,
+ gemini_config,
+ azure_openai_client,
+ custom_recogniser_word_list_flat,
+ redact_whole_page_list_flat,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ page_sizes_df,
+ text_extraction_only,
+ textract_output_found,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ chosen_local_ocr_model,
+ log_files_output_paths=log_files_output_paths,
+ out_file_paths=out_file_paths,
+ nlp_analyser=nlp_analyser,
+ output_folder=output_folder,
+ input_folder=input_folder,
+ custom_llm_instructions=custom_llm_instructions,
+ inference_server_vlm_model=inference_server_vlm_model,
+ efficient_ocr=efficient_ocr,
+ save_page_ocr_visualisations=save_page_ocr_visualisations,
+ pages_to_process=pages_needing_ocr_1based,
+ ocr_first_pass_max_workers=ocr_first_pass_max_workers,
+ pages_in_pdf_points=pages_with_text_1based,
+ hybrid_textract_bedrock_vlm=hybrid_textract_bedrock_vlm,
+ overwrite_existing_ocr_results=overwrite_existing_ocr_results,
+ defer_inline_custom_vlm_detection_pass=_defer_inline_vlm_detection_for_post_pass,
+ )
+ out_file_paths = out_file_paths.copy()
+ vlm_total_input_tokens += vlm_total_input_tokens_page
+ vlm_total_output_tokens += vlm_total_output_tokens_page
+ if vlm_model_name_page and not vlm_model_name:
+ vlm_model_name = vlm_model_name_page
+ if new_textract_request_metadata and isinstance(
+ new_textract_request_metadata, list
+ ):
+ all_textract_request_metadata.extend(new_textract_request_metadata)
+
+ # Merge text-path and OCR-path outputs so review file and CSV include text-path redactions
+ if (
+ pages_with_text_1based
+ and _text_path_decision_table is not None
+ and not _text_path_decision_table.empty
+ ):
+ if (
+ "page" in _text_path_decision_table.columns
+ and "page" in all_pages_decision_process_table.columns
+ ):
+ text_rows = _text_path_decision_table[
+ _text_path_decision_table["page"]
+ .astype(int)
+ .isin(pages_with_text_1based)
+ ]
+ ocr_rows = all_pages_decision_process_table[
+ all_pages_decision_process_table["page"]
+ .astype(int)
+ .isin(pages_needing_ocr_1based)
+ ]
+ all_pages_decision_process_table = (
+ pd.concat([text_rows, ocr_rows], ignore_index=True)
+ .sort_values("page")
+ .reset_index(drop=True)
+ )
+ if _text_path_annotations:
+ seen_images = {
+ ann.get("image") for ann in annotations_all_pages
+ }
+ for ann in _text_path_annotations:
+ if ann.get("image") and ann.get("image") not in seen_images:
+ annotations_all_pages.append(ann)
+ seen_images.add(ann.get("image"))
+
+ # Set current_loop_page so downstream logic sees "all pages done"
+ current_loop_page = number_of_pages_to_process
+
+ # Ensure Total pages (annotate_max_pages / annotate_max_pages_bottom) show full
+ # document count when mixing text extraction and OCR pages
+ annotate_max_pages = number_of_pages
+
+ # OCR results are in PDF points only when no OCR path ran (text path only).
+ # When redact_image_pdf ran, results are in image pixels.
+ ocr_results_use_pdf_points = not pages_needing_ocr_1based
+ # For mixed (text + OCR) we need per-page division; record text-extracted pages.
+ pages_with_text_extraction_1based = (
+ list(pages_with_text_1based) if pages_with_text_1based else None
+ )
+
+ # CUSTOM_VLM under EFFICIENT_OCR: CUSTOM_VLM_SIGNATURE uses full merged page
+ # images; CUSTOM_VLM_FACES (and Textract Face detection) uses sparse OCR-only
+ # paths (pages_needing_ocr_1based). Two passes when both are selected.
+ _vlm_backend_available = (
+ CUSTOM_VLM_BACKEND != "bedrock_vlm" or bedrock_runtime is not None
+ )
+ if (
+ _run_vlm_pass_after_all_pages
+ and _vlm_backend_available
+ and pymupdf_doc is not None
+ and not isinstance(pymupdf_doc, list)
+ ):
+ num_pages = pymupdf_doc.page_count
+ if num_pages and num_pages > 0:
+ progress(
+ 0.65, desc="Preparing custom VLM entity (face/signature) pass"
+ )
+ entities_for_vlm = list(chosen_redact_entities or [])
+ for _vlm_src in (
+ chosen_redact_comprehend_entities or [],
+ chosen_llm_entities or [],
+ ):
+ for _e in _vlm_src:
+ if (
+ str(_e).startswith("CUSTOM_VLM")
+ and _e not in entities_for_vlm
+ ):
+ entities_for_vlm.append(_e)
+ if (
+ "Face detection" in (handwrite_signature_checkbox or [])
+ and "CUSTOM_VLM_FACES" not in entities_for_vlm
+ ):
+ entities_for_vlm.append("CUSTOM_VLM_FACES")
+
+ _vlm_wants_person = "CUSTOM_VLM_FACES" in entities_for_vlm
+ _vlm_wants_signature = "CUSTOM_VLM_SIGNATURE" in entities_for_vlm
+
+ _vlm_rounds = []
+ if _vlm_wants_signature:
+ full_merged_paths, page_sizes = (
+ _build_full_merged_pdf_image_paths_for_vlm(
+ file_path,
+ num_pages,
+ pages_needing_ocr_1based,
+ pages_with_text_1based,
+ pdf_image_file_paths,
+ all_pages_1based,
+ input_folder,
+ pymupdf_doc,
+ page_sizes,
+ progress,
+ )
+ )
+ full_page_sizes_df = pd.DataFrame(page_sizes)
+ if (
+ not full_page_sizes_df.empty
+ and "page" in full_page_sizes_df.columns
+ ):
+ full_page_sizes_df[["page"]] = full_page_sizes_df[
+ ["page"]
+ ].apply(pd.to_numeric, errors="coerce")
+ _vlm_rounds.append(
+ (
+ ["CUSTOM_VLM_SIGNATURE"],
+ full_merged_paths,
+ full_page_sizes_df,
+ "Running CUSTOM_VLM signature detection on all page images",
+ )
+ )
+ if _vlm_wants_person:
+ person_page_sizes_df = pd.DataFrame(page_sizes)
+ if (
+ not person_page_sizes_df.empty
+ and "page" in person_page_sizes_df.columns
+ ):
+ person_page_sizes_df[["page"]] = person_page_sizes_df[
+ ["page"]
+ ].apply(pd.to_numeric, errors="coerce")
+ _vlm_rounds.append(
+ (
+ ["CUSTOM_VLM_FACES"],
+ pdf_image_file_paths,
+ person_page_sizes_df,
+ "Running custom VLM entity detection on OCR-classified pages only",
+ )
+ )
+
+ if not _vlm_rounds:
+ full_merged_paths, page_sizes = (
+ _build_full_merged_pdf_image_paths_for_vlm(
+ file_path,
+ num_pages,
+ pages_needing_ocr_1based,
+ pages_with_text_1based,
+ pdf_image_file_paths,
+ all_pages_1based,
+ input_folder,
+ pymupdf_doc,
+ page_sizes,
+ progress,
+ )
+ )
+ full_page_sizes_df = pd.DataFrame(page_sizes)
+ if (
+ not full_page_sizes_df.empty
+ and "page" in full_page_sizes_df.columns
+ ):
+ full_page_sizes_df[["page"]] = full_page_sizes_df[
+ ["page"]
+ ].apply(pd.to_numeric, errors="coerce")
+ _vlm_rounds.append(
+ (
+ entities_for_vlm,
+ full_merged_paths,
+ full_page_sizes_df,
+ "Running custom VLM entity detection on all page images",
+ )
+ )
+
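+ # Each round is (entities, page image paths, page sizes df, progress message);
+ # signature and face detection can run as separate rounds over different image sets.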
+ for _round_idx, (
+ _vlm_entities,
+ _vlm_paths,
+ _vlm_sizes_df,
+ _vlm_msg,
+ ) in enumerate(_vlm_rounds):
+ progress(
+ 0.7 + 0.02 * float(_round_idx),
+ desc=_vlm_msg,
+ )
+ print(_vlm_msg)
+ (
+ vlm_in,
+ vlm_out,
+ vlm_name,
+ vlm_annotations_list,
+ vlm_decision_rows,
+ ) = run_custom_vlm_only_pass(
+ file_path,
+ _vlm_paths,
+ pymupdf_doc,
+ _vlm_sizes_df,
+ _vlm_entities,
+ bedrock_runtime,
+ output_folder,
+ input_folder,
+ num_pages,
+ page_min=page_min,
+ page_max=page_max if page_max > 0 else num_pages,
+ progress=progress,
+ inference_server_vlm_model=inference_server_vlm_model,
+ )
+ vlm_total_input_tokens += vlm_in
+ vlm_total_output_tokens += vlm_out
+ if vlm_name and not vlm_model_name:
+ vlm_model_name = vlm_name
+ # Merge VLM boxes into annotations and decision table.
+ if vlm_annotations_list:
+ annotations_all_pages.extend(vlm_annotations_list)
+ if vlm_decision_rows:
+ vlm_df = pd.DataFrame(vlm_decision_rows)
+ if (
+ all_pages_decision_process_table is not None
+ and not all_pages_decision_process_table.empty
+ ):
+ all_pages_decision_process_table = pd.concat(
+ [
+ all_pages_decision_process_table,
+ vlm_df,
+ ],
+ ignore_index=True,
+ )
+ else:
+ all_pages_decision_process_table = vlm_df
+
+ elif (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ or text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ or text_extraction_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION
+ or text_extraction_method == GEMINI_VLM_TEXT_EXTRACT_OPTION
+ or text_extraction_method == AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION
+ ):
+
+ # Analyse and redact image-based pdf or image
+ if is_pdf_or_image(file_path) is False:
+ out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+ raise Exception(out_message)
+
+ print(
+ "Redacting file " + pdf_file_name_with_ext + " as an image-based file"
+ )
+
+ (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ log_files_output_paths,
+ new_textract_request_metadata,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ all_page_line_level_ocr_results_df,
+ comprehend_query_number,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ selection_element_results_list_df,
+ form_key_value_results_list_df,
+ out_file_paths,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ vlm_model_name_page,
+ vlm_total_input_tokens_page,
+ vlm_total_output_tokens_page,
+ ) = redact_image_pdf(
+ file_path,
+ pdf_image_file_paths,
+ language,
+ chosen_redact_entities,
+ chosen_redact_comprehend_entities,
+ in_allow_list_flat,
+ chosen_llm_entities,
+ page_min,
+ page_max,
+ text_extraction_method,
+ handwrite_signature_checkbox,
+ blank_request_metadata,
+ current_loop_page,
+ page_break_return,
+ annotations_all_pages,
+ all_page_line_level_ocr_results_df,
+ all_pages_decision_process_table,
+ pymupdf_doc,
+ pii_identification_method,
+ comprehend_query_number,
+ comprehend_client,
+ bedrock_runtime,
+ textract_client,
+ gemini_client,
+ gemini_config,
+ azure_openai_client,
+ custom_recogniser_word_list_flat,
+ redact_whole_page_list_flat,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ page_sizes_df,
+ text_extraction_only,
+ textract_output_found,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ chosen_local_ocr_model,
+ log_files_output_paths=log_files_output_paths,
+ out_file_paths=out_file_paths,
+ nlp_analyser=nlp_analyser,
+ output_folder=output_folder,
+ input_folder=input_folder,
+ custom_llm_instructions=custom_llm_instructions,
+ inference_server_vlm_model=inference_server_vlm_model,
+ efficient_ocr=efficient_ocr,
+ save_page_ocr_visualisations=save_page_ocr_visualisations,
+ ocr_first_pass_max_workers=ocr_first_pass_max_workers,
+ hybrid_textract_bedrock_vlm=hybrid_textract_bedrock_vlm,
+ overwrite_existing_ocr_results=overwrite_existing_ocr_results,
+ )
+
+ # This line creates a copy of out_file_paths to break potential links with log_files_output_paths
+ out_file_paths = out_file_paths.copy()
+
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += vlm_total_input_tokens_page
+ vlm_total_output_tokens += vlm_total_output_tokens_page
+ if vlm_model_name_page and not vlm_model_name:
+ vlm_model_name = vlm_model_name_page
+
+ # Save Textract request metadata (if exists)
+ if new_textract_request_metadata and isinstance(
+ new_textract_request_metadata, list
+ ):
+ all_textract_request_metadata.extend(new_textract_request_metadata)
+
+ elif text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
+
+ if is_pdf(file_path) is False:
+ out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
+ raise Exception(out_message)
+
+ # Analyse text-based pdf
+ print("Redacting file as text-based PDF")
+
+ (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ all_page_line_level_ocr_results_df,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ comprehend_query_number,
+ all_page_line_level_ocr_results_with_words,
+ llm_model_name_text,
+ llm_total_input_tokens_text,
+ llm_total_output_tokens_text,
+ ) = redact_text_pdf(
+ file_path,
+ language,
+ chosen_redact_entities,
+ chosen_redact_comprehend_entities,
+ in_allow_list_flat,
+ page_min,
+ page_max,
+ current_loop_page,
+ page_break_return,
+ annotations_all_pages,
+ all_page_line_level_ocr_results_df,
+ all_pages_decision_process_table,
+ pymupdf_doc,
+ all_page_line_level_ocr_results_with_words,
+ pii_identification_method,
+ comprehend_query_number,
+ comprehend_client,
+ custom_recogniser_word_list_flat,
+ redact_whole_page_list_flat,
+ max_fuzzy_spelling_mistakes_num,
+ match_fuzzy_whole_phrase_bool,
+ page_sizes_df,
+ document_cropboxes,
+ text_extraction_only,
+ output_folder=output_folder,
+ input_folder=input_folder,
+ bedrock_runtime=bedrock_runtime,
+ model_choice=CLOUD_LLM_PII_MODEL_CHOICE,
+ custom_llm_instructions=custom_llm_instructions,
+ chosen_llm_entities=chosen_llm_entities,
+ efficient_ocr=efficient_ocr,
+ )
+
+ # Accumulate LLM token usage from text PDF redaction
+ llm_total_input_tokens += llm_total_input_tokens_text
+ llm_total_output_tokens += llm_total_output_tokens_text
+ if llm_model_name_text and not llm_model_name:
+ llm_model_name = llm_model_name_text
+
+ # When selectable text + CUSTOM_VLM_*: keep text from PDF, run VLM (face/signature) on images only.
+ # Backend from CUSTOM_VLM_BACKEND (bedrock_vlm, inference_vlm, transformers_vlm).
+ _vlm_backend_available_selectable = (
+ CUSTOM_VLM_BACKEND != "bedrock_vlm" or bedrock_runtime is not None
+ )
+ if (
+ _custom_vlm_requires_image_global
+ and _vlm_backend_available_selectable
+ and pdf_image_file_paths
+ ):
+ print(
+ "CUSTOM_VLM entity (face/signature/etc.) selected with simple text extraction: "
+ "running VLM detection on page images and merging with text-based redactions."
+ )
+ _entities_vlm_selectable = list(chosen_redact_entities or [])
+ for _vlm_src in (
+ chosen_redact_comprehend_entities or [],
+ chosen_llm_entities or [],
+ ):
+ for _e in _vlm_src:
+ if (
+ str(_e).startswith("CUSTOM_VLM")
+ and _e not in _entities_vlm_selectable
+ ):
+ _entities_vlm_selectable.append(_e)
+ (
+ vlm_in,
+ vlm_out,
+ vlm_name,
+ vlm_annotations_list,
+ vlm_decision_rows,
+ ) = run_custom_vlm_only_pass(
+ file_path,
+ pdf_image_file_paths,
+ pymupdf_doc,
+ page_sizes_df,
+ _entities_vlm_selectable,
+ bedrock_runtime,
+ output_folder,
+ input_folder,
+ number_of_pages,
+ page_min=page_min,
+ page_max=page_max if page_max > 0 else number_of_pages,
+ progress=progress,
+ inference_server_vlm_model=inference_server_vlm_model,
+ )
+ vlm_total_input_tokens += vlm_in
+ vlm_total_output_tokens += vlm_out
+ if vlm_name and not vlm_model_name:
+ vlm_model_name = vlm_name
+ if vlm_annotations_list:
+ annotations_all_pages.extend(vlm_annotations_list)
+ if vlm_decision_rows:
+ vlm_df = pd.DataFrame(vlm_decision_rows)
+ if (
+ all_pages_decision_process_table is not None
+ and not all_pages_decision_process_table.empty
+ ):
+ all_pages_decision_process_table = pd.concat(
+ [all_pages_decision_process_table, vlm_df],
+ ignore_index=True,
+ )
+ else:
+ all_pages_decision_process_table = vlm_df
+ else:
+ out_message = "No redaction method selected"
+ print(out_message)
+ raise Exception(out_message)
+
+ # If at last page, save to file - CHANGED - now will return outputs regardless of page progress.
+ # if current_loop_page >= number_of_pages_to_process:
+
+ print(
+ "Current page number",
+ (page_min + current_loop_page),
+ "is the last page processed.",
+ )
+ latest_file_completed += 1
+ # current_loop_page = 999
+
+ if latest_file_completed != len(file_paths_list):
+ print(
+ "Completed file number:",
+ str(latest_file_completed),
+ "there are more files to do",
+ )
+
+ # Save redacted file
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
+ if RETURN_REDACTED_PDF is True:
+ progress(0.9, "Saving redacted file")
+
+ if is_pdf(file_path) is False:
+ out_redacted_png_path = (
+ output_folder + pdf_file_name_without_ext + "_redacted.png"
+ )
+ # Add page range suffix if partial processing
+ out_redacted_png_path = add_page_range_suffix_to_file_path(
+ out_redacted_png_path,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+ # pymupdf_doc is an image list in this case
+ if isinstance(pymupdf_doc[-1], str):
+ # Normalize and validate path safety before opening image
+ normalized_path = os.path.normpath(
+ os.path.abspath(pymupdf_doc[-1])
+ )
+ if validate_path_containment(normalized_path, INPUT_FOLDER):
+ img = Image.open(normalized_path)
+ else:
+ raise ValueError(
+ f"Unsafe image path detected: {pymupdf_doc[-1]}"
+ )
+ # Otherwise could be an image object
+ else:
+ img = pymupdf_doc[-1]
+ img.save(out_redacted_png_path, "PNG", resolution=image_dpi)
+
+ if isinstance(out_redacted_png_path, str):
+ out_file_paths.append(out_redacted_png_path)
+ else:
+ out_file_paths.append(out_redacted_png_path[0])
+
+ # Same outputs as PDF route: _redacted.pdf and _redactions_for_review.pdf
+ try:
+ img_doc = pymupdf.open()
+ page = img_doc.new_page(width=img.width, height=img.height)
+ page.insert_image(page.rect, filename=out_redacted_png_path)
+ out_redacted_pdf_file_path = (
+ output_folder + pdf_file_name_without_ext + "_redacted.pdf"
+ )
+ out_redacted_pdf_file_path = add_page_range_suffix_to_file_path(
+ out_redacted_pdf_file_path,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+ save_pdf_with_or_without_compression(
+ img_doc, out_redacted_pdf_file_path
+ )
+ if isinstance(out_redacted_pdf_file_path, str):
+ out_file_paths.append(out_redacted_pdf_file_path)
+ else:
+ out_file_paths.append(out_redacted_pdf_file_path[0])
+ img_doc.close()
+
+ if RETURN_PDF_FOR_REVIEW is True:
+ out_review_pdf_file_path = (
+ output_folder
+ + pdf_file_name_without_ext
+ + "_redactions_for_review.pdf"
+ )
+ out_review_pdf_file_path = (
+ add_page_range_suffix_to_file_path(
+ out_review_pdf_file_path,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+ )
+ review_img_doc = pymupdf.open(out_redacted_pdf_file_path)
+ save_pdf_with_or_without_compression(
+ review_img_doc, out_review_pdf_file_path
+ )
+ if isinstance(out_review_pdf_file_path, str):
+ out_file_paths.append(out_review_pdf_file_path)
+ else:
+ out_file_paths.append(out_review_pdf_file_path[0])
+ review_img_doc.close()
+ except Exception as e:
+ print(f"Failed to create PDF outputs from image: {e}")
+
+ else:
+ # Check if we have dual PDF documents to save
+ applied_redaction_pymupdf_doc = None
+
+ if RETURN_PDF_FOR_REVIEW and RETURN_REDACTED_PDF:
+ # When efficient_ocr is true, some pages come from redact_image_pdf (OCR)
+ # and some from redact_text_pdf (selectable text). Merge both into one map
+ # so the final document has redactions applied to all processed pages.
+ has_image_pages = (
+ hasattr(redact_image_pdf, "_applied_redaction_pages")
+ and redact_image_pdf._applied_redaction_pages
+ )
+ has_text_pages = (
+ hasattr(redact_text_pdf, "_applied_redaction_pages")
+ and redact_text_pdf._applied_redaction_pages
+ )
+ if has_image_pages or has_text_pages:
+ # Create final document by copying the original document and replacing specific pages
+ applied_redaction_pymupdf_doc = pymupdf.open()
+ applied_redaction_pymupdf_doc.insert_pdf(pymupdf_doc)
+
+ # Build a single mapping from both image (OCR) and text paths.
+ # Add text-path pages first so they are not overwritten; then OCR pages.
+ applied_redaction_pages_map = {}
+ # Keep references to parent docs so insert_pdf can use them (avoid GC)
+ _applied_redaction_parent_docs = []
+
+ def add_pages_to_map(source_pages):
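+ # Each entry is either a (page, original_page_number) tuple or a bare page object,
+ # which is mapped to page 0 when no original page number is available.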
+ for applied_redaction_page_data in source_pages:
+ if isinstance(applied_redaction_page_data, tuple):
+ applied_redaction_page, original_page_number = (
+ applied_redaction_page_data
+ )
+ applied_redaction_pages_map[
+ original_page_number
+ ] = applied_redaction_page
+ if (
+ hasattr(applied_redaction_page, "parent")
+ and applied_redaction_page.parent
+ is not None
+ ):
+ _applied_redaction_parent_docs.append(
+ applied_redaction_page.parent
+ )
+ else:
+ applied_redaction_page = (
+ applied_redaction_page_data
+ )
+ applied_redaction_pages_map[0] = (
+ applied_redaction_page # Default to page 0 if no original number
+ )
+ if (
+ hasattr(applied_redaction_page, "parent")
+ and applied_redaction_page.parent
+ is not None
+ ):
+ _applied_redaction_parent_docs.append(
+ applied_redaction_page.parent
+ )
+
+ if has_text_pages:
+ add_pages_to_map(
+ redact_text_pdf._applied_redaction_pages,
+ )
+ if has_image_pages:
+ add_pages_to_map(
+ redact_image_pdf._applied_redaction_pages,
+ )
+
+ # Replace pages in the final document with their final versions.
+ # Process in descending page order so delete_page() does not shift
+ # indices of pages we have not yet replaced (text and OCR pages).
+ for original_page_number in sorted(
+ applied_redaction_pages_map.keys(), reverse=True
+ ):
+ applied_redaction_page = applied_redaction_pages_map[
+ original_page_number
+ ]
+ if (
+ original_page_number
+ < applied_redaction_pymupdf_doc.page_count
+ ):
+ # Remove the original page and insert the final page
+ applied_redaction_pymupdf_doc.delete_page(
+ original_page_number
+ )
+ try:
+ applied_redaction_pymupdf_doc.insert_pdf(
+ applied_redaction_page.parent,
+ from_page=applied_redaction_page.number,
+ to_page=applied_redaction_page.number,
+ start_at=original_page_number,
+ )
+ except IndexError:
+ # Retry without link processing if it fails
+ print(
+ "IndexError: Retrying without link processing"
+ )
+ applied_redaction_pymupdf_doc.insert_pdf(
+ applied_redaction_page.parent,
+ from_page=applied_redaction_page.number,
+ to_page=applied_redaction_page.number,
+ start_at=original_page_number,
+ links=False,
+ )
+
+ applied_redaction_pymupdf_doc[
+ original_page_number
+ ].apply_redactions(
+ images=APPLY_REDACTIONS_IMAGES,
+ graphics=APPLY_REDACTIONS_GRAPHICS,
+ text=APPLY_REDACTIONS_TEXT,
+ )
+
+ # Clear the stored final pages from both sources (guard for concurrent requests)
+ if has_image_pages and hasattr(
+ redact_image_pdf, "_applied_redaction_pages"
+ ):
+ delattr(redact_image_pdf, "_applied_redaction_pages")
+ if has_text_pages and hasattr(
+ redact_text_pdf, "_applied_redaction_pages"
+ ):
+ delattr(redact_text_pdf, "_applied_redaction_pages")
+
+ # Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
+ if RETURN_PDF_FOR_REVIEW is False or applied_redaction_pymupdf_doc:
+ out_redacted_pdf_file_path = (
+ output_folder + pdf_file_name_without_ext + "_redacted.pdf"
+ )
+ # Add page range suffix if partial processing
+
+ out_redacted_pdf_file_path = add_page_range_suffix_to_file_path(
+ out_redacted_pdf_file_path,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+ # print("Saving redacted PDF file:", out_redacted_pdf_file_path)
+
+ # Use final document if available, otherwise use main document
+ doc_to_save = (
+ applied_redaction_pymupdf_doc
+ if applied_redaction_pymupdf_doc
+ else pymupdf_doc
+ )
+
+ if out_redacted_pdf_file_path:
+ save_pdf_with_or_without_compression(
+ doc_to_save, out_redacted_pdf_file_path
+ )
+
+ if isinstance(out_redacted_pdf_file_path, str):
+ out_file_paths.append(out_redacted_pdf_file_path)
+ else:
+ out_file_paths.append(out_redacted_pdf_file_path[0])
+
+ # Release file handle so Gradio can read the output (Windows)
+ # if applied_redaction_pymupdf_doc is not None:
+ # applied_redaction_pymupdf_doc.close()
+ # applied_redaction_pymupdf_doc = None
+ # elif (
+ # not RETURN_PDF_FOR_REVIEW
+ # and pymupdf_doc is not None
+ # and not isinstance(pymupdf_doc, list)
+ # ):
+ # try:
+ # pymupdf_doc.close()
+ # except Exception:
+ # pass
+ # pymupdf_doc = None
+
+ # Always return a file for review if a pdf is given and RETURN_PDF_FOR_REVIEW is True
+ if is_pdf(file_path) is True:
+ if RETURN_PDF_FOR_REVIEW is True:
+ out_review_pdf_file_path = (
+ output_folder
+ + pdf_file_name_without_ext
+ + "_redactions_for_review.pdf"
+ )
+ # Add page range suffix if partial processing
+ out_review_pdf_file_path = add_page_range_suffix_to_file_path(
+ out_review_pdf_file_path,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+
+ if out_review_pdf_file_path:
+ save_pdf_with_or_without_compression(
+ pymupdf_doc, out_review_pdf_file_path
+ )
+ if isinstance(out_review_pdf_file_path, str):
+ out_file_paths.append(out_review_pdf_file_path)
+ else:
+ out_file_paths.append(out_review_pdf_file_path[0])
+ # Release file handle so Gradio can read the output (Windows)
+ # if pymupdf_doc is not None and not isinstance(pymupdf_doc, list):
+ # try:
+ # pymupdf_doc.close()
+ # except Exception:
+ # pass
+ # pymupdf_doc = None
+
+ if not all_page_line_level_ocr_results_df.empty:
+ all_page_line_level_ocr_results_df = all_page_line_level_ocr_results_df[
+ ["page", "text", "left", "top", "width", "height", "line", "conf"]
+ ]
+ else:
+ all_page_line_level_ocr_results_df = pd.DataFrame(
+ columns=[
+ "page",
+ "text",
+ "left",
+ "top",
+ "width",
+ "height",
+ "line",
+ "conf",
+ ]
+ )
+
+ ocr_file_path = (
+ output_folder
+ + pdf_file_name_without_ext
+ + "_ocr_output_"
+ + file_ending
+ + ".csv"
+ )
+ # Add page range suffix if partial processing
+ ocr_file_path = add_page_range_suffix_to_file_path(
+ ocr_file_path, page_min, current_loop_page, number_of_pages, page_max
+ )
+ all_page_line_level_ocr_results_df.sort_values(["page", "line"], inplace=True)
+ all_page_line_level_ocr_results_df.to_csv(
+ ocr_file_path, index=None, encoding="utf-8-sig"
+ )
+
+ if isinstance(ocr_file_path, str):
+ out_file_paths.append(ocr_file_path)
+ else:
+ out_file_paths.append(ocr_file_path[0])
+
+ # Set when word-level OCR JSON/CSV is written; required below even if list is empty
+ # (e.g. parallel Tesseract produced no merged results).
+ all_page_line_level_ocr_results_with_words_df_file_path = None
+
+ if all_page_line_level_ocr_results_with_words:
+ all_page_line_level_ocr_results_with_words = merge_page_results(
+ all_page_line_level_ocr_results_with_words
+ )
+
+ with open(
+ all_page_line_level_ocr_results_with_words_json_file_path, "w"
+ ) as json_file:
+ json.dump(
+ all_page_line_level_ocr_results_with_words,
+ json_file,
+ separators=(",", ":"),
+ )
+
+ all_page_line_level_ocr_results_with_words_df = (
+ word_level_ocr_output_to_dataframe(
+ all_page_line_level_ocr_results_with_words
+ )
+ )
+
+ # When EFFICIENT_OCR mixes text (PDF points) and OCR (image pixels), pass
+ # pages_in_pdf_points so division uses mediabox for text pages and image dims for OCR.
+ _pages_pdf_points = (
+ set(pages_with_text_extraction_1based)
+ if pages_with_text_extraction_1based
+ else None
+ )
+ all_page_line_level_ocr_results_with_words_df = (
+ divide_coordinates_by_page_sizes(
+ all_page_line_level_ocr_results_with_words_df,
+ page_sizes_df,
+ xmin="word_x0",
+ xmax="word_x1",
+ ymin="word_y0",
+ ymax="word_y1",
+ coordinates_in_pdf_points=(
+ text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION
+ or ocr_results_use_pdf_points is True
+ ),
+ pages_in_pdf_points=_pages_pdf_points,
+ )
+ )
+ # Normalize line-level coordinates too (used as fallback max for word_y1)
+ all_page_line_level_ocr_results_with_words_df = (
+ divide_coordinates_by_page_sizes(
+ all_page_line_level_ocr_results_with_words_df,
+ page_sizes_df,
+ xmin="line_x0",
+ xmax="line_x1",
+ ymin="line_y0",
+ ymax="line_y1",
+ coordinates_in_pdf_points=(
+ text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION
+ or ocr_results_use_pdf_points is True
+ ),
+ pages_in_pdf_points=_pages_pdf_points,
+ )
+ )
+
+ all_page_line_level_ocr_results_with_words_df["line_text"] = ""
+ # Keep line_x0, line_x1, line_y0, line_y1 so downstream can clip word boxes to line
+
+ sort_cols = ["page", "line", "word_x0"]
+ if not all_page_line_level_ocr_results_with_words_df.empty and all(
+ c in all_page_line_level_ocr_results_with_words_df.columns
+ for c in sort_cols
+ ):
+ all_page_line_level_ocr_results_with_words_df.sort_values(
+ sort_cols, inplace=True
+ )
+ all_page_line_level_ocr_results_with_words_df_file_path = (
+ all_page_line_level_ocr_results_with_words_json_file_path.replace(
+ ".json", ".csv"
+ )
+ )
+ # Add page range suffix if partial processing
+ all_page_line_level_ocr_results_with_words_df_file_path = (
+ add_page_range_suffix_to_file_path(
+ all_page_line_level_ocr_results_with_words_df_file_path,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+ )
+ # For rows where the subset columns are duplicated (i.e. all fields identical within the subset),
+ # blank out those columns for every occurrence after the first.
+ subset_cols = [
+ "line_text",
+ "line_x0",
+ "line_y0",
+ "line_x1",
+ "line_y1",
+ "line_conf",
+ ]
+ # Identify duplicated rows (excluding the first occurrence)
+ dupes_mask = all_page_line_level_ocr_results_with_words_df.duplicated(
+ subset=subset_cols, keep="first"
+ )
+ # Set these columns to empty for duplicated rows
+ for col in subset_cols:
+ all_page_line_level_ocr_results_with_words_df.loc[dupes_mask, col] = (
+ ""
+ if all_page_line_level_ocr_results_with_words_df[col].dtype == "O"
+ else None
+ )
+ all_page_line_level_ocr_results_with_words_df.to_csv(
+ all_page_line_level_ocr_results_with_words_df_file_path,
+ index=False,
+ encoding="utf-8-sig",
+ )
+
+ if (
+ all_page_line_level_ocr_results_with_words_json_file_path
+ not in log_files_output_paths
+ ):
+ if isinstance(
+ all_page_line_level_ocr_results_with_words_json_file_path, str
+ ):
+ log_files_output_paths.append(
+ all_page_line_level_ocr_results_with_words_json_file_path
+ )
+ else:
+ log_files_output_paths.append(
+ all_page_line_level_ocr_results_with_words_json_file_path[0]
+ )
+
+ if (
+ all_page_line_level_ocr_results_with_words_df_file_path
+ not in log_files_output_paths
+ ):
+ if isinstance(
+ all_page_line_level_ocr_results_with_words_df_file_path, str
+ ):
+ log_files_output_paths.append(
+ all_page_line_level_ocr_results_with_words_df_file_path
+ )
+ else:
+ log_files_output_paths.append(
+ all_page_line_level_ocr_results_with_words_df_file_path[0]
+ )
+
+ if (
+ all_page_line_level_ocr_results_with_words_df_file_path
+ not in out_file_paths
+ ):
+ if isinstance(
+ all_page_line_level_ocr_results_with_words_df_file_path, str
+ ):
+ out_file_paths.append(
+ all_page_line_level_ocr_results_with_words_df_file_path
+ )
+ else:
+ out_file_paths.append(
+ all_page_line_level_ocr_results_with_words_df_file_path[0]
+ )
+
+ # Save decision process outputs
+ if not all_pages_decision_process_table.empty:
+ all_pages_decision_process_table_file_path = (
+ output_folder
+ + pdf_file_name_without_ext
+ + "_all_pages_decision_process_table_output_"
+ + file_ending
+ + ".csv"
+ )
+ # Add page range suffix if partial processing
+ all_pages_decision_process_table_file_path = (
+ add_page_range_suffix_to_file_path(
+ all_pages_decision_process_table_file_path,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+ )
+ all_pages_decision_process_table.to_csv(
+ all_pages_decision_process_table_file_path,
+ index=None,
+ encoding="utf-8-sig",
+ )
+ log_files_output_paths.append(all_pages_decision_process_table_file_path)
+
+ # Save outputs from form analysis if they exist
+ if not selection_element_results_list_df.empty:
+ selection_element_results_list_df_file_path = (
+ output_folder
+ + pdf_file_name_without_ext
+ + "_selection_element_results_output_"
+ + file_ending
+ + ".csv"
+ )
+ # Add page range suffix if partial processing
+ selection_element_results_list_df_file_path = (
+ add_page_range_suffix_to_file_path(
+ selection_element_results_list_df_file_path,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+ )
+ selection_element_results_list_df.to_csv(
+ selection_element_results_list_df_file_path,
+ index=None,
+ encoding="utf-8-sig",
+ )
+ out_file_paths.append(selection_element_results_list_df_file_path)
+
+ if not form_key_value_results_list_df.empty:
+ form_key_value_results_list_df_file_path = (
+ output_folder
+ + pdf_file_name_without_ext
+ + "_form_key_value_results_output_"
+ + file_ending
+ + ".csv"
+ )
+ # Add page range suffix if partial processing
+ form_key_value_results_list_df_file_path = (
+ add_page_range_suffix_to_file_path(
+ form_key_value_results_list_df_file_path,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+ )
+ form_key_value_results_list_df.to_csv(
+ form_key_value_results_list_df_file_path,
+ index=None,
+ encoding="utf-8-sig",
+ )
+ out_file_paths.append(form_key_value_results_list_df_file_path)
+
+ # Convert the gradio annotation boxes to relative coordinates
+ progress(0.95, "Creating review file output")
+ # EFFICIENT_OCR: ensure text-extracted pages have mediabox dimensions and
+ # placeholder image path so divide_coordinates_by_page_sizes uses mediabox (not nan).
+ if (
+ pages_with_text_extraction_1based
+ and pymupdf_doc is not None
+ and not isinstance(pymupdf_doc, list)
+ and hasattr(pymupdf_doc, "load_page")
+ ):
+ pages_in_df = set(
+ pd.to_numeric(page_sizes_df["page"], errors="coerce")
+ .dropna()
+ .astype(int)
+ )
+ placeholder_base = "placeholder_image_{}.png"
+ for page_num in pages_with_text_extraction_1based:
+ try:
+ pymupdf_page = pymupdf_doc.load_page(int(page_num) - 1)
+ mb = pymupdf_page.mediabox
+ page_int = int(page_num)
+ if page_int in pages_in_df:
+ # Update existing row (e.g. placeholder with nan) so division has mediabox
+ mask = page_sizes_df["page"] == page_int
+ page_sizes_df.loc[mask, "image_width"] = mb.width
+ page_sizes_df.loc[mask, "image_height"] = mb.height
+ if "mediabox_width" in page_sizes_df.columns:
+ page_sizes_df.loc[mask, "mediabox_width"] = mb.width
+ if "mediabox_height" in page_sizes_df.columns:
+ page_sizes_df.loc[mask, "mediabox_height"] = mb.height
+ # Align image_path with placeholder so annotations match
+ page_sizes_df.loc[mask, "image_path"] = placeholder_base.format(
+ page_int - 1
+ )
+ else:
+ new_row = {
+ "page": page_num,
+ "image_path": placeholder_base.format(page_int - 1),
+ "image_width": mb.width,
+ "image_height": mb.height,
+ "mediabox_width": mb.width,
+ "mediabox_height": mb.height,
+ }
+ if "cropbox_width" in page_sizes_df.columns:
+ new_row["cropbox_width"] = pymupdf_page.cropbox.width
+ if "cropbox_height" in page_sizes_df.columns:
+ new_row["cropbox_height"] = pymupdf_page.cropbox.height
+ page_sizes_df = pd.concat(
+ [page_sizes_df, pd.DataFrame([new_row])],
+ ignore_index=True,
+ )
+ pages_in_df.add(page_int)
+ except Exception as e:
+ print(
+ f"Warning: Could not add/update mediabox for text-extracted page {page_num}: {e}"
+ )
+ page_sizes = page_sizes_df.to_dict(orient="records")
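+ # page_sizes now reflects any mediabox/placeholder updates made above for text-extracted pages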
+
+ all_image_annotations_df = convert_annotation_data_to_dataframe(
+ annotations_all_pages
+ )
+ _pages_pdf_pts = (
+ set(pages_with_text_extraction_1based)
+ if pages_with_text_extraction_1based
+ else None
+ )
+ all_image_annotations_df = divide_coordinates_by_page_sizes(
+ all_image_annotations_df,
+ page_sizes_df,
+ xmin="xmin",
+ xmax="xmax",
+ ymin="ymin",
+ ymax="ymax",
+ coordinates_in_pdf_points=(
+ text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION
+ or ocr_results_use_pdf_points is True
+ ),
+ pages_in_pdf_points=_pages_pdf_pts,
+ )
+ annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(
+ all_image_annotations_df, page_sizes
+ )
+ annotations_all_pages_divide = remove_duplicate_images_with_blank_boxes(
+ annotations_all_pages_divide
+ )
+
+ # Save the gradio_annotation_boxes to a review csv file
+ review_file_state = convert_annotation_json_to_review_df(
+ annotations_all_pages_divide,
+ all_pages_decision_process_table,
+ page_sizes=page_sizes,
+ )
+
+ # Don't need page sizes in outputs
+ review_file_state.drop(
+ [
+ "image_width",
+ "image_height",
+ "mediabox_width",
+ "mediabox_height",
+ "cropbox_width",
+ "cropbox_height",
+ ],
+ axis=1,
+ inplace=True,
+ errors="ignore",
+ )
+
+ if (
+ pii_identification_method == NO_REDACTION_PII_OPTION
+ and not form_key_value_results_list_df.empty
+ ):
+ print(
+ "Form outputs found with no redaction method selected. Creating review file from form outputs."
+ )
+ review_file_state = form_key_value_results_list_df
+ annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(
+ review_file_state, page_sizes
+ )
+
+ review_file_path = orig_pdf_file_path + "_review_file.csv"
+ # Add page range suffix if partial processing
+ review_file_path = add_page_range_suffix_to_file_path(
+ review_file_path, page_min, current_loop_page, number_of_pages, page_max
+ )
+ if isinstance(review_file_path, str):
+ review_file_state.to_csv(review_file_path, index=None, encoding="utf-8-sig")
+ else:
+ review_file_state.to_csv(
+ review_file_path[0], index=None, encoding="utf-8-sig"
+ )
+
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
+ if isinstance(review_file_path, str):
+ out_file_paths.append(review_file_path)
+ else:
+ out_file_paths.append(review_file_path[0])
+
+ # Make a combined message for the file
+ if isinstance(combined_out_message, list):
+ combined_out_message = "\n".join(combined_out_message)
+ elif combined_out_message is None:
+ combined_out_message = ""
+
+ _sep = "\n" if combined_out_message else ""
+ if isinstance(out_message, list) and out_message:
+ combined_out_message = combined_out_message + _sep + "\n".join(out_message)
+ elif isinstance(out_message, str) and out_message:
+ combined_out_message = combined_out_message + _sep + out_message
+
+ if efficient_ocr_text_pages is not None:
+ _sep = "\n" if combined_out_message else ""
+ combined_out_message = (
+ combined_out_message
+ + _sep
+ + "Efficient OCR: "
+ + str(efficient_ocr_text_pages)
+ + " page(s) via text extraction, "
+ + str(efficient_ocr_ocr_pages)
+ + " page(s) via full OCR."
+ )
+
+ toc = time.perf_counter()
+ time_taken = toc - tic
+ estimated_time_taken_state += time_taken
+
+ estimated_time_taken_state = round(estimated_time_taken_state, 1)
+
+ out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
+ combined_out_message = (
+ combined_out_message + " " + out_time_message
+ ) # Ensure this is a single string
+
+ # Add redaction summary (total redactions and pages with redactions)
+ if isinstance(review_file_state, pd.DataFrame) and not review_file_state.empty:
+ total_redactions = len(review_file_state)
+ pages_with_redactions = (
+ review_file_state["page"].nunique()
+ if "page" in review_file_state.columns
+ else 0
+ )
+ combined_out_message = (
+ combined_out_message
+ + f" Total redactions: {total_redactions}. Pages with redactions: {pages_with_redactions}."
+ )
+
+ sum_numbers_before_seconds(combined_out_message)
+
+ # else:
+ # toc = time.perf_counter()
+ # time_taken = toc - tic
+ # estimated_time_taken_state += time_taken
+
+ # If textract requests made, write to logging file. Also record number of Textract requests
+ if all_textract_request_metadata and isinstance(
+ all_textract_request_metadata, list
+ ):
+ all_request_metadata_str = "\n".join(all_textract_request_metadata).strip()
+
+ textract_metadata_filename = (
+ pdf_file_name_without_ext + "_textract_metadata.txt"
+ )
+
+ # Add page range suffix if partial processing
+ textract_metadata_filename = add_page_range_suffix_to_file_path(
+ textract_metadata_filename,
+ page_min,
+ current_loop_page,
+ number_of_pages,
+ page_max,
+ )
+
+ secure_file_write(
+ output_folder,
+ textract_metadata_filename,
+ all_request_metadata_str,
+ )
+
+ all_textract_request_metadata_file_path = (
+ output_folder + textract_metadata_filename
+ )
+
+ # Add the request metadata to the log outputs if not there already
+ if all_textract_request_metadata_file_path not in log_files_output_paths:
+ if isinstance(all_textract_request_metadata_file_path, str):
+ log_files_output_paths.append(all_textract_request_metadata_file_path)
+ else:
+ log_files_output_paths.append(
+ all_textract_request_metadata_file_path[0]
+ )
+
+ new_textract_query_numbers = len(all_textract_request_metadata)
+ total_textract_query_number += new_textract_query_numbers
+
+ # Ensure no duplicated output files
+ log_files_output_paths = sorted(list(set(log_files_output_paths)))
+ out_file_paths = sorted(list(set(out_file_paths)))
+
+ # Only pass paths that exist to Gradio (avoids FileNotFoundError when gr.File stats paths
+ # that no longer exist, e.g. after loading existing Textract results or in ephemeral containers)
+ out_file_paths = [
+ p for p in out_file_paths if isinstance(p, str) and os.path.exists(p)
+ ]
+ log_files_output_paths = [
+ p for p in log_files_output_paths if isinstance(p, str) and os.path.exists(p)
+ ]
+
+ # Create OCR review files list for input_review_files component
+
+ if ocr_file_path:
+ if isinstance(ocr_file_path, str):
+ ocr_review_files.append(ocr_file_path)
+ duplication_file_path_outputs.append(ocr_file_path)
+ else:
+ ocr_review_files.append(ocr_file_path[0])
+ duplication_file_path_outputs.append(ocr_file_path[0])
+
+ if all_page_line_level_ocr_results_with_words_df_file_path:
+ if isinstance(all_page_line_level_ocr_results_with_words_df_file_path, str):
+ ocr_review_files.append(
+ all_page_line_level_ocr_results_with_words_df_file_path
+ )
+ else:
+ ocr_review_files.append(
+ all_page_line_level_ocr_results_with_words_df_file_path[0]
+ )
+
+ # Output file paths (only include existing paths so Gradio gr.File does not raise on os.stat)
+ if not review_file_path:
+ review_out_file_paths = [prepared_pdf_file_paths[-1]]
+ else:
+ review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
+ review_out_file_paths = [
+ p for p in review_out_file_paths if isinstance(p, str) and os.path.exists(p)
+ ]
+
+ if total_textract_query_number > number_of_pages:
+ total_textract_query_number = number_of_pages
+
+ page_break_return = True
+
+ # Ensure Total pages (annotate_max_pages / annotate_max_pages_bottom) reflect the
+ # processed document so the UI is correct (e.g. after EFFICIENT_OCR or multi-file).
+ if not isinstance(pymupdf_doc, list) and hasattr(pymupdf_doc, "page_count"):
+ _doc_pages = pymupdf_doc.page_count
+ if _doc_pages and _doc_pages > 0:
+ annotate_max_pages = _doc_pages
+
+ # Ensure total_pdf_page_count (for usage logs) reflects the full document page count,
+ # not a subset (e.g. when EFFICIENT_OCR only processes OCR-needed pages or when
+ # loading from existing Textract JSON).
+ total_pages_for_usage_log = number_of_pages
+ if not isinstance(pymupdf_doc, list) and hasattr(pymupdf_doc, "page_count"):
+ _doc_pages = pymupdf_doc.page_count
+ if _doc_pages and _doc_pages > 0:
+ total_pages_for_usage_log = _doc_pages
+
+ estimated_time_taken_state = round(estimated_time_taken_state, 1)
+
+ # Only pass existing paths to Gradio for any path lists used by file components
+ duplication_file_path_outputs = [
+ p
+ for p in duplication_file_path_outputs
+ if isinstance(p, str) and os.path.exists(p)
+ ]
+ ocr_review_files = [
+ p for p in ocr_review_files if isinstance(p, str) and os.path.exists(p)
+ ]
+
+ return (
+ combined_out_message,
+ out_file_paths,
+ out_file_paths,
+ latest_file_completed,
+ log_files_output_paths,
+ log_files_output_paths,
+ estimated_time_taken_state,
+ all_request_metadata_str,
+ pymupdf_doc,
+ annotations_all_pages_divide,
+ current_loop_page,
+ page_break_return,
+ all_page_line_level_ocr_results_df,
+ all_pages_decision_process_table,
+ comprehend_query_number,
+ review_out_file_paths,
+ annotate_max_pages,
+ annotate_max_pages,
+ prepared_pdf_file_paths,
+ pdf_image_file_paths,
+ review_file_state,
+ page_sizes,
+ duplication_file_path_outputs,
+ duplication_file_path_outputs, # Write ocr_file_path to in_duplicate_pages
+ duplication_file_path_outputs, # Write ocr_file_path to in_summarisation_ocr_files
+ review_file_path,
+ total_textract_query_number,
+ ocr_file_path,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ all_page_line_level_ocr_results_with_words_df,
+ review_file_state,
+ task_textbox,
+ ocr_review_files,
+ vlm_model_name,
+ vlm_total_input_tokens,
+ vlm_total_output_tokens,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ total_pages_for_usage_log,
+ )
+
+
+def convert_pikepdf_coords_to_pymupdf(
+ pymupdf_page: Page, pikepdf_bbox, type="pikepdf_annot"
+):
+ """
+ Convert annotation coordinates from pikepdf to pymupdf format, handling cases where the mediabox is larger than the page rect.
+ """
+ # page.rect is the visible (cropbox) rectangle; the mediabox may be larger
+ reference_box = pymupdf_page.rect
+ mediabox = pymupdf_page.mediabox
+
+ reference_box_height = reference_box.height
+ reference_box_width = reference_box.width
+
+ # Convert PyMuPDF coordinates back to PDF coordinates (bottom-left origin)
+ media_height = mediabox.height
+ media_width = mediabox.width
+
+ media_reference_y_diff = media_height - reference_box_height
+ media_reference_x_diff = media_width - reference_box_width
+
+ y_diff_ratio = media_reference_y_diff / reference_box_height
+ x_diff_ratio = media_reference_x_diff / reference_box_width
+
+ # Extract the annotation rectangle field (pikepdf may use /Rect or Name.Rect)
+ if type == "pikepdf_annot":
+ try:
+ rect_field = pikepdf_bbox["/Rect"]
+ except (KeyError, TypeError):
+ try:
+ rect_field = pikepdf_bbox[Name.Rect]
+ except (KeyError, TypeError):
+ raise ValueError("pikepdf annotation has no /Rect") from None
+ else:
+ rect_field = pikepdf_bbox
+
+ rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
+
+ # Unpack coordinates
+ x1, y1, x2, y2 = rect_coordinates
+
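+ # Flip Y to go from the PDF bottom-left origin to PyMuPDF's top-left origin,
+ # then compensate for any difference between the mediabox and the reference box.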
+ new_x1 = x1 - (media_reference_x_diff * x_diff_ratio)
+ new_y1 = media_height - y2 - (media_reference_y_diff * y_diff_ratio)
+ new_x2 = x2 - (media_reference_x_diff * x_diff_ratio)
+ new_y2 = media_height - y1 - (media_reference_y_diff * y_diff_ratio)
+
+ return new_x1, new_y1, new_x2, new_y2
+
+
+def convert_pikepdf_to_image_coords(
+ pymupdf_page, annot, image: Image, type="pikepdf_annot"
+):
+ """
+ Convert annotations from pikepdf coordinates to image coordinates.
+ """
+
+ # Get the dimensions of the page in points with pymupdf
+ rect_height = pymupdf_page.rect.height
+ rect_width = pymupdf_page.rect.width
+
+ # Get the dimensions of the image
+ image_page_width, image_page_height = image.size
+
+ # Calculate scaling factors between pymupdf and PIL image
+ scale_width = image_page_width / rect_width
+ scale_height = image_page_height / rect_height
+
+ # Extract the /Rect field
+ if type == "pikepdf_annot":
+ rect_field = annot["/Rect"]
+ else:
+ rect_field = annot
+
+ # Convert the extracted /Rect field to a list of floats
+ rect_coordinates = [float(coord) for coord in rect_field]
+
+ # Convert the Y-coordinates (flip using the image height)
+ x1, y1, x2, y2 = rect_coordinates
+ x1_image = round(x1 * scale_width, 2)
+ new_y1_image = round(
+ image_page_height - (y2 * scale_height), 2
+ ) # Flip Y0 (since it starts from bottom)
+ x2_image = round(x2 * scale_width, 2)
+ new_y2_image = round(image_page_height - (y1 * scale_height), 2) # Flip Y1
+
+ return x1_image, new_y1_image, x2_image, new_y2_image
+
+
+def convert_pikepdf_decision_output_to_image_coords(
+ pymupdf_page: Page, pikepdf_decision_ouput_data: List[dict], image: Image
+):
+ if isinstance(image, str):
+ # Normalize and validate path safety before opening image
+ normalized_path = os.path.normpath(os.path.abspath(image))
+ if validate_path_containment(normalized_path, INPUT_FOLDER):
+ image_path = normalized_path
+ image = Image.open(image_path)
+ else:
+ raise ValueError(f"Unsafe image path detected: {image}")
+
+ # Loop through each item in the data
+ for item in pikepdf_decision_ouput_data:
+ # Extract the bounding box
+ bounding_box = item["boundingBox"]
+
+ # Create a pikepdf_bbox dictionary to match the expected input
+ pikepdf_bbox = {"/Rect": bounding_box}
+
+ # Call the conversion function
+ new_x1, new_y1, new_x2, new_y2 = convert_pikepdf_to_image_coords(
+ pymupdf_page, pikepdf_bbox, image, type="pikepdf_annot"
+ )
+
+ # Update the original object with the new bounding box values
+ item["boundingBox"] = [new_x1, new_y1, new_x2, new_y2]
+
+ return pikepdf_decision_ouput_data
+
+
+def _rect_display_to_unrotated(page: Page, rect: Rect) -> Rect:
+ """
+ Convert a rectangle from display (page.rect) space to unrotated page space.
+ PyMuPDF requires coordinates in unrotated space for add_redact_annot etc.;
+ we build rects by scaling image coords to page.rect, which is in display space when the page is rotated.
+ """
+ try:
+ derot = getattr(page, "derotation_matrix", None)
+ if derot is not None:
+ rect = rect * derot
+ rect = rect.normalize()
+ except Exception:
+ pass
+ return rect
+
+
+def convert_image_coords_to_pymupdf(
+ pymupdf_page: Page, annot: dict, image: Image, type: str = "image_recognizer"
+):
+ """
+ Converts an image with redaction coordinates from a CustomImageRecognizerResult or pikepdf object with image coordinates to pymupdf coordinates.
+
+ Images are always rendered from the MediaBox (via _render_pdf_page_to_png_pymupdf_mediabox),
+ so scaling must use the MediaBox dimensions. When CropBox != MediaBox the cropbox offset
+ is subtracted at the end so that the returned coordinates are in the page's current
+ CropBox-local coordinate system (what PyMuPDF expects for add_redact_annot / Rect).
+ When CropBox == MediaBox the offset is zero and behaviour is unchanged.
+ """
+
+ # Use MediaBox dimensions for scaling (the image was rendered from the MediaBox).
+ mediabox_height = pymupdf_page.mediabox.height
+ mediabox_width = pymupdf_page.mediabox.width
+
+ image_page_width, image_page_height = image.size
+
+ # Calculate scaling factors between PIL image and MediaBox
+ scale_width = mediabox_width / image_page_width
+ scale_height = mediabox_height / image_page_height
+
+ # Offset needed to convert from MediaBox-local to CropBox-local coordinates.
+ # cropbox.x0 is always 0 after PyMuPDF normalises the coordinate system, so
+ # cropbox_x_off = -mediabox.x0 (positive when CropBox is to the right of MediaBox origin).
+ cropbox_x_off = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
+ cropbox_y_off = pymupdf_page.cropbox.y0 - pymupdf_page.mediabox.y0
+
+ # Calculate scaled coordinates
+ if type == "image_recognizer":
+ x1 = annot.left * scale_width
+ new_y1 = annot.top * scale_height
+ x2 = (annot.left + annot.width) * scale_width
+ new_y2 = (annot.top + annot.height) * scale_height
+ # Else assume it is a pikepdf derived object
+ else:
+ rect_field = annot["/Rect"]
+ rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
+
+ # Unpack coordinates
+ x1, y1, x2, y2 = rect_coordinates
+
+ # Scale each coordinate independently (x by the width factor, y by the height factor)
+ new_y1 = y1 * scale_height
+ new_y2 = y2 * scale_height
+ x2 = x2 * scale_width
+ x1 = x1 * scale_width
+
+ # Convert from MediaBox-local to CropBox-local (no-op when they are equal)
+ x1 -= cropbox_x_off
+ new_y1 -= cropbox_y_off
+ x2 -= cropbox_x_off
+ new_y2 -= cropbox_y_off
+
+ return x1, new_y1, x2, new_y2
+
+
+def convert_gradio_image_annotator_object_coords_to_pymupdf(
+ pymupdf_page: Page, annot: dict, image: Image, image_dimensions: dict = None
+):
+ """
+ Converts an image with redaction coordinates from a gradio annotation component to pymupdf coordinates.
+
+ Images are always rendered from the MediaBox, so MediaBox dimensions are used for scaling.
+ The CropBox offset is subtracted at the end to produce CropBox-local coordinates.
+ When CropBox == MediaBox the offset is zero and behaviour is unchanged.
+ """
+
+ # Use MediaBox dimensions for scaling (the image was rendered from the MediaBox).
+ mediabox_height = pymupdf_page.mediabox.height
+ mediabox_width = pymupdf_page.mediabox.width
+
+ if image_dimensions:
+ image_page_width = image_dimensions["image_width"]
+ image_page_height = image_dimensions["image_height"]
+ elif image:
+ image_page_width, image_page_height = image.size
+ else:
+ raise ValueError(
+ "Either image_dimensions or image must be provided to determine the page image size"
+ )
+
+ # Calculate scaling factors between PIL image and MediaBox
+ scale_width = mediabox_width / image_page_width
+ scale_height = mediabox_height / image_page_height
+
+ # Offset to convert from MediaBox-local to CropBox-local (zero when they are equal)
+ cropbox_x_off = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
+ cropbox_y_off = pymupdf_page.cropbox.y0 - pymupdf_page.mediabox.y0
+
+ # Calculate scaled coordinates (MediaBox-local)
+ x1 = annot["xmin"] * scale_width
+ new_y1 = annot["ymin"] * scale_height
+ x2 = annot["xmax"] * scale_width
+ new_y2 = annot["ymax"] * scale_height
+
+ # Convert to CropBox-local
+ x1 -= cropbox_x_off
+ new_y1 -= cropbox_y_off
+ x2 -= cropbox_x_off
+ new_y2 -= cropbox_y_off
+
+ return x1, new_y1, x2, new_y2
+
+
+def move_page_info(file_path: str) -> str:
+ # Split the string at '.pdf'
+ base, extension = file_path.rsplit(".pdf", 1)
+
+ # Extract the page info
+ page_info = base.split("page ")[1].split(" of")[0] # Get the page number
+ new_base = base.replace(
+ f"page {page_info} of ", ""
+ ) # Remove the page info from the original position
+
+ # Construct the new file path
+ new_file_path = f"{new_base}_page_{page_info}.png"
+
+ return new_file_path
+
+
+def prepare_custom_image_recogniser_result_annotation_box(
+ page: Page,
+ annot: CustomImageRecognizerResult,
+ image: Image,
+ page_sizes_df: pd.DataFrame,
+ custom_colours: bool = USE_GUI_BOX_COLOURS_FOR_OUTPUTS,
+):
+ """
+ Prepare an image annotation box and coordinates based on a CustomImageRecogniserResult, PyMuPDF page, and PIL Image.
+ """
+
+ img_annotation_box = dict()
+
+ # For efficient lookup, set 'page' as index if it's not already
+ if "page" in page_sizes_df.columns:
+ page_sizes_df = page_sizes_df.set_index("page")
+
+ # PyMuPDF page numbers are 0-based, DataFrame index assumed 1-based
+ page_num_one_based = page.number + 1
+
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = 0, 0, 0, 0 # Initialize defaults
+
+ if image:
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = (
+ convert_image_coords_to_pymupdf(page, annot, image)
+ )
+
+ else:
+ # --- Calculate coordinates when no image is present ---
+ # CustomImageRecognizerResult from OCR/deny-list have coords in image pixel space.
+ # If we have image_width/image_height in page_sizes_df, scale from pixels to PDF.
+ # Otherwise assume annot coords are relative to MediaBox (text-path/legacy).
+ try:
+ page_info = page_sizes_df.loc[page_num_one_based]
+ rect_width = page.rect.width
+ rect_height = page.rect.height
+ img_w = page_info.get("image_width")
+ img_h = page_info.get("image_height")
+ if (
+ img_w is not None
+ and img_h is not None
+ and pd.notna(img_w)
+ and pd.notna(img_h)
+ and float(img_w) > 0
+ and float(img_h) > 0
+ ):
+ # Image pixel coords (from OCR/CUSTOM) -> scale to PDF.
+ # Images are always rendered from the MediaBox, so use MediaBox dimensions
+ # for scaling, then subtract the CropBox offset to get CropBox-local coords.
+ mb_w = page_info.get("mediabox_width") or rect_width
+ mb_h = page_info.get("mediabox_height") or rect_height
+ x_off = float(page_info.get("cropbox_x_offset") or 0)
+ y_off = float(page_info.get("cropbox_y_offset_from_top") or 0)
+ scale_x = float(mb_w) / float(img_w)
+ scale_y = float(mb_h) / float(img_h)
+ pymupdf_x1 = annot.left * scale_x - x_off
+ pymupdf_y1 = annot.top * scale_y - y_off
+ pymupdf_x2 = (annot.left + annot.width) * scale_x - x_off
+ pymupdf_y2 = (annot.top + annot.height) * scale_y - y_off
+ else:
+ # MediaBox-relative (top-left origin), e.g. text-path
+ mb_width = page_info["mediabox_width"]
+ mb_height = page_info["mediabox_height"]
+ x_offset = page_info["cropbox_x_offset"]
+ y_offset = page_info["cropbox_y_offset_from_top"]
+ if mb_width <= 0 or mb_height <= 0:
+ print(
+ f"Warning: Invalid MediaBox dimensions ({mb_width}x{mb_height}) for page {page_num_one_based}. Setting coords to 0."
+ )
+ else:
+ pymupdf_x1 = annot.left - x_offset
+ pymupdf_x2 = annot.left + annot.width - x_offset
+ pymupdf_y1 = annot.top - y_offset
+ pymupdf_y2 = annot.top + annot.height - y_offset
+
+ except KeyError:
+ print(
+ f"Warning: Page number {page_num_one_based} not found in page_sizes_df. Cannot get MediaBox dimensions. Setting coords to 0."
+ )
+ except AttributeError as e:
+ print(
+ f"Error accessing attributes ('left', 'top', etc.) on 'annot' object for page {page_num_one_based}: {e}"
+ )
+ except Exception as e:
+ print(
+ f"Error during coordinate calculation for page {page_num_one_based}: {e}"
+ )
+
+ rect = Rect(
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2
+ ) # Create the PyMuPDF Rect (in display space when built from image)
+ rect = _rect_display_to_unrotated(page, rect)
+
+ # Now creating image annotation object
+ image_x1 = annot.left
+ image_x2 = annot.left + annot.width
+ image_y1 = annot.top
+ image_y2 = annot.top + annot.height
+
+ # Create image annotation boxes
+ img_annotation_box["xmin"] = image_x1
+ img_annotation_box["ymin"] = image_y1
+ img_annotation_box["xmax"] = image_x2 # annot.left + annot.width
+ img_annotation_box["ymax"] = image_y2 # annot.top + annot.height
+ img_annotation_box["color"] = (
+ annot.color if custom_colours is True else CUSTOM_BOX_COLOUR
+ )
+ try:
+ img_annotation_box["label"] = str(annot.entity_type)
+ except Exception as e:
+ print(f"Error getting entity type: {e}")
+ img_annotation_box["label"] = "Redaction"
+
+ if hasattr(annot, "text") and annot.text:
+ img_annotation_box["text"] = str(annot.text)
+ else:
+ img_annotation_box["text"] = ""
+
+ # Assign an id
+ img_annotation_box = fill_missing_box_ids(img_annotation_box)
+
+ return img_annotation_box, rect
+
+
+def convert_pikepdf_annotations_to_result_annotation_box(
+ page: Page,
+ annot: dict,
+ image: Image = None,
+ convert_pikepdf_to_pymupdf_coords: bool = True,
+ page_sizes_df: pd.DataFrame = pd.DataFrame(),
+ image_dimensions: dict = dict(),
+):
+ """
+    Convert redaction objects with pikepdf coordinates into annotation boxes that PyMuPDF can use to redact the document: first converts pikepdf coordinates to pymupdf coordinates, then converts those to image coordinates if the page is represented as an image.
+ """
+ img_annotation_box = dict()
+ page_no = page.number
+
+ if convert_pikepdf_to_pymupdf_coords is True:
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = (
+ convert_pikepdf_coords_to_pymupdf(page, annot)
+ )
+ else:
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = (
+ convert_image_coords_to_pymupdf(
+ page, annot, image, type="pikepdf_image_coords"
+ )
+ )
+
+ rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2)
+ if not convert_pikepdf_to_pymupdf_coords:
+ # Coords from image are in display space when page is rotated
+ rect = _rect_display_to_unrotated(page, rect)
+
+ # If an image is provided, convert PyMuPDF coordinates to image coordinates
+ # for the annotation box (used in review PDF)
+ if image is not None:
+ # Convert PyMuPDF coordinates to image pixel coordinates
+ image_x1, image_y1, image_x2, image_y2 = convert_pymupdf_to_image_coords(
+ page,
+ pymupdf_x1,
+ pymupdf_y1,
+ pymupdf_x2,
+ pymupdf_y2,
+ image,
+ image_dimensions,
+ )
+ # Use image coordinates for the annotation box
+ img_annotation_box["xmin"] = image_x1
+ img_annotation_box["ymin"] = image_y1
+ img_annotation_box["xmax"] = image_x2
+ img_annotation_box["ymax"] = image_y2
+ else:
+ # If no image, use PyMuPDF coordinates (PDF points). Store 1-based page on box
+ # so convert_annotation_data_to_dataframe can join to correct mediabox row (EFFICIENT_OCR).
+ convert_df = pd.DataFrame(
+ {
+ "page": [page_no],
+ "xmin": [pymupdf_x1],
+ "ymin": [pymupdf_y1],
+ "xmax": [pymupdf_x2],
+ "ymax": [pymupdf_y2],
+ }
+ )
+ converted_df = convert_df # divide_coordinates_by_page_sizes(convert_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
+ img_annotation_box["xmin"] = converted_df["xmin"].max()
+ img_annotation_box["ymin"] = converted_df["ymin"].max()
+ img_annotation_box["xmax"] = converted_df["xmax"].max()
+ img_annotation_box["ymax"] = converted_df["ymax"].max()
+ img_annotation_box["page"] = (
+ page_no + 1
+ ) # 1-based for divide_coordinates_by_page_sizes
+
+ img_annotation_box["color"] = CUSTOM_BOX_COLOUR
+
+ if isinstance(annot, Dictionary):
+ img_annotation_box["label"] = str(annot["/T"])
+
+ if hasattr(annot, "Contents"):
+ img_annotation_box["text"] = str(annot.Contents)
+ else:
+ img_annotation_box["text"] = ""
+ else:
+ img_annotation_box["label"] = "REDACTION"
+ img_annotation_box["text"] = ""
+
+ return img_annotation_box, rect
+
+
+def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
+ """
+ Sets the cropbox of a PyMuPDF page safely and defensively.
+
+ If the 'original_cropbox' is valid (i.e., a pymupdf.Rect instance, not None, not empty,
+ not infinite, and fully contained within the page's mediabox), it is set as the cropbox.
+
+    Otherwise, the page's mediabox is used as the cropbox instead.
+
+ Args:
+ page: The PyMuPDF page object.
+ original_cropbox: The Rect representing the desired cropbox.
+ """
+ mediabox = page.mediabox
+ reason_for_defaulting = ""
+
+ # Check for None
+ if original_cropbox is None:
+ reason_for_defaulting = "the original cropbox is None."
+ # Check for incorrect type
+ elif not isinstance(original_cropbox, Rect):
+ reason_for_defaulting = f"the original cropbox is not a pymupdf.Rect instance (got {type(original_cropbox)})."
+ else:
+ # Normalise the cropbox (ensures x0 < x1 and y0 < y1)
+ original_cropbox.normalize()
+
+ # Check for empty or infinite or out-of-bounds
+ if original_cropbox.is_empty:
+ reason_for_defaulting = (
+ f"the provided original cropbox {original_cropbox} is empty."
+ )
+ elif original_cropbox.is_infinite:
+ reason_for_defaulting = (
+ f"the provided original cropbox {original_cropbox} is infinite."
+ )
+ elif not mediabox.contains(original_cropbox):
+ reason_for_defaulting = (
+ f"the provided original cropbox {original_cropbox} is not fully contained "
+ f"within the page's mediabox {mediabox}."
+ )
+
+ if reason_for_defaulting:
+ # print(
+ # f"Warning (Page {page.number}): Cannot use original cropbox because {reason_for_defaulting} "
+ # f"Defaulting to the page's mediabox as the cropbox."
+ # )
+ page.set_cropbox(mediabox)
+ else:
+ page.set_cropbox(original_cropbox)
+
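+# Minimal usage sketch for set_cropbox_safely (hypothetical file name and flow; the real
+# callers capture the cropbox before redaction work and restore it afterwards):
+#   doc = pymupdf.open("input.pdf")
+#   page = doc[0]
+#   saved_cropbox = Rect(page.cropbox)
+#   ... apply redactions to the page ...
+#   set_cropbox_safely(page, saved_cropbox)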
+
+def convert_color_to_range_0_1(color):
+ return tuple(component / 255 for component in color)
+
+
+def define_box_colour(
+ custom_colours: bool, img_annotation_box: dict, CUSTOM_BOX_COLOUR: tuple
+):
+ """
+ Determines the color for a bounding box annotation.
+
+ If `custom_colours` is True, it attempts to parse the color from `img_annotation_box['color']`.
+ It supports color strings in "(R,G,B)" format (0-255 integers) or tuples/lists of (R,G,B)
+ where components are either 0-1 floats or 0-255 integers.
+ If parsing fails or `custom_colours` is False, it defaults to `CUSTOM_BOX_COLOUR`.
+ All output colors are converted to a 0.0-1.0 float range.
+
+ Args:
+ custom_colours (bool): If True, attempts to use a custom color from `img_annotation_box`.
+ img_annotation_box (dict): A dictionary that may contain a 'color' key with the custom color.
+ CUSTOM_BOX_COLOUR (tuple): The default color to use if custom colors are not enabled or parsing fails.
+ Expected to be a tuple of (R, G, B) with values in the 0.0-1.0 range.
+
+ Returns:
+ tuple: A tuple (R, G, B) representing the chosen color, with components in the 0.0-1.0 float range.
+ """
+ if custom_colours is True:
+ color_input = img_annotation_box["color"]
+ out_colour = (0, 0, 0) # Initialize with a default black color (0.0-1.0 range)
+
+ if isinstance(color_input, str):
+ # Expected format: "(R,G,B)" where R,G,B are integers 0-255 (e.g., "(255,0,0)")
+ try:
+ # Remove parentheses and split by comma, then convert to integers
+ components_str = color_input.strip().strip("()").split(",")
+ colour_tuple_int = tuple(int(c.strip()) for c in components_str)
+
+ # Validate the parsed integer tuple
+ if len(colour_tuple_int) == 3 and not all(
+ 0 <= c <= 1 for c in colour_tuple_int
+ ):
+ out_colour = convert_color_to_range_0_1(colour_tuple_int)
+ elif len(colour_tuple_int) == 3 and all(
+ 0 <= c <= 1 for c in colour_tuple_int
+ ):
+ out_colour = colour_tuple_int
+ else:
+ print(
+ f"Warning: Invalid color string values or length for '{color_input}'. Expected (R,G,B) with R,G,B in 0-255. Defaulting to black."
+ )
+ except (ValueError, IndexError):
+ print(
+ f"Warning: Could not parse color string '{color_input}'. Expected '(R,G,B)' format. Defaulting to black."
+ )
+ elif isinstance(color_input, (tuple, list)) and len(color_input) == 3:
+ # Expected formats: (R,G,B) where R,G,B are either 0-1 floats or 0-255 integers
+ if all(isinstance(c, (int, float)) for c in color_input):
+ # Case 1: Components are already 0.0-1.0 floats
+ if all(isinstance(c, float) and 0.0 <= c <= 1.0 for c in color_input):
+ out_colour = tuple(color_input)
+                # Case 2: Components are 0-255 numbers; scale down to the 0-1 range
+                elif all(0 <= c <= 255 for c in color_input):
+ out_colour = convert_color_to_range_0_1(color_input)
+ else:
+ # Numeric values but not in expected 0-1 float or 0-255 integer ranges
+ print(
+ f"Warning: Invalid color tuple/list values {color_input}. Expected (R,G,B) with R,G,B in 0-1 floats or 0-255 integers. Defaulting to black."
+ )
+ else:
+ # Contains non-numeric values (e.g., (1, 'a', 3))
+ print(
+ f"Warning: Color tuple/list {color_input} contains non-numeric values. Defaulting to black."
+ )
+ else:
+ # Catch-all for any other unexpected format (e.g., None, dict, etc.)
+ print(
+ f"Warning: Unexpected color format for {color_input}. Expected string '(R,G,B)' or tuple/list (R,G,B). Defaulting to black."
+ )
+
+ # Final safeguard: Ensure out_colour is always a valid PyMuPDF color tuple (3 floats 0.0-1.0)
+ if not (
+ isinstance(out_colour, tuple)
+ and len(out_colour) == 3
+ and all(isinstance(c, float) and 0.0 <= c <= 1.0 for c in out_colour)
+ ):
+ out_colour = (0.0, 0.0, 0.0)
+ else:
+ if CUSTOM_BOX_COLOUR:
+ # Should be a tuple of three integers between 0 and 255 from config
+ if (
+ isinstance(CUSTOM_BOX_COLOUR, (tuple, list))
+ and len(CUSTOM_BOX_COLOUR) >= 3
+ ):
+ # Convert from 0-255 range to 0-1 range
+ out_colour = tuple(
+ float(component / 255) if component >= 1 else float(component)
+ for component in CUSTOM_BOX_COLOUR[:3]
+ )
+ else:
+ out_colour = (0.0, 0.0, 0.0)
+
+ # PyMuPDF requires 1, 3 or 4 float components in 0-1; ensure tuple of floats
+ if isinstance(out_colour, (tuple, list)) and len(out_colour) in (3, 4):
+ out_colour = tuple(float(max(0.0, min(1.0, c))) for c in out_colour)
+ else:
+ out_colour = (0.0, 0.0, 0.0)
+ return out_colour
+
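+# Worked examples for define_box_colour (illustrative inputs only):
+#   define_box_colour(True, {"color": "(255, 0, 0)"}, (0, 0, 0))   -> (1.0, 0.0, 0.0)
+#   define_box_colour(True, {"color": (0.2, 0.4, 0.6)}, (0, 0, 0)) -> (0.2, 0.4, 0.6)
+# With custom_colours=False, CUSTOM_BOX_COLOUR (expected as 0-255 ints from config) is scaled to 0-1.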
+
+def redact_single_box(
+ pymupdf_page: Page,
+ pymupdf_rect: Rect,
+ img_annotation_box: dict,
+ custom_colours: bool = USE_GUI_BOX_COLOURS_FOR_OUTPUTS,
+ retain_text: bool = RETURN_PDF_FOR_REVIEW,
+ return_pdf_end_of_redaction: bool = RETURN_REDACTED_PDF,
+):
+ """
+ Commit redaction boxes to a PyMuPDF page.
+
+ Args:
+ pymupdf_page (Page): The PyMuPDF page object to which the redaction will be applied.
+ pymupdf_rect (Rect): The PyMuPDF rectangle defining the bounds of the redaction box.
+ img_annotation_box (dict): A dictionary containing annotation details, such as label, text, and color.
+ custom_colours (bool, optional): If True, uses custom colors for the final redacted PDF
+ (..._redacted.pdf). The review PDF (..._redactions_for_review.pdf)
+ always uses custom colours. Defaults to USE_GUI_BOX_COLOURS_FOR_OUTPUTS.
+ retain_text (bool, optional): If True, adds a redaction annotation but retains the underlying text.
+ If False, the text within the redaction area is deleted.
+ Defaults to RETURN_PDF_FOR_REVIEW.
+ return_pdf_end_of_redaction (bool, optional): If True, returns both review and final redacted page objects.
+ Defaults to RETURN_REDACTED_PDF.
+
+ Returns:
+ Page or Tuple[Page, Page]: If return_pdf_end_of_redaction is True and retain_text is True,
+ returns a tuple of (review_page, applied_redaction_page). Otherwise returns a single Page.
+ """
+
+ pymupdf_x1 = pymupdf_rect[0]
+ pymupdf_y1 = pymupdf_rect[1]
+ pymupdf_x2 = pymupdf_rect[2]
+ pymupdf_y2 = pymupdf_rect[3]
+
+ # Full size redaction box for covering all the text of a word
+ full_size_redaction_box = Rect(
+ pymupdf_x1 - 1, pymupdf_y1 - 1, pymupdf_x2 + 1, pymupdf_y2 + 1
+ )
+
+ # Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
+ redact_bottom_y = pymupdf_y1 + 2
+ redact_top_y = pymupdf_y2 - 2
+
+ # Calculate the middle y value and set a small height if default values are too close together
+ if (redact_top_y - redact_bottom_y) < 1:
+ middle_y = (pymupdf_y1 + pymupdf_y2) / 2
+ redact_bottom_y = middle_y - 1
+ redact_top_y = middle_y + 1
+
+ rect_small_pixel_height = Rect(
+ pymupdf_x1 + 2, redact_bottom_y, pymupdf_x2 - 2, redact_top_y
+ ) # Slightly smaller than outside box
+
+ # Review PDF (..._redactions_for_review.pdf): always use custom colours
+ review_colour = define_box_colour(True, img_annotation_box, CUSTOM_BOX_COLOUR)
+ # Final redacted PDF (..._redacted.pdf): respect USE_GUI_BOX_COLOURS_FOR_OUTPUTS
+ output_colour = define_box_colour(
+ custom_colours, img_annotation_box, CUSTOM_BOX_COLOUR
+ )
+
+ img_annotation_box["text"] = img_annotation_box.get("text") or ""
+ img_annotation_box["label"] = img_annotation_box.get("label") or "Redaction"
+
+ # Create a copy of the page for final redaction if needed
+ applied_redaction_page = None
+ if return_pdf_end_of_redaction and retain_text:
+ # Create a deep copy of the page for final redaction
+
+ applied_redaction_page = pymupdf.open()
+ applied_redaction_page.insert_pdf(
+ pymupdf_page.parent,
+ from_page=pymupdf_page.number,
+ to_page=pymupdf_page.number,
+ )
+ applied_redaction_page = applied_redaction_page[0]
+
+ # Handle review page first, then deal with final redacted page (retain_text = True)
+ if retain_text is True:
+
+ annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
+ annot.set_colors(stroke=review_colour, fill=review_colour, colors=review_colour)
+ annot.set_name(img_annotation_box["label"])
+ # Cache creationDate per second to avoid thousands of strftime calls per document
+ now_sec = int(time.time())
+ if not hasattr(redact_single_box, "_creation_date_cache"):
+ redact_single_box._creation_date_cache = [0, ""]
+ if redact_single_box._creation_date_cache[0] != now_sec:
+ redact_single_box._creation_date_cache[0] = now_sec
+ redact_single_box._creation_date_cache[1] = datetime.now().strftime(
+ "%Y%m%d%H%M%S"
+ )
+ annot.set_info(
+ info=img_annotation_box["label"],
+ title=img_annotation_box["label"],
+ subject=img_annotation_box["label"],
+ content=img_annotation_box["text"],
+ creationDate=redact_single_box._creation_date_cache[1],
+ )
+ annot.update(opacity=0.5, cross_out=False)
+
+ # If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
+ if return_pdf_end_of_redaction and applied_redaction_page is not None:
+ # Apply final redaction to the copy
+
+ # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
+ applied_redaction_page.add_redact_annot(rect_small_pixel_height)
+
+ # Only create a box over the whole rect if we want to delete the text
+ shape = applied_redaction_page.new_shape()
+ shape.draw_rect(pymupdf_rect)
+
+ # Use solid fill for normal redaction
+ shape.finish(color=output_colour, fill=output_colour)
+ shape.commit()
+
+ return pymupdf_page, applied_redaction_page
+ else:
+ return pymupdf_page
+
+ # If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
+ else:
+ # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
+ pymupdf_page.add_redact_annot(rect_small_pixel_height)
+
+ # Only create a box over the whole rect if we want to delete the text
+ shape = pymupdf_page.new_shape()
+ shape.draw_rect(pymupdf_rect)
+
+ # PyMuPDF requires 1, 3 or 4 float components in range 0-1
+ _colour = output_colour
+ if isinstance(_colour, (tuple, list)) and len(_colour) in (3, 4):
+ _colour = tuple(float(max(0.0, min(1.0, c))) for c in _colour)
+ else:
+ _colour = (0.0, 0.0, 0.0)
+ shape.finish(color=_colour, fill=_colour)
+ shape.commit()
+
+ return pymupdf_page
+
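+# Illustrative call to redact_single_box (hypothetical page, rect and box values; mirrors the
+# page loop further below). With retain_text=True and return_pdf_end_of_redaction=True a
+# (review_page, final_redacted_page) tuple is returned:
+#   review_page, final_page = redact_single_box(
+#       page, Rect(100, 200, 180, 215),
+#       {"label": "PERSON", "text": "Jane", "color": (255, 0, 0)},
+#       custom_colours=True, retain_text=True, return_pdf_end_of_redaction=True,
+#   )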
+
+def redact_whole_pymupdf_page(
+ rect_height: float,
+ rect_width: float,
+ page: Page,
+ custom_colours: bool = False,
+ border: float = 5,
+ redact_pdf: bool = True,
+):
+ """
+ Redacts a whole page of a PDF document.
+
+ Args:
+ rect_height (float): The height of the page in points.
+ rect_width (float): The width of the page in points.
+ page (Page): The PyMuPDF page object to be redacted.
+ custom_colours (bool, optional): If True, uses custom colors for the redaction box.
+ border (float, optional): The border width in points. Defaults to 5.
+ redact_pdf (bool, optional): If True, redacts the PDF document. Defaults to True.
+ """
+ # Small border to page that remains white
+
+ # Define the coordinates for the Rect (PDF coordinates for actual redaction)
+ whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
+ whole_page_x2, whole_page_y2 = (
+ rect_width - border,
+ rect_height - border,
+ ) # Top-right corner
+
+ # Create new image annotation element based on whole page coordinates
+ whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
+
+ # Calculate relative coordinates for the annotation box (0-1 range)
+ # This ensures the coordinates are already in relative format for output files
+ relative_border = border / min(
+ rect_width, rect_height
+ ) # Scale border proportionally
+ relative_x1 = relative_border
+ relative_y1 = relative_border
+ relative_x2 = 1 - relative_border
+ relative_y2 = 1 - relative_border
+
+ # Write whole page annotation to annotation boxes using relative coordinates
+ whole_page_img_annotation_box = dict()
+ whole_page_img_annotation_box["xmin"] = relative_x1
+ whole_page_img_annotation_box["ymin"] = relative_y1
+ whole_page_img_annotation_box["xmax"] = relative_x2
+ whole_page_img_annotation_box["ymax"] = relative_y2
+ # Match word-level redactions: define_box_colour uses this when GUI/output colours are on.
+ whole_page_img_annotation_box["color"] = CUSTOM_BOX_COLOUR
+ whole_page_img_annotation_box["label"] = "Whole page"
+
+ if redact_pdf is True:
+ redact_single_box(
+ page, whole_page_rect, whole_page_img_annotation_box, custom_colours
+ )
+
+ return whole_page_img_annotation_box
+
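+# Worked example of the relative coordinates above (illustrative page size): for a
+# 595 x 842 pt page with border = 5, relative_border = 5 / 595 ~= 0.0084, so the stored
+# annotation box spans roughly (0.0084, 0.0084) to (0.9916, 0.9916).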
+
+def _merge_adjacent_redaction_boxes(
+ boxes: List[dict],
+ threshold: float = 2.0,
+) -> List[dict]:
+ """
+    Merge overlapping or nearly touching dict-style redaction boxes into fewer,
+    larger rectangles. Boxes must have xmin, ymin, xmax and ymax keys; other keys
+    are taken from the first box in each merged group. Any two boxes that overlap
+    or lie within `threshold` coordinate units of each other are merged repeatedly
+    until no further merges are possible, and the merged list is returned.
+ """
+ if not boxes or len(boxes) <= 1:
+ return list(boxes)
+
+ def _union(b1: dict, b2: dict) -> dict:
+ out = dict(b1)
+ out["xmin"] = min(b1["xmin"], b2["xmin"])
+ out["ymin"] = min(b1["ymin"], b2["ymin"])
+ out["xmax"] = max(b1["xmax"], b2["xmax"])
+ out["ymax"] = max(b1["ymax"], b2["ymax"])
+ return out
+
+ def _overlap_or_close(b1: dict, b2: dict) -> bool:
+ if b1["xmax"] + threshold < b2["xmin"] or b2["xmax"] + threshold < b1["xmin"]:
+ return False
+ if b1["ymax"] + threshold < b2["ymin"] or b2["ymax"] + threshold < b1["ymin"]:
+ return False
+ return True
+
+ merged = list(boxes)
+ while True:
+ changed = False
+ for i in range(len(merged)):
+ for j in range(i + 1, len(merged)):
+ if _overlap_or_close(merged[i], merged[j]):
+ merged[i] = _union(merged[i], merged[j])
+ merged.pop(j)
+ changed = True
+ break
+ if changed:
+ break
+ if not changed:
+ break
+ return merged
+
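+# Worked example for _merge_adjacent_redaction_boxes (illustrative boxes): with the default
+# threshold of 2.0, {"xmin": 0, "ymin": 0, "xmax": 10, "ymax": 10} and
+# {"xmin": 11, "ymin": 0, "xmax": 20, "ymax": 10} lie within the threshold horizontally and
+# merge into a single box {"xmin": 0, "ymin": 0, "xmax": 20, "ymax": 10}.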
+
+def redact_page_with_pymupdf(
+ page: Page,
+ page_annotations: dict,
+ image: Image = None,
+ custom_colours: bool = USE_GUI_BOX_COLOURS_FOR_OUTPUTS,
+ redact_whole_page: bool = False,
+ convert_pikepdf_to_pymupdf_coords: bool = True,
+ original_cropbox: List[Rect] = list(),
+ page_sizes_df: pd.DataFrame = pd.DataFrame(),
+ return_pdf_for_review: bool = RETURN_PDF_FOR_REVIEW,
+ return_pdf_end_of_redaction: bool = RETURN_REDACTED_PDF,
+ input_folder: str = INPUT_FOLDER,
+ image_dimensions_override: Optional[dict] = None,
+ review_page: Optional[Page] = None,
+):
+ """
+ Applies redactions to a single PyMuPDF page based on provided annotations.
+
+ This function processes various types of annotations (Gradio, CustomImageRecognizerResult,
+ or pikepdf-like) and applies them as redactions to the given PyMuPDF page. It can also
+ redact the entire page if specified.
+
+ Args:
+ page (Page): The PyMuPDF page object to which redactions will be applied.
+ page_annotations (dict): A dictionary containing annotation data for the current page.
+ Expected to have a 'boxes' key with a list of annotation boxes.
+ image (Image, optional): A PIL Image object or path to an image file associated with the page.
+ Used for coordinate conversions if available. Defaults to None.
+ custom_colours (bool, optional): If True, custom box colors will be used for redactions.
+ Defaults to USE_GUI_BOX_COLOURS_FOR_OUTPUTS.
+ redact_whole_page (bool, optional): If True, the entire page will be redacted. Defaults to False.
+ convert_pikepdf_to_pymupdf_coords (bool, optional): If True, coordinates from pikepdf-like
+ annotations will be converted to PyMuPDF's
+ coordinate system. Defaults to True.
+ original_cropbox (List[Rect], optional): The original cropbox of the page. This is used
+ to restore the cropbox after redactions. Defaults to an empty list.
+ page_sizes_df (pd.DataFrame, optional): A DataFrame containing page size and image dimension
+ information, used for coordinate scaling. Defaults to an empty DataFrame.
+ return_pdf_for_review (bool, optional): If True, redactions are applied in a way suitable for
+ review (e.g., not removing underlying text/images completely).
+ Defaults to RETURN_PDF_FOR_REVIEW.
+ return_pdf_end_of_redaction (bool, optional): If True, returns both review and final redacted page objects.
+ Defaults to RETURN_REDACTED_PDF.
+ review_page (Page, optional): When provided, the same redactions are applied to this page (with text
+ retained for review) in a single pass, avoiding a second full annotation loop.
+
+ Returns:
+ Tuple[Page, dict] or Tuple[Tuple[Page, Page], dict]: A tuple containing:
+ - page (Page or Tuple[Page, Page]): The PyMuPDF page object(s) with redactions applied.
+ If return_pdf_end_of_redaction is True and return_pdf_for_review is True,
+ returns a tuple of (review_page, applied_redaction_page).
+ - out_annotation_boxes (dict): A dictionary containing the processed annotation boxes
+ for the page, including the image path.
+ """
+
+ rect_height = page.rect.height
+ rect_width = page.rect.width
+
+ mediabox_height = page.mediabox.height
+ mediabox_width = page.mediabox.width
+
+ page_no = page.number
+ page_num_reported = page_no + 1
+
+ # Use precomputed dimensions when provided (skips all DataFrame work on hot path)
+ if image_dimensions_override and isinstance(image_dimensions_override, dict):
+ image_dimensions = dict(image_dimensions_override)
+ else:
+ image_dimensions = dict()
+ # Only convert/lookup when caller did not pass dimensions (avoids O(n_pages) per call)
+ if not page_sizes_df.empty and "page" in page_sizes_df.columns:
+ if not pd.api.types.is_numeric_dtype(page_sizes_df["page"]):
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(
+ pd.to_numeric, errors="coerce"
+ )
+ if not image and "image_width" in page_sizes_df.columns:
+ if not pd.api.types.is_numeric_dtype(page_sizes_df["image_width"]):
+ page_sizes_df[["image_width"]] = page_sizes_df[["image_width"]].apply(
+ pd.to_numeric, errors="coerce"
+ )
+ if not pd.api.types.is_numeric_dtype(page_sizes_df["image_height"]):
+ page_sizes_df[["image_height"]] = page_sizes_df[["image_height"]].apply(
+ pd.to_numeric, errors="coerce"
+ )
+ image_dimensions["image_width"] = page_sizes_df.loc[
+ page_sizes_df["page"] == page_num_reported, "image_width"
+ ].max()
+ image_dimensions["image_height"] = page_sizes_df.loc[
+ page_sizes_df["page"] == page_num_reported, "image_height"
+ ].max()
+ if pd.isna(image_dimensions.get("image_width")):
+ image_dimensions = dict()
+
+ out_annotation_boxes = dict()
+ all_image_annotation_boxes = list()
+ dual_output_same_pass = (
+ return_pdf_end_of_redaction and return_pdf_for_review and review_page is None
+ )
+ applied_redaction_page = None
+ if dual_output_same_pass:
+ applied_redaction_doc = pymupdf.open()
+ applied_redaction_doc.insert_pdf(
+ page.parent,
+ from_page=page.number,
+ to_page=page.number,
+ )
+ applied_redaction_page = applied_redaction_doc[0]
+
+ if isinstance(image, Image.Image):
+ # Create an image path using the input folder with PDF filename
+ # Get the PDF filename from the page's parent document
+ pdf_filename = (
+ os.path.basename(page.parent.name)
+ if hasattr(page.parent, "name") and page.parent.name
+ else "document"
+ )
+ # Normalize and validate path safety before using in file path construction
+ normalized_filename = os.path.normpath(pdf_filename)
+ # Ensure the filename doesn't contain path traversal characters
+ if (
+ ".." in normalized_filename
+ or "/" in normalized_filename
+ or "\\" in normalized_filename
+ ):
+ normalized_filename = "document" # Fallback to safe default
+ image_path = os.path.join(
+ input_folder, f"{normalized_filename}_{page.number}.png"
+ )
+ # Skip disk save when dimensions were precomputed (avoids I/O; file not needed for lookup)
+ if not image_dimensions_override and not os.path.exists(image_path):
+ image.save(image_path)
+ elif isinstance(image, str):
+ # Normalize and validate path safety before checking existence.
+ # Use input_folder (caller's) so page images under output_folder can be loaded
+ # when the caller passes a folder that contains them (e.g. second redaction run).
+ normalized_path = os.path.normpath(os.path.abspath(image))
+ if validate_path_containment(normalized_path, input_folder):
+ image_path = normalized_path
+ image = Image.open(image_path)
+ elif "image_path" in page_sizes_df.columns:
+ try:
+ image_path = page_sizes_df.loc[
+ page_sizes_df["page"] == (page_no + 1), "image_path"
+ ].iloc[0]
+ except IndexError:
+ image_path = ""
+ image = None
+ else:
+ image_path = ""
+ image = None
+ else:
+ # print("image is not an Image object or string")
+ image_path = ""
+ image = None
+
+ # Check if this is an object used in the Gradio Annotation component
+ if isinstance(page_annotations, dict):
+ page_annotations = page_annotations["boxes"]
+
+ # Optional: merge overlapping/close dict boxes to reduce redact_single_box calls
+ if (
+ MERGE_SMALL_REDACTIONS
+ and page_annotations
+ and all(isinstance(b, dict) for b in page_annotations)
+ ):
+ page_annotations = _merge_adjacent_redaction_boxes(page_annotations)
+
+ for annot in page_annotations:
+ # Pikepdf redaction annotations (from text-path create_pikepdf_annotations_for_bounding_boxes)
+ # must be handled first; do not treat Dictionary as dict.
+ if isinstance(annot, Dictionary):
+ if not image:
+ convert_pikepdf_to_pymupdf_coords = True
+ img_annotation_box, rect = (
+ convert_pikepdf_annotations_to_result_annotation_box(
+ page,
+ annot,
+ image,
+ convert_pikepdf_to_pymupdf_coords,
+ page_sizes_df,
+ image_dimensions=image_dimensions,
+ )
+ )
+ img_annotation_box = fill_missing_box_ids(img_annotation_box)
+ all_image_annotation_boxes.append(img_annotation_box)
+ if review_page is not None:
+ redact_single_box(
+ page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ retain_text=False,
+ return_pdf_end_of_redaction=False,
+ )
+ redact_single_box(
+ review_page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ retain_text=True,
+ return_pdf_end_of_redaction=False,
+ )
+ elif dual_output_same_pass and applied_redaction_page is not None:
+ # Apply both review and final redactions cumulatively on the same page copies.
+ redact_single_box(
+ page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ retain_text=True,
+ return_pdf_end_of_redaction=False,
+ )
+ redact_single_box(
+ applied_redaction_page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ retain_text=False,
+ return_pdf_end_of_redaction=False,
+ )
+ else:
+ redact_result = redact_single_box(
+ page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ return_pdf_for_review,
+ return_pdf_end_of_redaction,
+ )
+ if isinstance(redact_result, tuple):
+ page, _ = redact_result
+ continue
+ # Check if an Image recogniser result, or a Gradio annotation object
+ if (isinstance(annot, CustomImageRecognizerResult)) or isinstance(annot, dict):
+
+ img_annotation_box = dict()
+
+ # Should already be in correct format if img_annotator_box is an input
+ if isinstance(annot, dict):
+ annot = fill_missing_box_ids(annot)
+ img_annotation_box = annot
+
+ # Whole page redactions: always use full-page rect so they are reliably
+ # included when applying from Review tab (e.g. redactions_for_review.pdf)
+ whole_page_border = 5
+ if img_annotation_box.get("label") == "Whole page":
+ pymupdf_x1 = 0 + whole_page_border
+ pymupdf_y1 = 0 + whole_page_border
+ pymupdf_x2 = rect_width - whole_page_border
+ pymupdf_y2 = rect_height - whole_page_border
+ rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2)
+ rect = _rect_display_to_unrotated(page, rect)
+ else:
+ box_coordinates = (
+ img_annotation_box["xmin"],
+ img_annotation_box["ymin"],
+ img_annotation_box["xmax"],
+ img_annotation_box["ymax"],
+ )
+
+ # Check if all coordinates are equal to or less than 1
+ are_coordinates_relative = all(
+ coord <= 1 for coord in box_coordinates
+ )
+
+ if are_coordinates_relative is True:
+ # Check if coordinates are relative, if so then multiply by mediabox size
+ pymupdf_x1 = img_annotation_box["xmin"] * mediabox_width
+ pymupdf_y1 = img_annotation_box["ymin"] * mediabox_height
+ pymupdf_x2 = img_annotation_box["xmax"] * mediabox_width
+ pymupdf_y2 = img_annotation_box["ymax"] * mediabox_height
+
+ elif image_dimensions or image:
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = (
+ convert_gradio_image_annotator_object_coords_to_pymupdf(
+ page, img_annotation_box, image, image_dimensions
+ )
+ )
+ else:
+ print(
+ "Could not convert image annotator coordinates in redact_page_with_pymupdf"
+ )
+ pymupdf_x1 = img_annotation_box["xmin"]
+ pymupdf_y1 = img_annotation_box["ymin"]
+ pymupdf_x2 = img_annotation_box["xmax"]
+ pymupdf_y2 = img_annotation_box["ymax"]
+
+ if "text" in annot and annot["text"]:
+ img_annotation_box["text"] = str(annot["text"])
+ else:
+ img_annotation_box["text"] = ""
+
+ rect = Rect(
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2
+ ) # Create the PyMuPDF Rect (display space when from image/gradio)
+ rect = _rect_display_to_unrotated(page, rect)
+
+ # Else should be CustomImageRecognizerResult
+ elif isinstance(annot, CustomImageRecognizerResult):
+ # print("annot is a CustomImageRecognizerResult")
+ img_annotation_box, rect = (
+ prepare_custom_image_recogniser_result_annotation_box(
+ page, annot, image, page_sizes_df, custom_colours
+ )
+ )
+
+ # Else it should be a pikepdf annotation object (handled above via Dictionary check)
+ else:
+ img_annotation_box, rect = (
+ convert_pikepdf_annotations_to_result_annotation_box(
+ page,
+ annot,
+ image,
+ convert_pikepdf_to_pymupdf_coords,
+ page_sizes_df,
+ image_dimensions=image_dimensions,
+ )
+ )
+ img_annotation_box = fill_missing_box_ids(img_annotation_box)
+
+ all_image_annotation_boxes.append(img_annotation_box)
+
+ # Redact the annotations from the document
+ if review_page is not None:
+ redact_single_box(
+ page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ retain_text=False,
+ return_pdf_end_of_redaction=False,
+ )
+ redact_single_box(
+ review_page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ retain_text=True,
+ return_pdf_end_of_redaction=False,
+ )
+ elif dual_output_same_pass and applied_redaction_page is not None:
+ # Apply both review and final redactions cumulatively on the same page copies.
+ redact_single_box(
+ page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ retain_text=True,
+ return_pdf_end_of_redaction=False,
+ )
+ redact_single_box(
+ applied_redaction_page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ retain_text=False,
+ return_pdf_end_of_redaction=False,
+ )
+ else:
+ redact_result = redact_single_box(
+ page,
+ rect,
+ img_annotation_box,
+ custom_colours,
+ return_pdf_for_review,
+ return_pdf_end_of_redaction,
+ )
+ # Handle dual page objects if returned
+ if isinstance(redact_result, tuple):
+ page, _ = redact_result
+
+ # If whole page is to be redacted, do that here
+ if redact_whole_page is True:
+
+ whole_page_img_annotation_box = redact_whole_pymupdf_page(
+ rect_height, rect_width, page, custom_colours, border=5
+ )
+ if review_page is not None:
+ redact_whole_pymupdf_page(
+ rect_height, rect_width, review_page, custom_colours, border=5
+ )
+ # Ensure the whole page annotation box has a unique ID
+ whole_page_img_annotation_box = fill_missing_box_ids(
+ whole_page_img_annotation_box
+ )
+ all_image_annotation_boxes.append(whole_page_img_annotation_box)
+
+ # In dual-output mode, apply whole-page redaction to the final page copy too.
+ if dual_output_same_pass and applied_redaction_page is not None:
+ redact_whole_pymupdf_page(
+ rect_height,
+ rect_width,
+ applied_redaction_page,
+ custom_colours,
+ border=5,
+ )
+
+ out_annotation_boxes = {
+ "image": image_path, # Image.open(image_path), #image_path,
+ "boxes": all_image_annotation_boxes,
+ }
+
+ # If we are not returning the review page, can directly remove text and all images
+ if return_pdf_for_review is False:
+ page.apply_redactions(
+ images=APPLY_REDACTIONS_IMAGES,
+ graphics=APPLY_REDACTIONS_GRAPHICS,
+ text=APPLY_REDACTIONS_TEXT,
+ )
+
+ set_cropbox_safely(page, original_cropbox)
+ page.clean_contents()
+ if review_page is not None:
+ set_cropbox_safely(review_page, original_cropbox)
+ review_page.clean_contents()
+
+ # Handle dual page objects if we have a final page
+ if dual_output_same_pass and applied_redaction_page is not None:
+ # Apply redactions to applied redaction page only
+ applied_redaction_page.apply_redactions(
+ images=APPLY_REDACTIONS_IMAGES,
+ graphics=APPLY_REDACTIONS_GRAPHICS,
+ text=APPLY_REDACTIONS_TEXT,
+ )
+
+ set_cropbox_safely(applied_redaction_page, original_cropbox)
+ applied_redaction_page.clean_contents()
+ return (page, applied_redaction_page), out_annotation_boxes
+
+ else:
+ return page, out_annotation_boxes
+
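+# Minimal per-page driver sketch for redact_page_with_pymupdf (hypothetical loop; real call
+# sites also pass page_sizes_df, page images and review pages, and the return shape depends
+# on the RETURN_PDF_FOR_REVIEW / RETURN_REDACTED_PDF settings):
+#   for page_no, page in enumerate(pymupdf_doc):
+#       result, out_boxes = redact_page_with_pymupdf(
+#           page, annotations_all_pages[page_no], image=None,
+#           original_cropbox=Rect(page.cropbox),
+#       )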
+
+###
+# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
+###
+
+
+def merge_img_bboxes(
+ bboxes: list,
+ combined_results: Dict,
+ page_signature_recogniser_results: list = list(),
+ page_handwriting_recogniser_results: list = list(),
+ handwrite_signature_checkbox: List[str] = [
+ "Extract handwriting",
+ "Extract signatures",
+ ],
+ horizontal_threshold: int = 50,
+ vertical_threshold: int = 12,
+):
+ """
+ Merges bounding boxes for image annotations based on the provided results from signature and handwriting recognizers.
+
+ Args:
+ bboxes (list): A list of bounding boxes to be merged.
+ combined_results (Dict): A dictionary containing combined results with line text and their corresponding bounding boxes.
+ page_signature_recogniser_results (list, optional): A list of results from the signature recognizer. Defaults to an empty list.
+ page_handwriting_recogniser_results (list, optional): A list of results from the handwriting recognizer. Defaults to an empty list.
+ handwrite_signature_checkbox (List[str], optional): A list of options indicating whether to extract handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
+ horizontal_threshold (int, optional): The threshold for merging bounding boxes horizontally. Defaults to 50.
+ vertical_threshold (int, optional): The threshold for merging bounding boxes vertically. Defaults to 12.
+
+ Returns:
+        list: The unique set of original and merged bounding boxes for the page.
+ """
+
+ all_bboxes = list()
+ merged_bboxes = list()
+ grouped_bboxes = defaultdict(list)
+
+ # Deep copy original bounding boxes to retain them
+ original_bboxes = copy.deepcopy(bboxes)
+
+ # Process signature and handwriting results
+ if page_signature_recogniser_results or page_handwriting_recogniser_results:
+
+ if "Extract handwriting" in handwrite_signature_checkbox:
+ # print("Extracting handwriting in merge_img_bboxes function")
+ merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
+
+ if "Extract signatures" in handwrite_signature_checkbox:
+ # print("Extracting signatures in merge_img_bboxes function")
+ merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
+
+ # Add VLM [FACE] and [SIGNATURE] detections from combined_results, if present
+ try:
+ for line_info in combined_results.values():
+ words = line_info.get("words", [])
+ for word in words:
+ text_val = word.get("text")
+ if text_val not in ["[FACE]", "[SIGNATURE]"]:
+ continue
+ conf_raw = float(word.get("conf", word.get("confidence", 0.0)))
+ conf = conf_raw / 100.0 if conf_raw > 1.0 else conf_raw
+ conf = max(0.0, min(1.0, conf))
+ if conf < CUSTOM_VLM_MIN_CONFIDENCE:
+ continue
+ x0, y0, x1, y1 = word.get("bounding_box", (0, 0, 0, 0))
+ width = x1 - x0
+ height = y1 - y0
+ entity_type = (
+ "CUSTOM_VLM_FACES"
+ if text_val == "[FACE]"
+ else "CUSTOM_VLM_SIGNATURE"
+ )
+ merged_bboxes.append(
+ CustomImageRecognizerResult(
+ entity_type,
+ 0,
+ 0,
+ conf,
+ int(x0),
+ int(y0),
+ int(width),
+ int(height),
+ text_val,
+ )
+ )
+ except Exception as e:
+ print(
+ f"Warning: Error while adding VLM [FACE]/[SIGNATURE] boxes in merge_img_bboxes: {e}"
+ )
+
+ # Reconstruct and merge bounding boxes only if MERGE_BOUNDING_BOXES is enabled
+ if MERGE_BOUNDING_BOXES:
+ # Reconstruct bounding boxes for substrings of interest
+ reconstructed_bboxes = list()
+ for bbox in bboxes:
+ bbox_box = (
+ bbox.left,
+ bbox.top,
+ bbox.left + bbox.width,
+ bbox.top + bbox.height,
+ )
+ for line_key, line_info in combined_results.items():
+ line_box = line_info["bounding_box"]
+ # Use actual line text (not the key, which is e.g. "text_line_1") for substring matching
+ actual_line_text = line_info.get("text", "")
+ if bounding_boxes_overlap(bbox_box, line_box):
+ if actual_line_text and bbox.text in actual_line_text:
+ start_char = actual_line_text.index(bbox.text)
+ end_char = start_char + len(bbox.text)
+
+ relevant_words = list()
+ current_char = 0
+ for word in line_info["words"]:
+ word_end = current_char + len(word["text"])
+ if (
+ current_char <= start_char < word_end
+ or current_char < end_char <= word_end
+ or (start_char <= current_char and word_end <= end_char)
+ ):
+ relevant_words.append(word)
+ if word_end >= end_char:
+ break
+ current_char = word_end
+ if not word["text"].endswith(" "):
+ current_char += 1 # +1 for space if the word doesn't already end with a space
+
+ if relevant_words:
+ left = min(
+ word["bounding_box"][0] for word in relevant_words
+ )
+ top = min(
+ word["bounding_box"][1] for word in relevant_words
+ )
+ right = max(
+ word["bounding_box"][2] for word in relevant_words
+ )
+ bottom = max(
+ word["bounding_box"][3] for word in relevant_words
+ )
+
+ combined_text = " ".join(
+ word["text"] for word in relevant_words
+ )
+
+ reconstructed_bbox = CustomImageRecognizerResult(
+ bbox.entity_type,
+ bbox.start,
+ bbox.end,
+ bbox.score,
+ left,
+ top,
+ right - left, # width
+ bottom - top, # height,
+ combined_text,
+ )
+ # reconstructed_bboxes.append(bbox) # Add original bbox
+ reconstructed_bboxes.append(
+ reconstructed_bbox
+ ) # Add merged bbox
+ break
+ else:
+ reconstructed_bboxes.append(bbox)
+
+ # Group reconstructed bboxes by approximate vertical proximity
+ for box in reconstructed_bboxes:
+ grouped_bboxes[round(box.top / vertical_threshold)].append(box)
+
+ # Merge within each group
+ for _, group in grouped_bboxes.items():
+ group.sort(key=lambda box: box.left)
+
+ merged_box = group[0]
+ for next_box in group[1:]:
+ if (
+ next_box.left - (merged_box.left + merged_box.width)
+ <= horizontal_threshold
+ ):
+ if next_box.text != merged_box.text:
+ new_text = merged_box.text + " " + next_box.text
+ else:
+ new_text = merged_box.text
+
+ if merged_box.entity_type != next_box.entity_type:
+ new_entity_type = (
+ merged_box.entity_type + " - " + next_box.entity_type
+ )
+ else:
+ new_entity_type = merged_box.entity_type
+
+ new_left = min(merged_box.left, next_box.left)
+ new_top = min(merged_box.top, next_box.top)
+ new_width = (
+ max(
+ merged_box.left + merged_box.width,
+ next_box.left + next_box.width,
+ )
+ - new_left
+ )
+ new_height = (
+ max(
+ merged_box.top + merged_box.height,
+ next_box.top + next_box.height,
+ )
+ - new_top
+ )
+
+ merged_box = CustomImageRecognizerResult(
+ new_entity_type,
+ merged_box.start,
+ merged_box.end,
+ merged_box.score,
+ new_left,
+ new_top,
+ new_width,
+ new_height,
+ new_text,
+ )
+ else:
+ merged_bboxes.append(merged_box)
+ merged_box = next_box
+
+ merged_bboxes.append(merged_box)
+
+ all_bboxes.extend(original_bboxes)
+ all_bboxes.extend(merged_bboxes)
+
+ # Return the unique original and merged bounding boxes
+ unique_bboxes = list(
+ {
+ (bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes
+ }.values()
+ )
+ return unique_bboxes
+
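+# Illustrative horizontal merge in merge_img_bboxes (hypothetical boxes, MERGE_BOUNDING_BOXES
+# enabled): two boxes grouped onto the same line, one ending at left + width = 50 and the next
+# starting at left = 80, have a 30 px gap, which is within the default horizontal_threshold of
+# 50, so they are combined into one box with their texts joined by a space.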
+
+def redact_image_pdf(
+ file_path: str,
+ pdf_image_file_paths: List[str],
+ language: str,
+ chosen_redact_entities: List[str],
+ chosen_redact_comprehend_entities: List[str],
+ allow_list: List[str] = None,
+ chosen_llm_entities: List[str] = None,
+ page_min: int = 0,
+ page_max: int = 0,
+ text_extraction_method: str = LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+ handwrite_signature_checkbox: List[str] = [
+ "Extract handwriting",
+ "Extract signatures",
+ ],
+ textract_request_metadata: list = list(),
+ current_loop_page: int = 0,
+ page_break_return: bool = False,
+ annotations_all_pages: List = list(),
+ all_page_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(
+ columns=["page", "text", "left", "top", "width", "height", "line", "conf"]
+ ),
+ all_pages_decision_process_table: pd.DataFrame = pd.DataFrame(
+ columns=[
+ "image_path",
+ "page",
+ "label",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "boundingBox",
+ "text",
+ "start",
+ "end",
+ "score",
+ "id",
+ ]
+ ),
+ pymupdf_doc: Document = list(),
+ pii_identification_method: str = "Local",
+ comprehend_query_number: int = 0,
+ comprehend_client: str = "",
+ bedrock_runtime=None,
+ textract_client: str = "",
+ gemini_client=None,
+ gemini_config=None,
+ azure_openai_client=None,
+ in_deny_list: List[str] = list(),
+ redact_whole_page_list: List[str] = list(),
+ max_fuzzy_spelling_mistakes_num: int = 1,
+ match_fuzzy_whole_phrase_bool: bool = True,
+ page_sizes_df: pd.DataFrame = pd.DataFrame(),
+ text_extraction_only: bool = False,
+ textract_output_found: bool = False,
+ all_page_line_level_ocr_results=list(),
+ all_page_line_level_ocr_results_with_words=list(),
+ chosen_local_ocr_model: str = DEFAULT_LOCAL_OCR_MODEL,
+ page_break_val: int = int(PAGE_BREAK_VALUE),
+ log_files_output_paths: List = list(),
+ out_file_paths: List = list(),
+ max_time: int = int(MAX_TIME_VALUE),
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
+ output_folder: str = OUTPUT_FOLDER,
+ input_folder: str = INPUT_FOLDER,
+ custom_llm_instructions: str = "",
+ inference_server_vlm_model: str = "",
+ efficient_ocr: bool = EFFICIENT_OCR,
+ hybrid_textract_bedrock_vlm: bool = HYBRID_TEXTRACT_BEDROCK_VLM,
+ overwrite_existing_ocr_results: bool = OVERWRITE_EXISTING_OCR_RESULTS,
+ save_page_ocr_visualisations: bool = SAVE_PAGE_OCR_VISUALISATIONS,
+ pages_to_process: Optional[List[int]] = None,
+ ocr_first_pass_max_workers: Optional[int] = None,
+ pages_in_pdf_points: Optional[List[int]] = None,
+ progress=Progress(track_tqdm=True),
+ defer_inline_custom_vlm_detection_pass: bool = False,
+):
+    """
+ This function redacts sensitive information from a PDF document. It takes the following parameters in order:
+
+ - file_path (str): The path to the PDF file to be redacted.
+ - pdf_image_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
+ - language (str): The language of the text in the PDF.
+ - chosen_redact_entities (List[str]): A list of entity types to redact from the PDF.
+ - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from the list allowed by the AWS Comprehend service.
+ - allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
+ - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
+ - page_max (int, optional): The maximum page number to end redaction at. Defaults to 0.
+ - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION.
+ - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
+    - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty list.
+ - current_loop_page (int, optional): The current page being processed. Defaults to 0.
+ - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
+ - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
+ - all_page_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
+ - all_pages_decision_process_table (pd.DataFrame, optional): All redaction decisions for document as a Pandas dataframe.
+ - pymupdf_doc (Document, optional): The document as a PyMupdf object.
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+ - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
+ - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
+ - in_deny_list (optional): A list of custom words that the user has chosen specifically to redact.
+ - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
+ - page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
+ - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
+ - textract_output_found (bool, optional): Boolean is true when a textract OCR output for the file has been found.
+ - all_page_line_level_ocr_results (optional): List of all page line level OCR results.
+ - all_page_line_level_ocr_results_with_words (optional): List of all page line level OCR results with words.
+ - chosen_local_ocr_model (str, optional): The local model chosen for OCR. Defaults to DEFAULT_LOCAL_OCR_MODEL, other choices are "paddle" for PaddleOCR, or "hybrid-paddle" for a combination of both.
+ - page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
+ - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
+ - out_file_paths (List, optional): List of file paths used for saving redaction process output results.
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
+ - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
+ - output_folder (str, optional): The folder for file outputs.
+ - input_folder (str, optional): The folder for file inputs.
+ - custom_llm_instructions (str, optional): Custom instructions for LLM-based entity detection. Defaults to an empty string.
+ - inference_server_vlm_model (str, optional): The inference-server VLM model to use for OCR. Defaults to an empty string. If empty, uses DEFAULT_INFERENCE_SERVER_VLM_MODEL.
+ - efficient_ocr (bool, optional): Whether to use efficient OCR. Defaults to EFFICIENT_OCR.
+ - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
+ The function returns a redacted PDF document along with processing output objects.
+ """
+
+ # Default chosen_llm_entities to chosen_redact_comprehend_entities if not provided
+ # if chosen_llm_entities is None:
+ # chosen_llm_entities = chosen_redact_comprehend_entities
+
+ # When AWS Textract + Face detection: always analyse all pages for faces (as if
+ # CUSTOM_VLM_FACES were enabled), regardless of PII method or text_extraction_only.
+ if "Face detection" in (handwrite_signature_checkbox or []):
+ chosen_redact_entities = list(chosen_redact_entities or [])
+ chosen_redact_comprehend_entities = list(
+ chosen_redact_comprehend_entities or []
+ )
+ chosen_llm_entities = list(chosen_llm_entities or [])
+ if "CUSTOM_VLM_FACES" not in chosen_redact_entities:
+ chosen_redact_entities.append("CUSTOM_VLM_FACES")
+ if "CUSTOM_VLM_FACES" not in chosen_redact_comprehend_entities:
+ chosen_redact_comprehend_entities.append("CUSTOM_VLM_FACES")
+ if "CUSTOM_VLM_FACES" not in chosen_llm_entities:
+ chosen_llm_entities.append("CUSTOM_VLM_FACES")
+
+ # When True, skip per-page VLM face/signature detection; run_custom_vlm_only_pass
+ # (after OCR in efficient-OCR flow) handles each once per page on full images.
+ _skip_inline_custom_vlm_detection = defer_inline_custom_vlm_detection_pass
+
+ tic = time.perf_counter()
+
+ file_name = get_file_name_without_type(file_path)
+ comprehend_query_number_new = 0
+ selection_element_results_list_df = pd.DataFrame()
+ form_key_value_results_list_df = pd.DataFrame()
+ textract_json_file_path = ""
+ textract_client_not_found = False
+ # Try updating the supported languages for the spacy analyser
+ try:
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
+ # Check list of nlp_analyser recognisers and languages
+ if language != "en":
+ gr.Info(
+ f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}"
+ )
+
+ except Exception as e:
+ print(f"Error creating nlp_analyser for {language}: {e}")
+ raise Exception(f"Error creating nlp_analyser for {language}: {e}")
+
+ # Update custom word list analyser object with any new words that have been added to the custom deny list
+ if in_deny_list:
+ nlp_analyser.registry.remove_recognizer("CUSTOM")
+ new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+
+ nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(
+ supported_entities=["CUSTOM_FUZZY"],
+ custom_list=in_deny_list,
+ spelling_mistakes_max=max_fuzzy_spelling_mistakes_num,
+ search_whole_phrase=match_fuzzy_whole_phrase_bool,
+ )
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
+
+ # Map text extraction method to OCR engine
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ ocr_engine = "tesseract" # Not actually used, but required for initialization
+ elif text_extraction_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION:
+ ocr_engine = "bedrock-vlm"
+ elif text_extraction_method == GEMINI_VLM_TEXT_EXTRACT_OPTION:
+ ocr_engine = "gemini-vlm"
+ elif text_extraction_method == AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION:
+ ocr_engine = "azure-openai-vlm"
+ else:
+ ocr_engine = chosen_local_ocr_model
+
+ image_analyser = CustomImageAnalyzerEngine(
+ analyzer_engine=nlp_analyser,
+ ocr_engine=ocr_engine,
+ language=language,
+ output_folder=output_folder,
+ save_page_ocr_visualisations=save_page_ocr_visualisations,
+ )
+
+ if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
+ out_message = "Connection to AWS Comprehend service unsuccessful."
+ print(out_message)
+ raise Exception(out_message)
+
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
+ out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
+ textract_client_not_found = True
+ print(out_message_warning)
+ # raise Exception(out_message)
+
+ number_of_pages = pymupdf_doc.page_count
+ print("Number of pages in document:", str(number_of_pages))
+
+ # Check that page_min and page_max are within expected ranges
+ if page_max > number_of_pages or page_max == 0:
+ page_max = number_of_pages
+
+ if page_min <= 0:
+ page_min = 0
+ else:
+ page_min = page_min - 1
+
+ print("Page range:", str(page_min + 1), "to", str(page_max))
+
+ # If running Textract, check if file already exists. If it does, load in existing data
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ # Generate suffix based on checkbox options
+ textract_suffix = get_textract_file_suffix(handwrite_signature_checkbox)
+ textract_json_file_path = (
+ output_folder + file_name + textract_suffix + "_textract.json"
+ )
+ if overwrite_existing_ocr_results:
+ # Skip loading existing results, start fresh
+ textract_data = {}
+ is_missing = True
+ else:
+ textract_data, is_missing, log_files_output_paths = (
+ load_and_convert_textract_json(
+ textract_json_file_path, log_files_output_paths, page_sizes_df
+ )
+ )
+ original_textract_data = textract_data.copy()
+
+ if textract_client_not_found and is_missing:
+ print(
+ "No existing Textract results file found and no Textract client found. Redaction will not continue."
+ )
+ raise Exception(
+ "No existing Textract results file found and no Textract client found. Redaction will not continue."
+ )
+
+ # If running local OCR option, check if file already exists. If it does, load in existing data
+ if text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION:
+ all_page_line_level_ocr_results_with_words_json_file_path = (
+ output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
+ )
+
+ # Preserve any pre-existing word-level OCR results passed in (e.g. from
+ # EFFICIENT_OCR's selectable-text path). This prevents us from accidentally
+ # dropping those pages when we load/recompute OCR-needed pages.
+ pre_existing_word_results = (
+ list(all_page_line_level_ocr_results_with_words or [])
+ if all_page_line_level_ocr_results_with_words is not None
+ else []
+ )
+
+ if overwrite_existing_ocr_results:
+ # Skip loading existing cached results. Keep any pre-existing word
+ # results for pages outside `pages_to_process` so we don't erase
+ # selectable-text path output in efficient-ocr mixed mode.
+ if pages_to_process:
+ pages_to_process_set = set(pages_to_process)
+
+ def _page_in_to_process(item) -> bool:
+ try:
+ return int(item.get("page")) in pages_to_process_set
+ except Exception:
+ return False
+
+ pre_existing_word_results = [
+ item
+ for item in pre_existing_word_results
+ if not _page_in_to_process(item)
+ ]
+
+ all_page_line_level_ocr_results_with_words = list(pre_existing_word_results)
+ is_missing = True
+ print("overwriting existing OCR results with words")
+ elif os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path):
+ print("Loading existing OCR results with words for local OCR analysis")
+ cached_word_results = []
+ (
+ cached_word_results,
+ _is_missing,
+ log_files_output_paths,
+ ) = load_and_convert_ocr_results_with_words_json(
+ all_page_line_level_ocr_results_with_words_json_file_path,
+ log_files_output_paths,
+ page_sizes_df,
+ )
+
+ all_page_line_level_ocr_results_with_words = merge_page_results(
+ list(pre_existing_word_results) + list(cached_word_results)
+ )
+
+ original_all_page_line_level_ocr_results_with_words = (
+ all_page_line_level_ocr_results_with_words.copy()
+ )
+
+ ###
+ if current_loop_page == 0:
+ page_loop_start = page_min
+ else:
+ page_loop_start = current_loop_page
+
+ page_loop_end = page_max
+
+ # When pages_to_process is provided (e.g. from efficient_ocr), iterate only over those pages (1-based list).
+ if pages_to_process is not None:
+ page_loop_pages = sorted([p - 1 for p in pages_to_process]) # 0-indexed, sorted
+ page_min = 0
+ page_max = number_of_pages
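+        # Widen page_min/page_max so the per-page guard inside the loop does not
+        # filter out any of the explicitly requested pages.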
+ else:
+ page_loop_pages = None
+
+ # If there's data from a previous run (passed in via the DataFrame parameters), add it
+ all_line_level_ocr_results_list = list()
+ all_pages_decision_process_list = list()
+ selection_element_results_list = list()
+ form_key_value_results_list = list()
+
+ # Track which pages already have line-level OCR outputs so we don't duplicate rows
+ # when we rebuild from cached Textract `ocr_results_with_words` entries.
+ existing_line_level_pages_1based = set()
+ if (
+ isinstance(all_page_line_level_ocr_results_df, pd.DataFrame)
+ and not all_page_line_level_ocr_results_df.empty
+ and "page" in all_page_line_level_ocr_results_df.columns
+ ):
+ try:
+ existing_line_level_pages_1based = set(
+ pd.to_numeric(
+ all_page_line_level_ocr_results_df["page"], errors="coerce"
+ )
+ .dropna()
+ .astype(int)
+ .unique()
+ .tolist()
+ )
+ except Exception:
+ existing_line_level_pages_1based = set()
+
+ if not all_page_line_level_ocr_results_df.empty:
+ all_line_level_ocr_results_list.extend(
+ all_page_line_level_ocr_results_df.to_dict("records")
+ )
+ if not all_pages_decision_process_table.empty:
+ all_pages_decision_process_list.extend(
+ all_pages_decision_process_table.to_dict("records")
+ )
+
+ # Dictionary to store OCR results and page metadata for two-pass processing
+ # This allows us to do all OCR first, then all PII detection, avoiding model switching
+ ocr_results_by_page = {}
+
+ # Pre-initialise ocr_results_by_page from pages that already have OCR/text results
+ # (e.g. when EFFICIENT_OCR is True, text-extraction pages are already processed by redact_text_pdf)
+ if all_page_line_level_ocr_results_with_words:
+ by_page = {}
+ for item in all_page_line_level_ocr_results_with_words:
+ p = item.get("page")
+ if p is None:
+ continue
+ page_1based = int(p) if isinstance(p, str) else p
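+            # Page numbers from loaded JSON can arrive as strings, hence the cast above.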
+ if page_1based not in by_page:
+ by_page[page_1based] = {"page": page_1based, "results": {}}
+ by_page[page_1based]["results"].update(item.get("results", {}))
+
+ for page_1based in by_page:
+ page_no = page_1based - 1
+ if page_no < 0 or page_no >= number_of_pages:
+ continue
+ try:
+ image_path = page_sizes_df.loc[
+ page_sizes_df["page"] == page_1based, "image_path"
+ ].iloc[0]
+ except Exception:
+ image_path = (
+ pdf_image_file_paths[page_no]
+ if page_no < len(pdf_image_file_paths)
+ else ""
+ )
+ try:
+ pymupdf_page = pymupdf_doc.load_page(page_no)
+ except Exception:
+ continue
+ try:
+ original_cropbox = page_sizes_df.loc[
+ page_sizes_df["page"] == page_1based, "original_cropbox"
+ ].iloc[0]
+ except Exception:
+ original_cropbox = pymupdf_page.cropbox.irect
+
+ image = None
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+ if isinstance(image_path, str) and image_path:
+ normalized_path = os.path.normpath(os.path.abspath(image_path))
+ if validate_path_containment(normalized_path, input_folder):
+ try:
+ image = Image.open(normalized_path)
+ page_width, page_height = image.size
+ except Exception:
+ pass
+
+ page_data_merged = by_page[page_1based]
+ page_line_level_ocr_results_with_words = [page_data_merged]
+
+ ocr_results_by_page[page_no] = {
+ "page_line_level_ocr_results": None,
+ "page_line_level_ocr_results_with_words": page_line_level_ocr_results_with_words,
+ "page_signature_recogniser_results": list(),
+ "page_handwriting_recogniser_results": list(),
+ "handwriting_or_signature_boxes": list(),
+ "image_path": image_path,
+ "pymupdf_page": pymupdf_page,
+ "original_cropbox": original_cropbox,
+ "page_width": page_width,
+ "page_height": page_height,
+ "image": image,
+ "reported_page_number": str(page_1based),
+ }
+
+ # Load existing Textract OCR results from output folder if present (per-page use in loop).
+ # When a page exists in this file, use it as page_line_level_ocr_results_with_words,
+ # recreate page_line_level_ocr_results, and skip OCR (including hybrid textract-bedrock).
+ textract_ocr_by_page_1based = None
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ existing_ocr_file_name = file_name + "_ocr_results_with_words_textract.json"
+ existing_textract_ocr_path = output_folder + existing_ocr_file_name
+ if os.path.exists(existing_textract_ocr_path):
+ (
+ loaded_textract_ocr_list,
+ is_missing,
+ log_files_output_paths,
+ ) = load_and_convert_ocr_results_with_words_json(
+ existing_textract_ocr_path,
+ log_files_output_paths,
+ page_sizes_df,
+ )
+ if not is_missing and loaded_textract_ocr_list:
+ textract_ocr_by_page_1based = {}
+ for item in loaded_textract_ocr_list:
+ p = item.get("page")
+ if p is not None:
+ page_1based = int(p) if isinstance(p, str) else p
+ textract_ocr_by_page_1based[page_1based] = item
+ if textract_ocr_by_page_1based:
+ print(
+ f"Found existing Textract OCR results file: {existing_ocr_file_name} "
+ f"({len(textract_ocr_by_page_1based)} pages). Will use for matching pages and skip OCR/hybrid."
+ )
+
+    # When the worker count is > 1, the OCR first pass runs analyse_page_with_textract in parallel (AWS Textract only).
+    # With efficient_ocr, the caller can pass pages_to_process so multiple OCR-needed pages
+    # are processed in one call, enabling parallel OCR.
+ _ocr_first_pass_max_workers = (
+ ocr_first_pass_max_workers
+ if ocr_first_pass_max_workers is not None
+ else OCR_FIRST_PASS_MAX_WORKERS
+ )
+ use_ocr_parallel_textract = (
+ _ocr_first_pass_max_workers > 1
+ and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ )
+ ocr_parallel_jobs_textract = [] if use_ocr_parallel_textract else None
+ ocr_pymupdf_pages_textract = {} if use_ocr_parallel_textract else None
+
+ # FIRST PASS: Perform OCR on all pages
+ # This collects OCR results without doing PII detection, which is more efficient
+ # when using inference servers that need to switch between VLM and LLM models
+ print("First pass: Performing OCR on all pages...")
+ if page_loop_pages is not None:
+ ocr_progress_bar = tqdm(
+ page_loop_pages,
+ unit="pages",
+ desc="Performing image-based processing",
+ )
+ else:
+ ocr_progress_bar = tqdm(
+ range(page_loop_start, page_loop_end),
+ unit="pages",
+ desc="Performing image-based processing",
+ )
+ _ocr_gui_total_pages = (
+ len(page_loop_pages)
+ if page_loop_pages is not None
+ else max(0, page_loop_end - page_loop_start)
+ )
+ _ocr_gui_total_str = str(_ocr_gui_total_pages) if _ocr_gui_total_pages > 0 else "?"
+
+ # Parallel Bedrock VLM page OCR: run perform_ocr for all pages that need it, then use results in the loop.
+ ocr_results_from_parallel = {}
+ # Parallel local page OCR (Tesseract / Paddle / etc.): perform_ocr per page, then use in loop.
+ ocr_results_from_parallel_local_ocr = {}
+ if (
+ text_extraction_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION
+ and bedrock_runtime is not None
+ ):
+ _bedrock_ocr_tasks = []
+ _pages_iter = (
+ list(page_loop_pages)
+ if page_loop_pages is not None
+ else list(range(page_loop_start, page_loop_end))
+ )
+ for _pno in _pages_iter:
+ try:
+ _img_path = page_sizes_df.loc[
+ page_sizes_df["page"] == (_pno + 1), "image_path"
+ ].iloc[0]
+ except Exception:
+ _img_path = (
+ pdf_image_file_paths[_pno]
+ if _pno < len(pdf_image_file_paths)
+ else ""
+ )
+ if all_page_line_level_ocr_results_with_words:
+ _reported = str(_pno + 1)
+ _match = next(
+ (
+ item
+ for item in all_page_line_level_ocr_results_with_words
+ if int(item.get("page", -1)) == int(_reported)
+ ),
+ None,
+ )
+ if _match:
+ continue
+ _actual = _img_path
+ if isinstance(_img_path, str) and (
+ "placeholder_image" in _img_path or "image_placeholder" in _img_path
+ ):
+ try:
+ _, _created, _, _ = process_single_page_for_image_conversion(
+ pdf_path=file_path,
+ page_num=_pno,
+ image_dpi=IMAGES_DPI,
+ create_images=True,
+ input_folder=input_folder,
+ )
+ if os.path.exists(_created):
+ _actual = _created
+ if not page_sizes_df.empty:
+ page_sizes_df.loc[
+ page_sizes_df["page"] == (_pno + 1),
+ "image_path",
+ ] = _created
+ except Exception:
+ pass
+ _bedrock_ocr_tasks.append((_pno, _actual))
+ if _bedrock_ocr_tasks:
+
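+            # Worker: each task is (page_no, image_path); returns (page_no, perform_ocr result tuple)
+            # so results can be keyed back to their pages when the executor completes.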
+ def _run_bedrock_page_ocr(task):
+ pno, actual_path = task
+ return (
+ pno,
+ image_analyser.perform_ocr(
+ actual_path,
+ bedrock_runtime=bedrock_runtime,
+ gemini_client=gemini_client,
+ gemini_config=gemini_config,
+ azure_openai_client=azure_openai_client,
+ vlm_model_choice=CLOUD_VLM_MODEL_CHOICE,
+ inference_server_model_name=(
+ inference_server_vlm_model
+ if inference_server_vlm_model
+ else None
+ ),
+ page_index_0=pno,
+ ),
+ )
+
+ _max_workers = min(MAX_WORKERS, len(_bedrock_ocr_tasks))
+ with ThreadPoolExecutor(max_workers=_max_workers) as executor:
+ for pno, result in executor.map(
+ _run_bedrock_page_ocr, _bedrock_ocr_tasks
+ ):
+ ocr_results_from_parallel[pno] = result
+
+ if text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION:
+ _local_ocr_tasks = []
+ _pages_iter = (
+ list(page_loop_pages)
+ if page_loop_pages is not None
+ else list(range(page_loop_start, page_loop_end))
+ )
+ for _pno in _pages_iter:
+ if all_page_line_level_ocr_results_with_words:
+ _reported = str(_pno + 1)
+ _match = next(
+ (
+ item
+ for item in all_page_line_level_ocr_results_with_words
+ if int(item.get("page", -1)) == int(_reported)
+ ),
+ None,
+ )
+ if _match:
+ continue
+ try:
+ _img_path = page_sizes_df.loc[
+ page_sizes_df["page"] == (_pno + 1), "image_path"
+ ].iloc[0]
+ except Exception:
+ _img_path = (
+ pdf_image_file_paths[_pno]
+ if _pno < len(pdf_image_file_paths)
+ else ""
+ )
+ _actual = _img_path
+ if isinstance(_img_path, str) and (
+ "placeholder_image" in _img_path or "image_placeholder" in _img_path
+ ):
+ try:
+ _, _created, _, _ = process_single_page_for_image_conversion(
+ pdf_path=file_path,
+ page_num=_pno,
+ image_dpi=IMAGES_DPI,
+ create_images=True,
+ input_folder=input_folder,
+ )
+ if os.path.exists(_created):
+ _actual = _created
+ if not page_sizes_df.empty:
+ page_sizes_df.loc[
+ page_sizes_df["page"] == (_pno + 1),
+ "image_path",
+ ] = _created
+ except Exception:
+ pass
+ _local_ocr_tasks.append((_pno, _actual))
+
+ if _local_ocr_tasks:
+
+ def _run_local_page_ocr(task):
+ pno, actual_path = task
+ (
+ page_word_level_ocr_results,
+ page_vlm_input_tokens,
+ page_vlm_output_tokens,
+ page_vlm_model_name,
+ ) = image_analyser.perform_ocr(
+ actual_path,
+ page_index_0=pno,
+ )
+
+ # Pre-compute line-level OCR structures here so the main loop can
+ # skip the expensive combine_ocr_results step.
+ (
+ page_line_level_ocr_results,
+ page_line_level_ocr_results_with_words,
+ ) = combine_ocr_results(page_word_level_ocr_results, page=str(pno + 1))
+
+ return (
+ pno,
+ (
+ page_word_level_ocr_results,
+ page_vlm_input_tokens,
+ page_vlm_output_tokens,
+ page_vlm_model_name,
+ page_line_level_ocr_results,
+ page_line_level_ocr_results_with_words,
+ ),
+ )
+
+ if chosen_local_ocr_model == "paddle":
+ _max_workers = min(PADDLE_MAX_WORKERS, len(_local_ocr_tasks))
+ _parallel_label = "Paddle"
+ elif chosen_local_ocr_model == "tesseract":
+ _max_workers = min(TESSERACT_MAX_WORKERS, len(_local_ocr_tasks))
+ _parallel_label = "Tesseract"
+ else:
+ # hybrid-paddle, VLM hybrids, local VLM, etc. (same cap as before Tesseract split)
+ _max_workers = min(TESSERACT_MAX_WORKERS, len(_local_ocr_tasks))
+ _parallel_label = "local model"
+
+ _local_ocr_sequential_only = frozenset(
+ (
+ "vlm",
+ "inference-server",
+ "hybrid-paddle-vlm",
+ "hybrid-paddle-inference-server",
+ )
+ )
+ _sequential_local_ocr = chosen_local_ocr_model in _local_ocr_sequential_only
+ _phase = "Sequential" if _sequential_local_ocr else "Parallel"
+
+ _n_local = len(_local_ocr_tasks)
+ if _sequential_local_ocr:
+ print(
+ f"{_phase} {_parallel_label} page OCR: processing {_n_local} page(s) "
+ f"one at a time ({chosen_local_ocr_model!r})."
+ )
+ else:
+ print(
+ f"{_phase} {_parallel_label} page OCR: processing {_n_local} page(s) "
+ f"with {int(_max_workers)} worker(s)."
+ )
+ # Progress slice for this phase (first pass OCR); main page loop uses tqdm after this.
+ _prog_lo, _prog_hi = 0.0, 1.0
+ try:
+ progress(
+ _prog_lo,
+ desc=f"{_phase} {_parallel_label} OCR (0/{_n_local} pages)",
+ )
+ except Exception:
+ pass
+ _completed_local = 0
+ if _sequential_local_ocr:
+ for task in _local_ocr_tasks:
+ pno = task[0]
+ try:
+ res_pno, result = _run_local_page_ocr(task)
+ ocr_results_from_parallel_local_ocr[res_pno] = result
+ except Exception as _e:
+ print(f"Sequential local OCR failed for page {pno + 1}: {_e}")
+ _completed_local += 1
+ try:
+ frac = _prog_lo + (_prog_hi - _prog_lo) * (
+ _completed_local / max(_n_local, 1)
+ )
+ progress(
+ frac,
+ desc=(
+ f"{_phase} {_parallel_label} OCR "
+ f"({_completed_local}/{_n_local} pages)"
+ ),
+ )
+ except Exception:
+ pass
+ else:
+ with ThreadPoolExecutor(max_workers=_max_workers) as executor:
+ future_to_pno = {
+ executor.submit(_run_local_page_ocr, task): task[0]
+ for task in _local_ocr_tasks
+ }
+ for fut in as_completed(future_to_pno):
+ pno = future_to_pno[fut]
+ try:
+ res_pno, result = fut.result()
+ ocr_results_from_parallel_local_ocr[res_pno] = result
+ except Exception as _e:
+ print(f"Parallel local OCR failed for page {pno + 1}: {_e}")
+ _completed_local += 1
+ try:
+ frac = _prog_lo + (_prog_hi - _prog_lo) * (
+ _completed_local / max(_n_local, 1)
+ )
+ progress(
+ frac,
+ desc=(
+ f"{_phase} {_parallel_label} OCR "
+ f"({_completed_local}/{_n_local} pages)"
+ ),
+ )
+ except Exception:
+ pass
+
+ for page_no in ocr_progress_bar:
+
+ reported_page_number = str(page_no + 1)
+ try:
+ ocr_progress_bar.set_postfix_str(
+ f"OCR · page {reported_page_number}/{_ocr_gui_total_str}",
+ refresh=False,
+ )
+ except Exception:
+ pass
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "start iteration (init lists, entity flags)"
+ # )
+
+ # Define once per iteration so they are always set before use at _include_vlm_boxes_in_outputs (line ~7610)
+ _person_selected = (
+ "CUSTOM_VLM_FACES" in (chosen_redact_entities or [])
+ or "CUSTOM_VLM_FACES" in (chosen_redact_comprehend_entities or [])
+ or "CUSTOM_VLM_FACES" in (chosen_llm_entities or [])
+ )
+ _textract_face_identification = (
+ text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and "Face detection" in (handwrite_signature_checkbox or [])
+ )
+ _run_face_pass = _person_selected or _textract_face_identification
+
+ handwriting_or_signature_boxes = list()
+ page_signature_recogniser_results = list()
+ page_handwriting_recogniser_results = list()
+ page_line_level_ocr_results_with_words = list()
+ page_line_level_ocr_results = None # Initialize to None, will be set during OCR
+ page_break_return = False
+
+ # Try to find image location
+ try:
+ image_path = page_sizes_df.loc[
+ page_sizes_df["page"] == (page_no + 1), "image_path"
+ ].iloc[0]
+ except Exception as e:
+ print("Could not find image_path in page_sizes_df due to:", e)
+ image_path = pdf_image_file_paths[page_no]
+
+ page_image_annotations = {"image": image_path, "boxes": []}
+ pymupdf_page = pymupdf_doc.load_page(page_no)
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "PyMuPDF load_page"
+ # )
+
+ if not (page_no >= page_min and page_no < page_max):
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "outside page_min/page_max — skipping OCR block for this page"
+ # )
+ continue
+
+ if page_no >= page_min and page_no < page_max:
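+            # Always true here: pages outside [page_min, page_max) were skipped by the guard above.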
+ # Need image size to convert OCR outputs to the correct sizes
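+            # Fallback order for page dimensions: page image on disk, then the original
+            # input file (for image inputs / Gradio temp files), then the PyMuPDF mediabox.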
+ if isinstance(image_path, str):
+ # Normalize and validate path safety before checking existence
+ normalized_path = os.path.normpath(os.path.abspath(image_path))
+ if validate_path_containment(normalized_path, input_folder):
+ image = Image.open(normalized_path)
+ page_width, page_height = image.size
+ else:
+ # If validation fails and input file is an image file, try using file_path as fallback
+ if (
+ is_pdf(file_path) is False
+ and isinstance(file_path, str)
+ and file_path
+ ):
+ normalized_file_path = os.path.normpath(
+ os.path.abspath(file_path)
+ )
+ # Check if it's a Gradio temporary file (often in temp directories)
+ is_gradio_temp = (
+ "gradio" in normalized_file_path.lower()
+ and "temp" in normalized_file_path.lower()
+ )
+ if is_gradio_temp or validate_path_containment(
+ normalized_file_path, input_folder
+ ):
+ try:
+ image = Image.open(normalized_file_path)
+ page_width, page_height = image.size
+ except Exception as e:
+ print(
+ f"Could not open image from file_path {file_path}: {e}"
+ )
+ image = None
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+ else:
+ # For image files, at least keep image_path as a string for later use
+ image = None
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+ else:
+ # print("Image path does not exist, using mediabox coordinates as page sizes")
+ image = None
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+ elif not isinstance(image_path, Image.Image):
+ # If image_path is not a string or Image, and input file is an image file, try file_path
+ if (
+ is_pdf(file_path) is False
+ and isinstance(file_path, str)
+ and file_path
+ ):
+ normalized_file_path = os.path.normpath(os.path.abspath(file_path))
+ is_gradio_temp = (
+ "gradio" in normalized_file_path.lower()
+ and "temp" in normalized_file_path.lower()
+ )
+ if is_gradio_temp or validate_path_containment(
+ normalized_file_path, input_folder
+ ):
+ try:
+ image = Image.open(normalized_file_path)
+ page_width, page_height = image.size
+ except Exception as e:
+ print(
+ f"Could not open image from file_path {file_path}: {e}"
+ )
+ image = None
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+ else:
+ image = None
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+ else:
+ print(
+ f"Unexpected image_path type: {type(image_path)}, using page mediabox coordinates as page sizes"
+                )
+ image = None
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+
+        try:
+            if not page_sizes_df.empty:
+                original_cropbox = page_sizes_df.loc[
+                    page_sizes_df["page"] == (page_no + 1), "original_cropbox"
+                ].iloc[0]
+            else:
+                original_cropbox = pymupdf_page.cropbox.irect
+        except IndexError:
+            print(
+                "Can't find original cropbox details for page, using current PyMuPDF page cropbox"
+            )
+            original_cropbox = pymupdf_page.cropbox.irect
+
+ if image is None:
+ # Check if image_path is a placeholder and create the actual image
+ if isinstance(image_path, str) and (
+ "placeholder_image" in image_path
+ or "image_placeholder" in image_path
+ ):
+ # print(f"Detected placeholder image path: {image_path}")
+ try:
+ # Extract page number from placeholder path (e.g. placeholder_image_0.png or image_placeholder_0.png)
+ page_num_from_placeholder = int(
+ image_path.split("_")[-1].split(".")[0]
+ )
+
+ # Create the actual image using process_single_page_for_image_conversion
+ _, created_image_path, page_width, page_height = (
+ process_single_page_for_image_conversion(
+ pdf_path=file_path,
+ page_num=page_num_from_placeholder,
+ image_dpi=IMAGES_DPI,
+ create_images=True,
+ input_folder=input_folder,
+ )
+ )
+
+ # Load the created image
+ if os.path.exists(created_image_path):
+ image = Image.open(created_image_path)
+ # print(
+ # f"Successfully created and loaded image from: {created_image_path}"
+ # )
+ else:
+ # print(f"Failed to create image at: {created_image_path}")
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+ except Exception as e:
+ print(f"Error creating image from placeholder: {e}")
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+ else:
+ try:
+ # Create the actual image using process_single_page_for_image_conversion
+ _, created_image_path, page_width, page_height = (
+ process_single_page_for_image_conversion(
+ pdf_path=file_path,
+ page_num=page_no,
+ image_dpi=IMAGES_DPI,
+ create_images=True,
+ input_folder=input_folder,
+ )
+ )
+
+ # Load the created image
+ if os.path.exists(created_image_path):
+ image = Image.open(created_image_path)
+ else:
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+
+ except Exception as e:
+ print(f"Error creating image from file_path: {e}")
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+
+ if image is None:
+ print("Image is None - using mediabox coordinates")
+ page_width = pymupdf_page.mediabox.width
+ page_height = pymupdf_page.mediabox.height
+
+ # Ensure page_sizes_df has the dimensions we're using for this page (image or
+ # mediabox) so that divide_coordinates_by_page_sizes uses the same size later.
+ # Otherwise OCR/Textract boxes (in image pixels) get divided by mediabox and
+ # appear too big and shifted (e.g. towards bottom-right).
+ try:
+ _page_1based = page_no + 1
+ if not page_sizes_df.empty and "page" in page_sizes_df.columns:
+ mask = page_sizes_df["page"] == _page_1based
+ if mask.any():
+ if "image_width" not in page_sizes_df.columns:
+ page_sizes_df["image_width"] = float("nan")
+ if "image_height" not in page_sizes_df.columns:
+ page_sizes_df["image_height"] = float("nan")
+ page_sizes_df.loc[mask, "image_width"] = float(page_width)
+ page_sizes_df.loc[mask, "image_height"] = float(page_height)
+ except Exception:
+ pass
+
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # f"page image ready (size {int(page_width)}x{int(page_height)}, "
+ # f"PIL image loaded={image is not None})"
+ # )
+
+ # Step 1: Perform OCR. Either with Tesseract, cloud VLM, or with AWS Textract
+ # If using Tesseract or cloud VLM (all image-based OCR methods)
+ if (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ or text_extraction_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION
+ or text_extraction_method == GEMINI_VLM_TEXT_EXTRACT_OPTION
+ or text_extraction_method == AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION
+ ):
+
+ if all_page_line_level_ocr_results_with_words:
+ # Find the first dict where 'page' matches
+
+ matching_page = next(
+ (
+ item
+ for item in all_page_line_level_ocr_results_with_words
+ if int(item.get("page", -1)) == int(reported_page_number)
+ ),
+ None,
+ )
+
+ page_line_level_ocr_results_with_words = (
+ matching_page if matching_page else []
+ )
+ else:
+ page_line_level_ocr_results_with_words = list()
+
+ if page_line_level_ocr_results_with_words:
+ # print(
+ # "Found OCR results for page in existing OCR with words object"
+ # )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "OCR source — existing loaded JSON (recreate_page_line_level_ocr_results_with_page)"
+ # )
+
+ page_line_level_ocr_results = (
+ recreate_page_line_level_ocr_results_with_page(
+ page_line_level_ocr_results_with_words
+ )
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "OCR JSON path — line-level recreated (aggregate list unchanged here)"
+ # )
+
+ else:
+ cached_precombined = False
+ # Use pre-computed result from parallel Bedrock OCR if available
+ if page_no in ocr_results_from_parallel:
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "OCR source — parallel Bedrock VLM cache"
+ # )
+ (
+ page_word_level_ocr_results,
+ page_vlm_input_tokens,
+ page_vlm_output_tokens,
+ page_vlm_model_name,
+ ) = ocr_results_from_parallel[page_no]
+ elif page_no in ocr_results_from_parallel_local_ocr:
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "OCR source — parallel local OCR cache (pre-combined line structure)"
+ # )
+ (
+ page_word_level_ocr_results,
+ page_vlm_input_tokens,
+ page_vlm_output_tokens,
+ page_vlm_model_name,
+ page_line_level_ocr_results,
+ page_line_level_ocr_results_with_words,
+ ) = ocr_results_from_parallel_local_ocr[page_no]
+ cached_precombined = True
+ else:
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "OCR source — inline perform_ocr (no parallel/cache hit)"
+ # )
+ # Check if image_path is a placeholder and create the actual image if needed
+ actual_image_path = image_path
+ if isinstance(image_path, str) and (
+ "placeholder_image" in image_path
+ or "image_placeholder" in image_path
+ ):
+ try:
+ # Use the current page number (page_no is 0-indexed)
+ # Create the actual image using process_single_page_for_image_conversion
+ _, created_image_path, _, _ = (
+ process_single_page_for_image_conversion(
+ pdf_path=file_path,
+ page_num=page_no, # page_no is already 0-indexed
+ image_dpi=IMAGES_DPI,
+ create_images=True,
+ input_folder=input_folder,
+ )
+ )
+
+ # Use the created image path if it exists
+ if os.path.exists(created_image_path):
+ actual_image_path = created_image_path
+ # Update image_path in page_sizes_df for future reference
+ if not page_sizes_df.empty:
+ page_sizes_df.loc[
+ page_sizes_df["page"] == (page_no + 1),
+ "image_path",
+ ] = created_image_path
+ print(
+ f"Created actual image for page {page_no + 1} from placeholder: {created_image_path}"
+ )
+ else:
+ print(
+ f"Warning: Failed to create image for page {page_no + 1} from placeholder"
+ )
+ except Exception as e:
+ print(
+ f"Error creating image from placeholder for OCR: {e}"
+ )
+ # Fall back to using the placeholder path (will likely fail, but preserves original behavior)
+ actual_image_path = image_path
+
+ print(
+ f"Performing OCR on page {page_no + 1} from {actual_image_path}"
+ )
+
+ (
+ page_word_level_ocr_results,
+ page_vlm_input_tokens,
+ page_vlm_output_tokens,
+ page_vlm_model_name,
+ ) = image_analyser.perform_ocr(
+ actual_image_path,
+ bedrock_runtime=bedrock_runtime,
+ gemini_client=gemini_client,
+ gemini_config=gemini_config,
+ azure_openai_client=azure_openai_client,
+ vlm_model_choice=CLOUD_VLM_MODEL_CHOICE,
+ inference_server_model_name=(
+ inference_server_vlm_model
+ if inference_server_vlm_model
+ else None
+ ),
+ page_index_0=page_no,
+ )
+
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += page_vlm_input_tokens
+ vlm_total_output_tokens += page_vlm_output_tokens
+ if page_vlm_model_name and not vlm_model_name:
+ vlm_model_name = page_vlm_model_name
+
+ if not cached_precombined:
+ (
+ page_line_level_ocr_results,
+ page_line_level_ocr_results_with_words,
+ ) = combine_ocr_results(
+ page_word_level_ocr_results, page=reported_page_number
+ )
+ # else: line/word structures already set from parallel local OCR worker
+
+ if all_page_line_level_ocr_results_with_words is None:
+ all_page_line_level_ocr_results_with_words = list()
+
+ all_page_line_level_ocr_results_with_words.append(
+ page_line_level_ocr_results_with_words
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "appended page OCR to aggregate list"
+ # )
+
+ # Optional additional Bedrock VLM pass to detect people
+ # and inject [FACE] entries into the word-level OCR structure for AWS Bedrock VLM OCR.
+ # Run when CUSTOM_VLM_FACES is selected (in any entity selector), or when AWS Textract
+ # + Face detection is selected (all pages analysed for faces regardless of PII method
+ # or text_extraction_only).
+ # (_run_face_pass, _textract_face_identification set at start of loop)
+ if (
+ not _skip_inline_custom_vlm_detection
+ and (
+ text_extraction_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION
+ or text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ or pii_identification_method == AWS_PII_OPTION
+ or pii_identification_method == AWS_LLM_PII_OPTION
+ )
+ and _run_face_pass
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ and bedrock_runtime is not None
+ ):
+ try:
+ _gui_tqdm_subphase(
+ ocr_progress_bar,
+ progress,
+ "Face detection (VLM)",
+ reported_page_number,
+ _ocr_gui_total_str,
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "optional — Bedrock VLM person detection"
+ # )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+
+ # Use Bedrock VLM for people detection
+ model_choice = (
+ CLOUD_VLM_MODEL_CHOICE if CLOUD_VLM_MODEL_CHOICE else None
+ )
+ normalised_coords_range = (
+ 999 # Full-page prompt uses 0-999 for all Bedrock models
+ )
+
+ people_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_people_only=True,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+
+ # Unpack tuple: (Dict[str, List], int, int, str) -> (ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name)
+ if (
+ isinstance(people_ocr_result, tuple)
+ and len(people_ocr_result) == 4
+ ):
+ (
+ people_ocr,
+ people_vlm_input_tokens,
+ people_vlm_output_tokens,
+ people_vlm_model_name,
+ ) = people_ocr_result
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += people_vlm_input_tokens
+ vlm_total_output_tokens += people_vlm_output_tokens
+ if people_vlm_model_name and not vlm_model_name:
+ vlm_model_name = people_vlm_model_name
+ else:
+ people_ocr = (
+ people_ocr_result[0]
+ if isinstance(people_ocr_result, tuple)
+ else people_ocr_result
+ )
+
+ # Convert people_ocr outputs into additional word-level entries
+ texts = people_ocr.get("text", [])
+ lefts = people_ocr.get("left", [])
+ tops = people_ocr.get("top", [])
+ widths = people_ocr.get("width", [])
+ heights = people_ocr.get("height", [])
+ confs = people_ocr.get("conf", [])
+
+ results_dict = page_line_level_ocr_results_with_words["results"]
+
+ # Determine a valid starting line number for synthetic [FACE] lines
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+
+ existing_keys = list(results_dict.keys())
+ person_index_start = len(existing_keys) + 1
+
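+                        # Only entries whose text is exactly "[FACE]" are injected as synthetic
+                        # word-level results; any other detector output is ignored.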
+ for idx, text in enumerate(texts):
+ if text != "[FACE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = float(confs[idx]) if idx < len(confs) else 0.0
+ except Exception:
+ continue
+
+ key = f"person_line_{person_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": "bedrock-vlm",
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM person detection failed on page {reported_page_number}: {e}"
+ )
+
+ # Optional additional Bedrock VLM pass to detect signatures
+ # and inject [SIGNATURE] entries into the word-level OCR structure for AWS Bedrock VLM OCR.
+ # Only run when the user has selected CUSTOM_VLM_SIGNATURE (in any entity selector).
+                _signature_selected = (
+                    "CUSTOM_VLM_SIGNATURE" in (chosen_redact_entities or [])
+                    or "CUSTOM_VLM_SIGNATURE"
+                    in (chosen_redact_comprehend_entities or [])
+                    or "CUSTOM_VLM_SIGNATURE" in (chosen_llm_entities or [])
+                )
+ if (
+ not _skip_inline_custom_vlm_detection
+ and (
+ text_extraction_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION
+ or text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ or pii_identification_method == AWS_PII_OPTION
+ or pii_identification_method == AWS_LLM_PII_OPTION
+ )
+ and _signature_selected
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ and bedrock_runtime is not None
+ ):
+ try:
+ _gui_tqdm_subphase(
+ ocr_progress_bar,
+ progress,
+ "Signature detection (VLM)",
+ reported_page_number,
+ _ocr_gui_total_str,
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "optional — Bedrock VLM signature detection"
+ # )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+
+ # Use Bedrock VLM for signature detection
+ model_choice = (
+ CLOUD_VLM_MODEL_CHOICE if CLOUD_VLM_MODEL_CHOICE else None
+ )
+ normalised_coords_range = (
+ 999 # Full-page prompt uses 0-999 for all Bedrock models
+ )
+
+ sig_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_signatures_only=True,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+
+ # Unpack tuple: (Dict[str, List], int, int, str) -> (ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name)
+ if (
+ isinstance(sig_ocr_result, tuple)
+ and len(sig_ocr_result) == 4
+ ):
+ (
+ sig_ocr,
+ sig_vlm_input_tokens,
+ sig_vlm_output_tokens,
+ sig_vlm_model_name,
+ ) = sig_ocr_result
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += sig_vlm_input_tokens
+ vlm_total_output_tokens += sig_vlm_output_tokens
+ if sig_vlm_model_name and not vlm_model_name:
+ vlm_model_name = sig_vlm_model_name
+ else:
+ sig_ocr = (
+ sig_ocr_result[0]
+ if isinstance(sig_ocr_result, tuple)
+ else sig_ocr_result
+ )
+
+ # Convert sig_ocr outputs into additional word-level entries
+ texts = sig_ocr.get("text", [])
+ lefts = sig_ocr.get("left", [])
+ tops = sig_ocr.get("top", [])
+ widths = sig_ocr.get("width", [])
+ heights = sig_ocr.get("height", [])
+ confs = sig_ocr.get("conf", [])
+
+ results_dict = page_line_level_ocr_results_with_words["results"]
+
+ # Determine a valid starting line number for synthetic [SIGNATURE] lines
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+
+ existing_keys = list(results_dict.keys())
+ sig_index_start = len(existing_keys) + 1
+
+ for idx, text in enumerate(texts):
+ if text != "[SIGNATURE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = float(confs[idx]) if idx < len(confs) else 0.0
+ except Exception:
+ continue
+
+ key = f"signature_line_{sig_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": "bedrock-vlm",
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM signature detection failed on page {reported_page_number}: {e}"
+ )
+
+ # Optional additional VLM / inference-server pass to detect people
+ # and inject [FACE] entries into the word-level OCR structure.
+ # Supports pure and hybrid VLM/inference-server local OCR models.
+ if (
+ not _skip_inline_custom_vlm_detection
+ and chosen_local_ocr_model
+ in [
+ "vlm",
+ "inference-server",
+ "hybrid-vlm",
+ "hybrid-paddle-vlm",
+ "hybrid-paddle-inference-server",
+ ]
+ and "CUSTOM_VLM_FACES" in chosen_redact_entities
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ ):
+ try:
+ _gui_tqdm_subphase(
+ ocr_progress_bar,
+ progress,
+ "Face detection (local VLM)",
+ reported_page_number,
+ _ocr_gui_total_str,
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # f"optional — local VLM/inference person detection ({chosen_local_ocr_model})"
+ # )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+
+ # Decide which backend to use for people detection
+ if chosen_local_ocr_model in [
+ "vlm",
+ "hybrid-vlm",
+ "hybrid-paddle-vlm",
+ ]:
+ people_ocr_result = _vlm_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=999,
+ output_folder=output_folder,
+ detect_people_only=True,
+ page_index_0=page_no,
+ )
+ else: # inference-server based hybrids
+ people_ocr_result = _inference_server_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=999,
+ output_folder=output_folder,
+ detect_people_only=True,
+ page_index_0=page_no,
+ )
+
+ # Unpack tuple: (Dict[str, List], int, int, str) -> (ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name)
+ if (
+ isinstance(people_ocr_result, tuple)
+ and len(people_ocr_result) == 4
+ ):
+ (
+ people_ocr,
+ people_vlm_input_tokens,
+ people_vlm_output_tokens,
+ people_vlm_model_name,
+ ) = people_ocr_result
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += people_vlm_input_tokens
+ vlm_total_output_tokens += people_vlm_output_tokens
+ if people_vlm_model_name and not vlm_model_name:
+ vlm_model_name = people_vlm_model_name
+ else:
+ people_ocr = (
+ people_ocr_result[0]
+ if isinstance(people_ocr_result, tuple)
+ else people_ocr_result
+ )
+
+ # Convert people_ocr outputs into additional word-level entries
+ texts = people_ocr.get("text", [])
+ lefts = people_ocr.get("left", [])
+ tops = people_ocr.get("top", [])
+ widths = people_ocr.get("width", [])
+ heights = people_ocr.get("height", [])
+ confs = people_ocr.get("conf", [])
+
+ results_dict = page_line_level_ocr_results_with_words["results"]
+
+ # Determine a valid starting line number for synthetic [FACE] lines
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+
+ existing_keys = list(results_dict.keys())
+ person_index_start = len(existing_keys) + 1
+
+ for idx, text in enumerate(texts):
+ if text != "[FACE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = float(confs[idx]) if idx < len(confs) else 0.0
+ except Exception:
+ continue
+
+ key = f"person_line_{person_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": chosen_local_ocr_model,
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: VLM person detection failed on page {reported_page_number}: {e}"
+ )
+
+ # Optional additional VLM / inference-server pass to detect signatures
+ # and inject [SIGNATURE] entries into the word-level OCR structure.
+ # Supports pure and hybrid VLM/inference-server local OCR models.
+ if (
+ not _skip_inline_custom_vlm_detection
+ and chosen_local_ocr_model
+ in [
+ "vlm",
+ "inference-server",
+ "hybrid-vlm",
+ "hybrid-paddle-vlm",
+ "hybrid-paddle-inference-server",
+ ]
+ and "CUSTOM_VLM_SIGNATURE" in chosen_redact_entities
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ ):
+ try:
+ _gui_tqdm_subphase(
+ ocr_progress_bar,
+ progress,
+ "Signature detection (local VLM)",
+ reported_page_number,
+ _ocr_gui_total_str,
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # f"optional — local VLM/inference signature detection ({chosen_local_ocr_model})"
+ # )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+
+ # Decide which backend to use for signature detection
+ if chosen_local_ocr_model in [
+ "vlm",
+ "hybrid-vlm",
+ "hybrid-paddle-vlm",
+ ]:
+ sig_ocr_result = _vlm_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=999,
+ output_folder=output_folder,
+ detect_signatures_only=True,
+ page_index_0=page_no,
+ )
+ else: # inference-server based hybrids
+ sig_ocr_result = _inference_server_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=999,
+ output_folder=output_folder,
+ detect_signatures_only=True,
+ page_index_0=page_no,
+ )
+
+ # Unpack tuple: (Dict[str, List], int, int, str) -> (ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name)
+ if (
+ isinstance(sig_ocr_result, tuple)
+ and len(sig_ocr_result) == 4
+ ):
+ (
+ sig_ocr,
+ sig_vlm_input_tokens,
+ sig_vlm_output_tokens,
+ sig_vlm_model_name,
+ ) = sig_ocr_result
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += sig_vlm_input_tokens
+ vlm_total_output_tokens += sig_vlm_output_tokens
+ if sig_vlm_model_name and not vlm_model_name:
+ vlm_model_name = sig_vlm_model_name
+ else:
+ sig_ocr = (
+ sig_ocr_result[0]
+ if isinstance(sig_ocr_result, tuple)
+ else sig_ocr_result
+ )
+
+ # Convert sig_ocr outputs into additional word-level entries
+ texts = sig_ocr.get("text", [])
+ lefts = sig_ocr.get("left", [])
+ tops = sig_ocr.get("top", [])
+ widths = sig_ocr.get("width", [])
+ heights = sig_ocr.get("height", [])
+ confs = sig_ocr.get("conf", [])
+
+ results_dict = page_line_level_ocr_results_with_words["results"]
+
+ # Determine a valid starting line number for synthetic [SIGNATURE] lines
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+
+ existing_keys = list(results_dict.keys())
+ sig_index_start = len(existing_keys) + 1
+
+ for idx, text in enumerate(texts):
+ if text != "[SIGNATURE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = float(confs[idx]) if idx < len(confs) else 0.0
+ except Exception:
+ continue
+
+ key = f"signature_line_{sig_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": chosen_local_ocr_model,
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: VLM signature detection failed on page {reported_page_number}: {e}"
+ )
+
+ # Check if page exists in existing textract data. If not, send to service to analyse
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract branch — entering page processing"
+ # )
+ # Use existing Textract ocr_results_with_words file if this page is present:
+ # load as page_line_level_ocr_results_with_words, recreate page_line_level_ocr_results,
+ # store in ocr_results_by_page, and skip all OCR (including hybrid textract-bedrock).
+ if (
+ textract_ocr_by_page_1based is not None
+ and (page_no + 1) in textract_ocr_by_page_1based
+ ):
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract — using cached per-page ocr_results_with_words (no API)"
+ # )
+ page_line_level_ocr_results_with_words = (
+ textract_ocr_by_page_1based[page_no + 1]
+ )
+ page_line_level_ocr_results = (
+ recreate_page_line_level_ocr_results_with_page(
+ page_line_level_ocr_results_with_words
+ )
+ )
+ if all_page_line_level_ocr_results_with_words is None:
+ all_page_line_level_ocr_results_with_words = list()
+ all_page_line_level_ocr_results_with_words.append(
+ page_line_level_ocr_results_with_words
+ )
+ # Ensure the line-level OCR outputs table is populated for cached pages too.
+ # Otherwise the UI `all_page_line_level_ocr_results_df_base` can remain empty
+ # even though `ocr_results_with_words` loaded successfully.
+ try:
+ page_1based = int(
+ page_line_level_ocr_results.get("page", page_no + 1)
+ )
+ except Exception:
+ page_1based = page_no + 1
+ if page_1based not in existing_line_level_pages_1based:
+ try:
+ _line_results = (
+ page_line_level_ocr_results.get("results", []) or []
+ )
+ page_text_ocr_outputs = pd.DataFrame(
+ [
+ {
+ "page": page_1based,
+ "text": getattr(r, "text", ""),
+ "left": getattr(r, "left", None),
+ "top": getattr(r, "top", None),
+ "width": getattr(r, "width", None),
+ "height": getattr(r, "height", None),
+ "line": getattr(r, "line", None),
+ "conf": getattr(r, "conf", None),
+ }
+ for r in _line_results
+ ]
+ )
+ if not page_text_ocr_outputs.empty:
+ all_line_level_ocr_results_list.append(
+ page_text_ocr_outputs
+ )
+ existing_line_level_pages_1based.add(page_1based)
+ except Exception as _e:
+ print(
+ f"Warning: Could not rebuild line-level OCR dataframe from cached Textract results for page {page_1based}: {_e}"
+ )
+ if page_no >= page_min and page_no < page_max:
+ ocr_results_by_page[page_no] = {
+ "page_line_level_ocr_results": page_line_level_ocr_results,
+ "page_line_level_ocr_results_with_words": page_line_level_ocr_results_with_words,
+ "page_signature_recogniser_results": page_signature_recogniser_results,
+ "page_handwriting_recogniser_results": page_handwriting_recogniser_results,
+ "handwriting_or_signature_boxes": handwriting_or_signature_boxes,
+ "image_path": image_path,
+ "pymupdf_page": pymupdf_page,
+ "original_cropbox": original_cropbox,
+ "page_width": page_width,
+ "page_height": page_height,
+ "image": image,
+ "reported_page_number": reported_page_number,
+ }
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract cached — stored ocr_results_by_page for second pass"
+ # )
+ # Skip parallel/sequential Textract and hybrid Bedrock for pages loaded from
+ # _ocr_results_with_words_textract.json (already finalised OCR).
+ continue
+
+ text_blocks = list()
+ page_exists = False
+
+ # Parallel Textract: build lightweight job and defer API call to post-loop worker.
+ # To keep memory usage low on large documents, we avoid storing full image
+ # objects or image bytes here; workers will reopen the image from disk.
+ if use_ocr_parallel_textract:
+ if not image:
+ try:
+ _pn, _ip, _w, _h = process_single_page_for_image_conversion(
+ file_path, page_no
+ )
+ normalized_path = os.path.normpath(
+ os.path.abspath(image_path)
+ )
+ if validate_path_containment(normalized_path, input_folder):
+ image = Image.open(normalized_path)
+ page_width, page_height = image.size
+ except Exception as e:
+ print(
+ f"Could not load image for Textract job page {reported_page_number}: {e}"
+ )
+ continue
+ page_exists = (
+ any(
+ pg.get("page_no") == reported_page_number
+ for pg in textract_data.get("pages", [])
+ )
+ if textract_data
+ else False
+ )
+ cached_text_blocks = None
+ if page_exists:
+ cached_text_blocks = next(
+ pg["data"]
+ for pg in textract_data["pages"]
+ if pg["page_no"] == reported_page_number
+ )
+ ocr_parallel_jobs_textract.append(
+ {
+ "page_no": page_no,
+ "reported_page_number": reported_page_number,
+ "cached_text_blocks": cached_text_blocks,
+ "page_width": page_width,
+ "page_height": page_height,
+ "image_path": image_path,
+ "original_cropbox": original_cropbox,
+ "handwriting_or_signature_boxes": handwriting_or_signature_boxes,
+ "page_signature_recogniser_results": page_signature_recogniser_results,
+ "page_handwriting_recogniser_results": page_handwriting_recogniser_results,
+ }
+ )
+ ocr_pymupdf_pages_textract[page_no] = pymupdf_page
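+                # Keep the PyMuPDF page handle keyed by page number so the post-loop
+                # parallel Textract pass can attach its results back to the correct page.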
+ _tb_note = (
+ "cached blocks"
+ if cached_text_blocks is not None
+ else "will call Textract API in parallel batch"
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # f"Textract — queued for parallel pass ({_tb_note})"
+ # )
+ continue
+
+ if not textract_data:
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract sequential — calling analyse_page_with_textract (no JSON loaded yet)"
+ # )
+ try:
+ # print(f"Image object: {image}")
+                    # Convert the page image to bytes using an in-memory buffer
+ image_buffer = io.BytesIO()
+ image.save(
+ image_buffer, format="PNG"
+ ) # Save as PNG, or adjust format if needed
+ pdf_page_as_bytes = image_buffer.getvalue()
+
+ text_blocks, new_textract_request_metadata = (
+ analyse_page_with_textract(
+ pdf_page_as_bytes,
+ reported_page_number,
+ textract_client,
+ handwrite_signature_checkbox,
+ )
+ ) # Analyse page with Textract
+
+ if textract_json_file_path not in log_files_output_paths:
+ log_files_output_paths.append(textract_json_file_path)
+
+ textract_data = {"pages": [text_blocks]}
+ except Exception as e:
+                    out_message = (
+                        "Textract extraction for page "
+                        + reported_page_number
+                        + " failed due to: "
+                        + str(e)
+                    )
+ textract_data = {"pages": []}
+ new_textract_request_metadata = "Failed Textract API call"
+ raise Exception(out_message)
+
+ textract_request_metadata.append(new_textract_request_metadata)
+
+ else:
+ # Check if the current reported_page_number exists in the loaded JSON
+ page_exists = any(
+ page["page_no"] == reported_page_number
+ for page in textract_data.get("pages", [])
+ )
+
+                if not page_exists:  # If the page does not exist, analyse again
+ print(
+ f"Page number {reported_page_number} not found in existing Textract data. Analysing."
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract sequential — page not in JSON, calling analyse_page_with_textract"
+ # )
+
+ try:
+ if not image:
+ page_num, image_path, width, height = (
+ process_single_page_for_image_conversion(
+ file_path, page_no
+ )
+ )
+
+ # Normalize and validate path safety before opening image
+ normalized_path = os.path.normpath(
+ os.path.abspath(image_path)
+ )
+ if validate_path_containment(
+ normalized_path, input_folder
+ ):
+ image = Image.open(normalized_path)
+ else:
+ raise ValueError(
+ f"Unsafe image path detected: {image_path}"
+ )
+
+                            # Convert the page image to bytes using an in-memory buffer
+ image_buffer = io.BytesIO()
+ image.save(
+ image_buffer, format="PNG"
+ ) # Save as PNG, or adjust format if needed
+ pdf_page_as_bytes = image_buffer.getvalue()
+
+ text_blocks, new_textract_request_metadata = (
+ analyse_page_with_textract(
+ pdf_page_as_bytes,
+ reported_page_number,
+ textract_client,
+ handwrite_signature_checkbox,
+ )
+ ) # Analyse page with Textract
+
+ # Check if "pages" key exists, if not, initialise it as an empty list
+ if "pages" not in textract_data:
+ textract_data["pages"] = list()
+
+ # Append the new page data
+ textract_data["pages"].append(text_blocks)
+
+ except Exception as e:
+                        out_message = (
+                            "Textract extraction for page "
+                            + reported_page_number
+                            + " failed due to: "
+                            + str(e)
+                        )
+ print(out_message)
+ text_blocks = list()
+ new_textract_request_metadata = "Failed Textract API call"
+
+ # Check if "pages" key exists, if not, initialise it as an empty list
+ if "pages" not in textract_data:
+ textract_data["pages"] = list()
+
+ raise Exception(out_message)
+
+ textract_request_metadata.append(new_textract_request_metadata)
+
+ else:
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract sequential — reusing text_blocks from loaded Textract JSON"
+ # )
+ # If the page exists, retrieve the data
+ text_blocks = next(
+ page["data"]
+ for page in textract_data["pages"]
+ if page["page_no"] == reported_page_number
+ )
+
+ # Use image dimensions for json_to_ocrresult so that OCR result coordinates
+ # are in image pixel space.
+ textract_page_width = page_width
+ textract_page_height = page_height
+
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract — json_to_ocrresult (blocks → line/word structures)"
+ # )
+ (
+ page_line_level_ocr_results,
+ handwriting_or_signature_boxes,
+ page_signature_recogniser_results,
+ page_handwriting_recogniser_results,
+ page_line_level_ocr_results_with_words,
+ selection_element_results,
+ form_key_value_results,
+ ) = json_to_ocrresult(
+ text_blocks,
+ textract_page_width,
+ textract_page_height,
+ reported_page_number,
+ )
+
+ # Hybrid Textract + Bedrock VLM: re-run low-confidence lines with Bedrock VLM
+ if (
+ hybrid_textract_bedrock_vlm
+ and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and image is not None
+ and bedrock_runtime is not None
+ and CLOUD_VLM_MODEL_CHOICE
+ ):
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # f"Textract hybrid — Bedrock VLM on low-confidence lines "
+ # f"(threshold={HYBRID_TEXTRACT_BEDROCK_VLM_CONFIDENCE_THRESHOLD}, "
+ # f"padding={HYBRID_TEXTRACT_BEDROCK_VLM_PADDING})"
+ # )
+ image_name_seq = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+ hybrid_result = _process_textract_page_with_hybrid_bedrock_vlm(
+ page_line_level_ocr_results,
+ page_line_level_ocr_results_with_words,
+ image,
+ textract_page_width,
+ textract_page_height,
+ HYBRID_TEXTRACT_BEDROCK_VLM_CONFIDENCE_THRESHOLD,
+ HYBRID_TEXTRACT_BEDROCK_VLM_PADDING,
+ bedrock_runtime,
+ CLOUD_VLM_MODEL_CHOICE,
+ output_folder,
+ image_name_seq,
+ )
+                if isinstance(hybrid_result, tuple) and len(hybrid_result) == 4:
+                    (
+                        page_line_level_ocr_results_with_words,
+                        hybrid_vlm_input_tokens,
+                        hybrid_vlm_output_tokens,
+                        hybrid_vlm_model_name,
+                    ) = hybrid_result
+                    vlm_total_input_tokens += hybrid_vlm_input_tokens
+                    vlm_total_output_tokens += hybrid_vlm_output_tokens
+                    if hybrid_vlm_model_name and not vlm_model_name:
+                        vlm_model_name = hybrid_vlm_model_name
+                else:
+                    page_line_level_ocr_results_with_words = (
+                        hybrid_result[0]
+                        if isinstance(hybrid_result, tuple)
+                        else hybrid_result
+                    )
+                    hybrid_vlm_input_tokens = 0
+                    hybrid_vlm_output_tokens = 0
+                    hybrid_vlm_model_name = ""
+
+ if all_page_line_level_ocr_results_with_words is None:
+ all_page_line_level_ocr_results_with_words = list()
+
+ all_page_line_level_ocr_results_with_words.append(
+ page_line_level_ocr_results_with_words
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract — appended page to aggregate ocr_results_with_words list"
+ # )
+
+ # Optional additional Bedrock VLM pass to detect people
+ # and inject [FACE] entries into the word-level OCR structure for AWS Textract.
+ if (
+ not _skip_inline_custom_vlm_detection
+ and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and "CUSTOM_VLM_FACES" in chosen_redact_entities
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ ):
+ try:
+ _gui_tqdm_subphase(
+ ocr_progress_bar,
+ progress,
+ "Face detection (VLM, Textract path)",
+ reported_page_number,
+ _ocr_gui_total_str,
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract optional — Bedrock VLM person detection"
+ # )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+
+ # Use Bedrock VLM for people detection
+ model_choice = CLOUD_VLM_MODEL_CHOICE
+ normalised_coords_range = (
+ 999 # Full-page prompt uses 0-999 for all Bedrock models
+ )
+
+ people_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_people_only=True,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+
+ # Unpack tuple: (Dict[str, List], int, int, str) -> (ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name)
+ if (
+ isinstance(people_ocr_result, tuple)
+ and len(people_ocr_result) == 4
+ ):
+ (
+ people_ocr,
+ people_vlm_input_tokens,
+ people_vlm_output_tokens,
+ people_vlm_model_name,
+ ) = people_ocr_result
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += people_vlm_input_tokens
+ vlm_total_output_tokens += people_vlm_output_tokens
+ if people_vlm_model_name and not vlm_model_name:
+ vlm_model_name = people_vlm_model_name
+ else:
+ people_ocr = (
+ people_ocr_result[0]
+ if isinstance(people_ocr_result, tuple)
+ else people_ocr_result
+ )
+
+ # Convert people_ocr outputs into additional word-level entries
+ texts = people_ocr.get("text", [])
+ lefts = people_ocr.get("left", [])
+ tops = people_ocr.get("top", [])
+ widths = people_ocr.get("width", [])
+ heights = people_ocr.get("height", [])
+ confs = people_ocr.get("conf", [])
+
+ results_dict = page_line_level_ocr_results_with_words["results"]
+
+ # Determine a valid starting line number for synthetic [FACE] lines
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+
+ existing_keys = list(results_dict.keys())
+ person_index_start = len(existing_keys) + 1
+
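+                    # Each detected [FACE] box is injected as its own synthetic one-word line
+                    # (keyed person_line_N) so it flows through the same word-level OCR
+                    # structures as ordinary text.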
+ for idx, text in enumerate(texts):
+ if text != "[FACE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = float(confs[idx]) if idx < len(confs) else 0.0
+ except Exception:
+ continue
+
+ key = f"person_line_{person_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": "bedrock-vlm",
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM person detection failed on page {reported_page_number}: {e}"
+ )
+
+ # Optional additional Bedrock VLM pass to detect signatures
+ # and inject [SIGNATURE] entries into the word-level OCR structure for AWS Textract.
+ if (
+ not _skip_inline_custom_vlm_detection
+ and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and "CUSTOM_VLM_SIGNATURE" in chosen_redact_entities
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ ):
+ try:
+ _gui_tqdm_subphase(
+ ocr_progress_bar,
+ progress,
+ "Signature detection (VLM, Textract path)",
+ reported_page_number,
+ _ocr_gui_total_str,
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "Textract optional — Bedrock VLM signature detection"
+ # )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+
+ # Use Bedrock VLM for signature detection
+ model_choice = CLOUD_VLM_MODEL_CHOICE
+ normalised_coords_range = (
+ 999 # Full-page prompt uses 0-999 for all Bedrock models
+ )
+
+ sig_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_signatures_only=True,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+
+ # Unpack tuple: (Dict[str, List], int, int, str) -> (ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name)
+ if (
+ isinstance(sig_ocr_result, tuple)
+ and len(sig_ocr_result) == 4
+ ):
+ (
+ sig_ocr,
+ sig_vlm_input_tokens,
+ sig_vlm_output_tokens,
+ sig_vlm_model_name,
+ ) = sig_ocr_result
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += sig_vlm_input_tokens
+ vlm_total_output_tokens += sig_vlm_output_tokens
+ if sig_vlm_model_name and not vlm_model_name:
+ vlm_model_name = sig_vlm_model_name
+ else:
+ sig_ocr = (
+ sig_ocr_result[0]
+ if isinstance(sig_ocr_result, tuple)
+ else sig_ocr_result
+ )
+
+ # Convert sig_ocr outputs into additional word-level entries
+ texts = sig_ocr.get("text", [])
+ lefts = sig_ocr.get("left", [])
+ tops = sig_ocr.get("top", [])
+ widths = sig_ocr.get("width", [])
+ heights = sig_ocr.get("height", [])
+ confs = sig_ocr.get("conf", [])
+
+ results_dict = page_line_level_ocr_results_with_words["results"]
+
+ # Determine a valid starting line number for synthetic [SIGNATURE] lines
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+
+ existing_keys = list(results_dict.keys())
+ sig_index_start = len(existing_keys) + 1
+
+ for idx, text in enumerate(texts):
+ if text != "[SIGNATURE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = float(confs[idx]) if idx < len(confs) else 0.0
+ except Exception:
+ continue
+
+ key = f"signature_line_{sig_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": "bedrock-vlm",
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM signature detection failed on page {reported_page_number}: {e}"
+ )
+
+ if selection_element_results:
+ selection_element_results_list.extend(selection_element_results)
+ if form_key_value_results:
+ form_key_value_results_list.extend(form_key_value_results)
+
+ # Convert to DataFrame and add to ongoing logging table
+ # Only process if page_line_level_ocr_results is initialized and has results
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "tail — line-level DataFrame for logging table"
+ # )
+ try:
+ if (
+ page_line_level_ocr_results
+ and "results" in page_line_level_ocr_results
+ ):
+ line_level_ocr_results_df = pd.DataFrame(
+ [
+ {
+ "page": page_line_level_ocr_results["page"],
+ "text": result.text,
+ "left": result.left,
+ "top": result.top,
+ "width": result.width,
+ "height": result.height,
+ "line": result.line,
+ "conf": result.conf,
+ }
+ for result in page_line_level_ocr_results["results"]
+ ]
+ )
+ else:
+ line_level_ocr_results_df = pd.DataFrame()
+ except (UnboundLocalError, NameError, KeyError):
+ # page_line_level_ocr_results not initialized or missing expected structure
+ line_level_ocr_results_df = pd.DataFrame()
+
+ if not line_level_ocr_results_df.empty: # Ensure there are records to add
+ all_line_level_ocr_results_list.extend(
+ line_level_ocr_results_df.to_dict("records")
+ )
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # f"tail — extended all_line_level_ocr_results_list ({len(line_level_ocr_results_df)} rows)"
+ # )
+
+ # Save OCR visualization with bounding boxes (works for all OCR methods)
+            if save_page_ocr_visualisations is True and text_extraction_method in (
+                TEXTRACT_TEXT_EXTRACT_OPTION,
+                LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+                BEDROCK_VLM_TEXT_EXTRACT_OPTION,
+            ):
+ if (
+ page_line_level_ocr_results_with_words
+ and "results" in page_line_level_ocr_results_with_words
+ ):
+ # Ensure image is set - if None, try to use image_path or file_path for image files
+ image_for_visualization = image
+ if image_for_visualization is None:
+ # If image is None and input file is an image file, try to use image_path or file_path
+ if is_pdf(file_path) is False:
+ if isinstance(image_path, str) and image_path:
+ # Try to use image_path if it's a valid string
+ image_for_visualization = image_path
+ elif isinstance(file_path, str) and file_path:
+ # Fall back to using the original file_path for image files
+ image_for_visualization = file_path
+ else:
+ # For PDF files, we need an image object or image path
+ if isinstance(image_path, str) and image_path:
+ image_for_visualization = image_path
+
+ # Only proceed if we have a valid image or image path
+ if image_for_visualization is not None:
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "tail — visualise_ocr_words_bounding_boxes"
+ # )
+ # Store the length before the call to detect new additions
+ log_files_output_paths_length_before = len(
+ log_files_output_paths
+ )
+ log_files_output_paths = visualise_ocr_words_bounding_boxes(
+ image_for_visualization,
+ page_line_level_ocr_results_with_words["results"],
+ image_name=f"{file_name}_{reported_page_number}",
+ output_folder=output_folder,
+ text_extraction_method=text_extraction_method,
+ chosen_local_ocr_model=chosen_local_ocr_model,
+ log_files_output_paths=log_files_output_paths,
+ textract_hybrid_bedrock_used=hybrid_textract_bedrock_vlm,
+ )
+ # If config is enabled and a new visualization file was added, add it to out_file_paths
+ if (
+ INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES
+ and log_files_output_paths is not None
+ and len(log_files_output_paths)
+ > log_files_output_paths_length_before
+ ):
+ # Get the newly added visualization file path (last item in the list)
+ new_visualisation_path = log_files_output_paths[-1]
+ if new_visualisation_path not in out_file_paths:
+ out_file_paths.append(new_visualisation_path)
+ else:
+ print(
+ f"Warning: Could not determine image for visualization at page {reported_page_number}. Skipping visualization."
+ )
+
+ # Store OCR results and page metadata for second pass (PII detection)
+ if page_no >= page_min and page_no < page_max:
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "tail — storing ocr_results_by_page for second pass"
+ # )
+ ocr_results_by_page[page_no] = {
+ "page_line_level_ocr_results": page_line_level_ocr_results,
+ "page_line_level_ocr_results_with_words": page_line_level_ocr_results_with_words,
+ "page_signature_recogniser_results": page_signature_recogniser_results,
+ "page_handwriting_recogniser_results": page_handwriting_recogniser_results,
+ "handwriting_or_signature_boxes": handwriting_or_signature_boxes,
+ "image_path": image_path,
+ "pymupdf_page": pymupdf_page,
+ "original_cropbox": original_cropbox,
+ "page_width": page_width,
+ "page_height": page_height,
+ "image": image,
+ "reported_page_number": reported_page_number,
+ }
+ # else:
+ # print(
+ # f"[Performing image-based processing] page {reported_page_number}/{_ocr_gui_total_str}: "
+ # "tail — page outside range, skipped ocr_results_by_page"
+ # )
+
+ # Skip PII detection and redaction in first pass - we'll do it in second pass
+ # Continue to next page for OCR
+ continue
+
+ # Parallel Textract first pass: run analyse_page_with_textract in worker threads, then merge in page order
+ if use_ocr_parallel_textract and ocr_parallel_jobs_textract:
+ num_textract_pages = len(ocr_parallel_jobs_textract)
+ print(
+ f"First pass: Running AWS Textract on {num_textract_pages} pages (parallel), then processing results..."
+ )
+ try:
+ progress(
+ 0.45,
+ desc=f"Running AWS Textract on {num_textract_pages} pages (parallel)",
+ )
+ except Exception:
+ pass
+
+ def _textract_first_pass_worker(job):
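+            """Run Textract for a single page (or reuse cached blocks) and convert the
+            response into line/word OCR structures via json_to_ocrresult.
+
+            Returns a 9-tuple: (text blocks, request metadata, line-level OCR results,
+            handwriting/signature boxes, signature recogniser results, handwriting
+            recogniser results, word-level OCR results, selection element results,
+            form key/value results), mirroring the values produced in the sequential
+            Textract path.
+            """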
+ if job.get("cached_text_blocks") is not None:
+ text_blocks = {
+ "page_no": job["reported_page_number"],
+ "data": job["cached_text_blocks"],
+ }
+ new_textract_request_metadata = "cached"
+ else:
+ # Reopen image from path and convert to bytes inside the worker to avoid
+ # holding all page image bytes in memory in the main process.
+ image_path = job.get("image_path", "")
+ page_no_str = job.get("reported_page_number", "")
+ try:
+ if isinstance(image_path, str) and image_path:
+ normalized_path = os.path.normpath(os.path.abspath(image_path))
+ image = Image.open(normalized_path)
+ image_buffer = io.BytesIO()
+ image.save(image_buffer, format="PNG")
+ pdf_page_as_bytes = image_buffer.getvalue()
+ else:
+ raise ValueError(
+ f"Invalid image_path for Textract job page {page_no_str}: {image_path}"
+ )
+ except Exception as e:
+ raise Exception(
+ f"Could not open image for Textract job page {page_no_str}: {e}"
+ )
+
+ text_blocks, new_textract_request_metadata = analyse_page_with_textract(
+ pdf_page_as_bytes,
+ job["reported_page_number"],
+ textract_client,
+ handwrite_signature_checkbox,
+ )
+ (
+ page_line_level_ocr_results,
+ handwriting_or_signature_boxes,
+ page_signature_recogniser_results,
+ page_handwriting_recogniser_results,
+ page_line_level_ocr_results_with_words,
+ selection_element_results,
+ form_key_value_results,
+ ) = json_to_ocrresult(
+ text_blocks,
+ job["page_width"],
+ job["page_height"],
+ job["reported_page_number"],
+ )
+ return (
+ text_blocks,
+ new_textract_request_metadata,
+ page_line_level_ocr_results,
+ handwriting_or_signature_boxes,
+ page_signature_recogniser_results,
+ page_handwriting_recogniser_results,
+ page_line_level_ocr_results_with_words,
+ selection_element_results,
+ form_key_value_results,
+ )
+
+ max_workers = min(_ocr_first_pass_max_workers, len(ocr_parallel_jobs_textract))
+ results = [None] * len(ocr_parallel_jobs_textract)
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ future_to_index = {
+ executor.submit(_textract_first_pass_worker, job): idx
+ for idx, job in enumerate(ocr_parallel_jobs_textract)
+ }
+ with tqdm(
+ as_completed(future_to_index),
+ total=len(future_to_index),
+ unit="pages",
+ desc="AWS Textract OCR calls",
+ ) as textract_pbar:
+ for future in textract_pbar:
+ idx = future_to_index[future]
+ results[idx] = future.result()
+
+ # Merge parallel results into existing loaded data so we don't overwrite
+ # pages that were in the JSON but not in this run (e.g. EFFICIENT_OCR only processes OCR pages).
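+        # Freshly processed pages replace any stale copy with the same page number;
+        # pages present only in the previously loaded JSON are kept unchanged.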
+ new_pages_from_parallel = [r[0] for r in results]
+ pages_by_no = {str(pg.get("page_no")): pg for pg in new_pages_from_parallel}
+ existing_pages = textract_data.get("pages", [])
+ seen_page_nos = set()
+ merged_pages = []
+ for pg in existing_pages:
+ pno = str(pg.get("page_no"))
+ seen_page_nos.add(pno)
+ merged_pages.append(pages_by_no.get(pno, pg))
+ for pg in new_pages_from_parallel:
+ pno = str(pg.get("page_no"))
+ if pno not in seen_page_nos:
+ merged_pages.append(pg)
+ seen_page_nos.add(pno)
+ merged_pages.sort(key=lambda p: int(p.get("page_no", 0)))
+ textract_data = {"pages": merged_pages}
+ textract_request_metadata.extend(r[1] for r in results)
+ if (
+ textract_json_file_path
+ and textract_json_file_path not in log_files_output_paths
+ ):
+ log_files_output_paths.append(textract_json_file_path)
+
+ if all_page_line_level_ocr_results_with_words is None:
+ all_page_line_level_ocr_results_with_words = list()
+
+ print("First pass: Processing Textract results per page...")
+ try:
+ progress(0.5, desc="Processing Textract results per page")
+ except Exception:
+ pass
+ post_process_pbar = tqdm(
+ zip(ocr_parallel_jobs_textract, results),
+ total=len(results),
+ unit="pages",
+ desc="Processing Textract OCR (per page)",
+ )
+ for job, result in post_process_pbar:
+ (
+ _text_blocks,
+ _metadata,
+ page_line_level_ocr_results,
+ handwriting_or_signature_boxes,
+ page_signature_recogniser_results,
+ page_handwriting_recogniser_results,
+ page_line_level_ocr_results_with_words,
+ selection_element_results,
+ form_key_value_results,
+ ) = result
+ page_no = job["page_no"]
+ reported_page_number = job["reported_page_number"]
+ post_process_pbar.set_postfix_str(f"page {reported_page_number}")
+ image_path = job["image_path"]
+ # Reopen image on demand instead of storing it in every job to reduce memory usage.
+ try:
+ if isinstance(image_path, str) and image_path:
+ normalized_path = os.path.normpath(os.path.abspath(image_path))
+ image = Image.open(normalized_path)
+ else:
+ image = None
+ except Exception:
+ image = None
+ page_width = job["page_width"]
+ page_height = job["page_height"]
+ original_cropbox = job["original_cropbox"]
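+            # PyMuPDF page objects are not stored in the parallel job dicts;
+            # look them up here by page index instead.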
+ pymupdf_page = ocr_pymupdf_pages_textract[page_no]
+
+ # Hybrid Textract + Bedrock VLM: re-run low-confidence lines with Bedrock VLM
+ if (
+ hybrid_textract_bedrock_vlm
+ and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and image is not None
+ and CLOUD_VLM_MODEL_CHOICE
+ and bedrock_runtime is not None
+ ):
+ print(
+ f"Hybrid Textract + Bedrock VLM: re-running low-confidence lines for page {reported_page_number} (threshold={HYBRID_TEXTRACT_BEDROCK_VLM_CONFIDENCE_THRESHOLD}, padding={HYBRID_TEXTRACT_BEDROCK_VLM_PADDING})"
+ )
+ image_name_par = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+ hybrid_result_par = _process_textract_page_with_hybrid_bedrock_vlm(
+ page_line_level_ocr_results,
+ page_line_level_ocr_results_with_words,
+ image,
+ page_width,
+ page_height,
+ HYBRID_TEXTRACT_BEDROCK_VLM_CONFIDENCE_THRESHOLD,
+ HYBRID_TEXTRACT_BEDROCK_VLM_PADDING,
+ bedrock_runtime,
+ CLOUD_VLM_MODEL_CHOICE,
+ output_folder,
+ image_name_par,
+ )
+                if isinstance(hybrid_result_par, tuple) and len(hybrid_result_par) == 4:
+                    (
+                        page_line_level_ocr_results_with_words,
+                        hybrid_vlm_input_tokens_par,
+                        hybrid_vlm_output_tokens_par,
+                        hybrid_vlm_model_name_par,
+                    ) = hybrid_result_par
+                    # Accumulate VLM token usage from the hybrid pass
+                    vlm_total_input_tokens += hybrid_vlm_input_tokens_par
+                    vlm_total_output_tokens += hybrid_vlm_output_tokens_par
+                    if hybrid_vlm_model_name_par and not vlm_model_name:
+                        vlm_model_name = hybrid_vlm_model_name_par
+                else:
+                    page_line_level_ocr_results_with_words = (
+                        hybrid_result_par[0]
+                        if isinstance(hybrid_result_par, tuple)
+                        else hybrid_result_par
+                    )
+                    hybrid_vlm_input_tokens_par = 0
+                    hybrid_vlm_output_tokens_par = 0
+                    hybrid_vlm_model_name_par = ""
+
+ all_page_line_level_ocr_results_with_words.append(
+ page_line_level_ocr_results_with_words
+ )
+ if selection_element_results:
+ selection_element_results_list.extend(selection_element_results)
+ if form_key_value_results:
+ form_key_value_results_list.extend(form_key_value_results)
+
+ # Textract person/signature VLM injection (same logic as sequential path)
+ if (
+ not _skip_inline_custom_vlm_detection
+ and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and "CUSTOM_VLM_FACES" in chosen_redact_entities
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ ):
+ try:
+ _pp_total = str(
+ post_process_pbar.total
+ if getattr(post_process_pbar, "total", None)
+ else len(results)
+ )
+ _gui_tqdm_subphase(
+ post_process_pbar,
+ progress,
+ "Face detection (VLM, Textract parallel)",
+ str(reported_page_number),
+ _pp_total,
+ )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+ model_choice = CLOUD_VLM_MODEL_CHOICE
+ normalised_coords_range = (
+ 999 # Full-page prompt uses 0-999 for all Bedrock models
+ )
+ people_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_people_only=True,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+ if (
+ isinstance(people_ocr_result, tuple)
+ and len(people_ocr_result) == 4
+ ):
+ (
+ people_ocr,
+ people_vlm_input_tokens,
+ people_vlm_output_tokens,
+ people_vlm_model_name,
+ ) = people_ocr_result
+ vlm_total_input_tokens += people_vlm_input_tokens
+ vlm_total_output_tokens += people_vlm_output_tokens
+ if people_vlm_model_name and not vlm_model_name:
+ vlm_model_name = people_vlm_model_name
+ else:
+ people_ocr = (
+ people_ocr_result[0]
+ if isinstance(people_ocr_result, tuple)
+ else people_ocr_result
+ )
+ texts = people_ocr.get("text", [])
+ lefts = people_ocr.get("left", [])
+ tops = people_ocr.get("top", [])
+ widths = people_ocr.get("width", [])
+ heights = people_ocr.get("height", [])
+ confs = people_ocr.get("conf", [])
+ results_dict = page_line_level_ocr_results_with_words["results"]
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+ existing_keys = list(results_dict.keys())
+ person_index_start = len(existing_keys) + 1
+ for idx, text in enumerate(texts):
+ if text != "[FACE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = float(confs[idx]) if idx < len(confs) else 0.0
+ except Exception:
+ continue
+ key = f"person_line_{person_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": "bedrock-vlm",
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM person detection failed on page {reported_page_number}: {e}"
+ )
+
+ if (
+ not _skip_inline_custom_vlm_detection
+ and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and "CUSTOM_VLM_SIGNATURE" in chosen_redact_entities
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ ):
+ try:
+ _pp_total_sig = str(
+ post_process_pbar.total
+ if getattr(post_process_pbar, "total", None)
+ else len(results)
+ )
+ _gui_tqdm_subphase(
+ post_process_pbar,
+ progress,
+ "Signature detection (VLM, Textract parallel)",
+ str(reported_page_number),
+ _pp_total_sig,
+ )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+ model_choice = CLOUD_VLM_MODEL_CHOICE
+ normalised_coords_range = (
+ 999 # Full-page prompt uses 0-999 for all Bedrock models
+ )
+ sig_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_signatures_only=True,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+ if isinstance(sig_ocr_result, tuple) and len(sig_ocr_result) == 4:
+ (
+ sig_ocr,
+ sig_vlm_input_tokens,
+ sig_vlm_output_tokens,
+ sig_vlm_model_name,
+ ) = sig_ocr_result
+ vlm_total_input_tokens += sig_vlm_input_tokens
+ vlm_total_output_tokens += sig_vlm_output_tokens
+ if sig_vlm_model_name and not vlm_model_name:
+ vlm_model_name = sig_vlm_model_name
+ else:
+ sig_ocr = (
+ sig_ocr_result[0]
+ if isinstance(sig_ocr_result, tuple)
+ else sig_ocr_result
+ )
+ texts = sig_ocr.get("text", [])
+ lefts = sig_ocr.get("left", [])
+ tops = sig_ocr.get("top", [])
+ widths = sig_ocr.get("width", [])
+ heights = sig_ocr.get("height", [])
+ confs = sig_ocr.get("conf", [])
+ results_dict = page_line_level_ocr_results_with_words["results"]
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+ existing_keys = list(results_dict.keys())
+ sig_index_start = len(existing_keys) + 1
+ for idx, text in enumerate(texts):
+ if text != "[SIGNATURE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = float(confs[idx]) if idx < len(confs) else 0.0
+ except Exception:
+ continue
+ key = f"signature_line_{sig_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": "bedrock-vlm",
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM signature detection failed on page {reported_page_number}: {e}"
+ )
+
+ try:
+ if (
+ page_line_level_ocr_results
+ and "results" in page_line_level_ocr_results
+ ):
+ line_level_ocr_results_df = pd.DataFrame(
+ [
+ {
+ "page": page_line_level_ocr_results["page"],
+ "text": result.text,
+ "left": result.left,
+ "top": result.top,
+ "width": result.width,
+ "height": result.height,
+ "line": result.line,
+ "conf": result.conf,
+ }
+ for result in page_line_level_ocr_results["results"]
+ ]
+ )
+ else:
+ line_level_ocr_results_df = pd.DataFrame()
+ except (UnboundLocalError, NameError, KeyError):
+ line_level_ocr_results_df = pd.DataFrame()
+
+ if not line_level_ocr_results_df.empty:
+ all_line_level_ocr_results_list.extend(
+ line_level_ocr_results_df.to_dict("records")
+ )
+
+ if (
+ text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and save_page_ocr_visualisations is True
+ ):
+ if (
+ page_line_level_ocr_results_with_words
+ and "results" in page_line_level_ocr_results_with_words
+ ):
+ image_for_visualization = image
+ if image_for_visualization is None:
+ if is_pdf(file_path) is False:
+ if isinstance(image_path, str) and image_path:
+ image_for_visualization = image_path
+ elif isinstance(file_path, str) and file_path:
+ image_for_visualization = file_path
+ else:
+ if isinstance(image_path, str) and image_path:
+ image_for_visualization = image_path
+ if image_for_visualization is not None:
+ log_files_output_paths_length_before = len(
+ log_files_output_paths
+ )
+ log_files_output_paths = visualise_ocr_words_bounding_boxes(
+ image_for_visualization,
+ page_line_level_ocr_results_with_words["results"],
+ image_name=f"{file_name}_{reported_page_number}",
+ output_folder=output_folder,
+ text_extraction_method=text_extraction_method,
+ chosen_local_ocr_model=chosen_local_ocr_model,
+ log_files_output_paths=log_files_output_paths,
+ textract_hybrid_bedrock_used=hybrid_textract_bedrock_vlm,
+ )
+ if (
+ INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES
+ and log_files_output_paths is not None
+ and len(log_files_output_paths)
+ > log_files_output_paths_length_before
+ ):
+ new_visualisation_path = log_files_output_paths[-1]
+ if new_visualisation_path not in out_file_paths:
+ out_file_paths.append(new_visualisation_path)
+ else:
+ print(
+ f"Warning: Could not determine image for visualization at page {reported_page_number}. Skipping visualization."
+ )
+
+ ocr_results_by_page[page_no] = {
+ "page_line_level_ocr_results": page_line_level_ocr_results,
+ "page_line_level_ocr_results_with_words": page_line_level_ocr_results_with_words,
+ "page_signature_recogniser_results": page_signature_recogniser_results,
+ "page_handwriting_recogniser_results": page_handwriting_recogniser_results,
+ "handwriting_or_signature_boxes": handwriting_or_signature_boxes,
+ "image_path": image_path,
+ "pymupdf_page": pymupdf_page,
+ "original_cropbox": original_cropbox,
+ "page_width": page_width,
+ "page_height": page_height,
+ "image": image,
+ "reported_page_number": reported_page_number,
+ }
+
+ # SECOND PASS: Perform PII detection on all pages using stored OCR results
+ print("Second pass: Performing PII detection on all pages...")
+
+ # Optional: run PII detection in parallel for Local and AWS Comprehend (skip when CUSTOM_VLM_FACES is used).
+ # IMPORTANT: never parallelise PII when using inference-server or local-transformers LLM backends.
+ pii_results_by_page_image = {}
+ _use_parallel_image_pii = (
+ pii_identification_method in (LOCAL_PII_OPTION, AWS_PII_OPTION)
+ and pii_identification_method
+ not in (INFERENCE_SERVER_PII_OPTION, LOCAL_TRANSFORMERS_LLM_PII_OPTION)
+ and (chosen_redact_entities or chosen_redact_comprehend_entities)
+ and not (
+ pii_identification_method == AWS_PII_OPTION
+ and "CUSTOM_VLM_FACES" in (chosen_redact_comprehend_entities or [])
+ )
+ )
+ if _use_parallel_image_pii and ocr_results_by_page:
+ _pii_pages = (
+ list(page_loop_pages)
+ if page_loop_pages is not None
+ else list(range(page_loop_start, page_loop_end))
+ )
+ _pii_jobs = []
+ for _pno in _pii_pages:
+ if _pno not in ocr_results_by_page:
+ continue
+ _pdata = ocr_results_by_page[_pno]
+ _plocr = _pdata.get("page_line_level_ocr_results")
+ _plocrw = _pdata.get("page_line_level_ocr_results_with_words")
+ if (
+ not _plocr
+ or not isinstance(_plocr, dict)
+ or "results" not in _plocr
+ or not _plocrw
+ ):
+ continue
+ if isinstance(_plocrw, list):
+ if (
+ not _plocrw
+ or not isinstance(_plocrw[0], dict)
+ or "results" not in _plocrw[0]
+ ):
+ continue
+ elif not isinstance(_plocrw, dict) or "results" not in _plocrw:
+ continue
+ _pii_jobs.append((_pno, _pdata))
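+        # These kwargs stay empty: the parallel path only covers Local and AWS Comprehend PII,
+        # and analyzer kwargs are only populated for the inference-server and local-transformers
+        # backends in the sequential loop below.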
+ text_analyzer_kwargs_parallel = {}
+ if _pii_jobs:
+ _n = len(_pii_jobs)
+ _workers = min(MAX_WORKERS, _n)
+ with ThreadPoolExecutor(max_workers=_workers) as executor:
+ _results = list(
+ tqdm(
+ executor.map(
+ lambda item: _run_image_pii_for_one_page(
+ item[0],
+ item[1],
+ image_analyser,
+ chosen_redact_comprehend_entities,
+ chosen_llm_entities,
+ pii_identification_method,
+ comprehend_client,
+ bedrock_runtime,
+ chosen_redact_entities,
+ language,
+ allow_list,
+ score_threshold,
+ nlp_analyser,
+ custom_llm_instructions,
+ file_name,
+ text_analyzer_kwargs_parallel,
+ ),
+ _pii_jobs,
+ ),
+ total=_n,
+ unit="pages",
+ desc="Detecting PII (parallel, image path)",
+ )
+ )
+ for (
+ _pno,
+ _boxes,
+ _cq,
+ _llm_name,
+ _llm_in,
+ _llm_out,
+ ) in _results:
+ pii_results_by_page_image[_pno] = (
+ _boxes,
+ _cq,
+ _llm_name,
+ _llm_in,
+ _llm_out,
+ )
+
+ # Precompute redact_whole_page set for O(1) membership in the loop (same speedup as redact_text_pdf)
+ redact_whole_page_set_image = set()
+ if redact_whole_page_list:
+ for _p in redact_whole_page_list:
+ try:
+ redact_whole_page_set_image.add(int(_p))
+ except (TypeError, ValueError):
+ redact_whole_page_set_image.add(_p)
+
+ if page_loop_pages is not None:
+ pii_progress_bar = tqdm(
+ page_loop_pages,
+ unit="pages",
+ desc=(
+ "Applying redactions to pages"
+ if pii_results_by_page_image
+ else "Detecting PII (following image-based OCR)"
+ ),
+ )
+ else:
+ pii_progress_bar = tqdm(
+ range(page_loop_start, page_loop_end),
+ unit="pages remaining",
+ desc=(
+ "Applying redactions to pages"
+ if pii_results_by_page_image
+ else "Detecting PII (following image-based OCR)"
+ ),
+ )
+
+ _pii_gui_total_pages = (
+ len(page_loop_pages)
+ if page_loop_pages is not None
+ else max(0, page_loop_end - page_loop_start)
+ )
+ _pii_gui_total_str = str(_pii_gui_total_pages) if _pii_gui_total_pages else "?"
+
+ # Initialize redacted_image - will be updated inside loop for image files
+ redacted_image = None
+
+ for page_no in pii_progress_bar:
+
+ reported_page_number = str(page_no + 1)
+ # print(f"PII Detection - Current page: {reported_page_number}")
+
+ # Retrieve stored OCR results and page metadata
+ if page_no not in ocr_results_by_page:
+ print(
+ f"Warning: No OCR results found for page {reported_page_number}, skipping PII detection"
+ )
+ continue
+
+ page_data = ocr_results_by_page[page_no]
+ page_line_level_ocr_results = page_data["page_line_level_ocr_results"]
+ page_line_level_ocr_results_with_words = page_data[
+ "page_line_level_ocr_results_with_words"
+ ]
+ page_signature_recogniser_results = page_data[
+ "page_signature_recogniser_results"
+ ]
+ page_handwriting_recogniser_results = page_data[
+ "page_handwriting_recogniser_results"
+ ]
+ handwriting_or_signature_boxes = page_data["handwriting_or_signature_boxes"]
+ image_path = page_data["image_path"]
+ pymupdf_page = page_data["pymupdf_page"]
+ original_cropbox = page_data["original_cropbox"]
+ page_width = page_data["page_width"]
+ page_height = page_data["page_height"]
+ image = page_data["image"]
+ reported_page_number = page_data["reported_page_number"]
+
+ try:
+ pii_progress_bar.set_postfix_str(
+ f"PII · page {reported_page_number}/{_pii_gui_total_str}",
+ refresh=False,
+ )
+ except Exception:
+ pass
+
+ # Initialize redacted_image for image files as fallback (will be updated if redactions are applied)
+ if is_pdf(file_path) is False and redacted_image is None:
+ # Try to get image from image_path or use the image from page_data
+ if isinstance(image_path, str):
+ try:
+ normalized_path = os.path.normpath(os.path.abspath(image_path))
+ is_gradio_temp = (
+ "gradio" in normalized_path.lower()
+ and "temp" in normalized_path.lower()
+ )
+ if is_gradio_temp or validate_path_containment(
+ normalized_path, input_folder
+ ):
+ redacted_image = Image.open(normalized_path)
+ else:
+ redacted_image = image if image is not None else None
+ except Exception as e:
+ print(f"Error loading image for redacted_image fallback: {e}")
+ redacted_image = image if image is not None else None
+ elif isinstance(image_path, Image.Image):
+ redacted_image = image_path
+ else:
+ redacted_image = image if image is not None else None
+
+ page_image_annotations = {"image": image_path, "boxes": []}
+ page_break_return = False
+
+ # Skip if OCR results are missing or invalid
+ if (
+ not page_line_level_ocr_results
+ or not isinstance(page_line_level_ocr_results, dict)
+ or "results" not in page_line_level_ocr_results
+ or not page_line_level_ocr_results_with_words
+ or not isinstance(page_line_level_ocr_results_with_words, dict)
+ or "results" not in page_line_level_ocr_results_with_words
+ ):
+ print(
+ f"Warning: Missing or invalid OCR results for page {reported_page_number}, skipping PII detection"
+ )
+ # Still need to handle page_image_annotations and current_loop_page for consistency
+ page_image_annotations = {"image": image_path, "boxes": []}
+ # Check if the image_path already exists in annotations_all_pages
+ existing_index = next(
+ (
+ index
+ for index, ann in enumerate(annotations_all_pages)
+ if ann.get("image") == page_image_annotations["image"]
+ ),
+ None,
+ )
+ if existing_index is not None:
+ annotations_all_pages[existing_index] = page_image_annotations
+ else:
+ annotations_all_pages.append(page_image_annotations)
+ current_loop_page += 1
+ continue
+
+ # Include VLM (face/signature) boxes in outputs even when "Only extract text (no redaction)"
+ _include_vlm_boxes_in_outputs = _run_face_pass or _textract_face_identification
+ _vlm_boxes_only_no_redaction = (
+ False # set True when we add VLM boxes to outputs only (no PDF redaction)
+ )
+ if (
+ pii_identification_method != NO_REDACTION_PII_OPTION
+ or RETURN_PDF_FOR_REVIEW is True
+ or _include_vlm_boxes_in_outputs
+ ):
+ page_redaction_bounding_boxes = list()
+ comprehend_query_number_new = 0
+ redact_whole_page = False
+
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
+ # Step 2: Analyse text and identify PII
+ if chosen_redact_entities or chosen_redact_comprehend_entities:
+
+ # Set up inference server or local transformers parameters for PII detection
+ text_analyzer_kwargs = {}
+ if pii_identification_method == INFERENCE_SERVER_PII_OPTION:
+ text_analyzer_kwargs["inference_method"] = "inference-server"
+ text_analyzer_kwargs["api_url"] = INFERENCE_SERVER_API_URL
+ # Use INFERENCE_SERVER_LLM_PII_MODEL_CHOICE for inference server PII detection
+ text_analyzer_kwargs["model_choice"] = (
+ INFERENCE_SERVER_LLM_PII_MODEL_CHOICE
+ )
+ elif pii_identification_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION:
+ # Set up local transformers LLM parameters
+ text_analyzer_kwargs["inference_method"] = "local"
+ # Use LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE as default model for local transformers
+ text_analyzer_kwargs["model_choice"] = (
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ )
+
+ # Optional additional Bedrock VLM pass to detect people
+ # and inject [FACE] entries into the word-level OCR structure for AWS Comprehend.
+ if (
+ not _skip_inline_custom_vlm_detection
+ and pii_identification_method == AWS_PII_OPTION
+ and "CUSTOM_VLM_FACES" in chosen_redact_comprehend_entities
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ ):
+ try:
+ _gui_tqdm_subphase(
+ pii_progress_bar,
+ progress,
+ "Face detection (VLM, during PII)",
+ str(reported_page_number),
+ _pii_gui_total_str,
+ )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+
+ # Use Bedrock VLM for people detection
+ model_choice = CLOUD_VLM_MODEL_CHOICE
+ normalised_coords_range = 999 # Full-page prompt uses 0-999 for all Bedrock models
+
+ people_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_people_only=True,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+
+ # Unpack tuple: (Dict[str, List], int, int, str) -> (ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name)
+ if (
+ isinstance(people_ocr_result, tuple)
+ and len(people_ocr_result) == 4
+ ):
+ (
+ people_ocr,
+ people_vlm_input_tokens,
+ people_vlm_output_tokens,
+ people_vlm_model_name,
+ ) = people_ocr_result
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += people_vlm_input_tokens
+ vlm_total_output_tokens += people_vlm_output_tokens
+ if people_vlm_model_name and not vlm_model_name:
+ vlm_model_name = people_vlm_model_name
+ else:
+ people_ocr = (
+ people_ocr_result[0]
+ if isinstance(people_ocr_result, tuple)
+ else people_ocr_result
+ )
+
+ # Convert people_ocr outputs into additional word-level entries
+ texts = people_ocr.get("text", [])
+ lefts = people_ocr.get("left", [])
+ tops = people_ocr.get("top", [])
+ widths = people_ocr.get("width", [])
+ heights = people_ocr.get("height", [])
+ confs = people_ocr.get("conf", [])
+
+ results_dict = page_line_level_ocr_results_with_words[
+ "results"
+ ]
+
+ # Determine a valid starting line number for synthetic [FACE] lines
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+
+ existing_keys = list(results_dict.keys())
+ person_index_start = len(existing_keys) + 1
+
+ for idx, text in enumerate(texts):
+ if text != "[FACE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = (
+ float(confs[idx]) if idx < len(confs) else 0.0
+ )
+ except Exception:
+ continue
+
+ key = f"person_line_{person_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[FACE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": "bedrock-vlm",
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM person detection failed on page {reported_page_number}: {e}"
+ )
+
+ # Optional additional Bedrock VLM pass to detect signatures
+ # and inject [SIGNATURE] entries into the word-level OCR structure for AWS Comprehend.
+ # Skipped when defer_inline_custom_vlm_detection_pass: post-pass covers PDF/decision table.
+ if (
+ not _skip_inline_custom_vlm_detection
+ and pii_identification_method == AWS_PII_OPTION
+ and "CUSTOM_VLM_SIGNATURE" in chosen_redact_comprehend_entities
+ and isinstance(page_line_level_ocr_results_with_words, dict)
+ and page_line_level_ocr_results_with_words.get("results")
+ and image is not None
+ ):
+ try:
+ _gui_tqdm_subphase(
+ pii_progress_bar,
+ progress,
+ "Signature detection (VLM, during PII)",
+ str(reported_page_number),
+ _pii_gui_total_str,
+ )
+ image_name = (
+ os.path.basename(image_path)
+ if isinstance(image_path, str)
+ else f"{file_name}_{reported_page_number}.png"
+ )
+
+ # Use Bedrock VLM for signature detection
+ model_choice = CLOUD_VLM_MODEL_CHOICE
+ normalised_coords_range = 999 # Full-page prompt uses 0-999 for all Bedrock models
+
+ sig_ocr_result = _bedrock_page_ocr_predict(
+ image,
+ image_name=image_name,
+ normalised_coords_range=normalised_coords_range,
+ output_folder=output_folder,
+ detect_signatures_only=True,
+ model_choice=model_choice,
+ bedrock_runtime=bedrock_runtime,
+ page_index_0=page_no,
+ )
+
+ # Unpack tuple: (Dict[str, List], int, int, str) -> (ocr_data, vlm_input_tokens, vlm_output_tokens, vlm_model_name)
+ if (
+ isinstance(sig_ocr_result, tuple)
+ and len(sig_ocr_result) == 4
+ ):
+ (
+ sig_ocr,
+ sig_vlm_input_tokens,
+ sig_vlm_output_tokens,
+ sig_vlm_model_name,
+ ) = sig_ocr_result
+ # Accumulate VLM token usage
+ vlm_total_input_tokens += sig_vlm_input_tokens
+ vlm_total_output_tokens += sig_vlm_output_tokens
+ if sig_vlm_model_name and not vlm_model_name:
+ vlm_model_name = sig_vlm_model_name
+ else:
+ sig_ocr = (
+ sig_ocr_result[0]
+ if isinstance(sig_ocr_result, tuple)
+ else sig_ocr_result
+ )
+
+ # Convert sig_ocr outputs into additional word-level entries
+ texts = sig_ocr.get("text", [])
+ lefts = sig_ocr.get("left", [])
+ tops = sig_ocr.get("top", [])
+ widths = sig_ocr.get("width", [])
+ heights = sig_ocr.get("height", [])
+ confs = sig_ocr.get("conf", [])
+
+ results_dict = page_line_level_ocr_results_with_words[
+ "results"
+ ]
+
+ # Determine a valid starting line number for synthetic [SIGNATURE] lines
+ existing_lines = []
+ for _line_key, _line_data in results_dict.items():
+ line_val = _line_data.get("line")
+ if isinstance(line_val, (int, float, str)):
+ try:
+ existing_lines.append(int(line_val))
+ except Exception:
+ continue
+ next_line_number = (
+ max(existing_lines) if existing_lines else 0
+ ) + 1
+
+ existing_keys = list(results_dict.keys())
+ sig_index_start = len(existing_keys) + 1
+
+ for idx, text in enumerate(texts):
+ if text != "[SIGNATURE]":
+ continue
+ try:
+ left = int(lefts[idx])
+ top = int(tops[idx])
+ width = int(widths[idx])
+ height = int(heights[idx])
+ conf = (
+ float(confs[idx]) if idx < len(confs) else 0.0
+ )
+ except Exception:
+ continue
+
+ key = f"signature_line_{sig_index_start + idx}"
+ bbox = (left, top, left + width, top + height)
+ results_dict[key] = {
+ "line": int(next_line_number),
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "words": [
+ {
+ "text": "[SIGNATURE]",
+ "bounding_box": bbox,
+ "conf": conf,
+ "model": "bedrock-vlm",
+ }
+ ],
+ "conf": conf,
+ }
+ next_line_number += 1
+ except Exception as e:
+ print(
+ f"Warning: Bedrock VLM signature detection failed on page {reported_page_number}: {e}"
+ )
+
+ # Call analyze_text for all PII detection methods (Local, AWS Comprehend, LLM, Inference Server)
+ # Use precomputed result when parallel PII was run (Local/AWS Comprehend).
+ if page_no in pii_results_by_page_image:
+ (
+ page_redaction_bounding_boxes,
+ comprehend_query_number_new,
+ llm_model_name_page,
+ llm_input_tokens_page,
+ llm_output_tokens_page,
+ ) = pii_results_by_page_image[page_no]
+ else:
+ (
+ page_redaction_bounding_boxes,
+ comprehend_query_number_new,
+ llm_model_name_page,
+ llm_input_tokens_page,
+ llm_output_tokens_page,
+ ) = image_analyser.analyze_text(
+ page_line_level_ocr_results["results"],
+ page_line_level_ocr_results_with_words["results"],
+ chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
+ chosen_llm_entities=chosen_llm_entities,
+ pii_identification_method=pii_identification_method,
+ comprehend_client=comprehend_client,
+ bedrock_runtime=bedrock_runtime,
+ custom_entities=chosen_redact_entities,
+ language=language,
+ allow_list=allow_list,
+ score_threshold=score_threshold,
+ nlp_analyser=nlp_analyser,
+ custom_llm_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=int(reported_page_number),
+ **text_analyzer_kwargs,
+ )
+
+ comprehend_query_number = (
+ comprehend_query_number + comprehend_query_number_new
+ )
+
+ # Accumulate LLM token usage across pages
+ llm_total_input_tokens += llm_input_tokens_page
+ llm_total_output_tokens += llm_output_tokens_page
+ if llm_model_name_page and not llm_model_name:
+ llm_model_name = llm_model_name_page
+
+ else:
+ page_redaction_bounding_boxes = list()
+
+ # Merge redaction bounding boxes that are close together
+ # This happens regardless of whether entities were chosen, as long as PII detection is enabled
+ page_merged_redaction_bboxes = merge_img_bboxes(
+ page_redaction_bounding_boxes,
+ page_line_level_ocr_results_with_words["results"],
+ page_signature_recogniser_results,
+ page_handwriting_recogniser_results,
+ handwrite_signature_checkbox,
+ )
+
+ else:
+ page_merged_redaction_bboxes = list()
+
+ if is_pdf(file_path) is True:
+ int_reported_page_number = int(reported_page_number)
+ redact_whole_page = (
+ int_reported_page_number in redact_whole_page_set_image
+ if redact_whole_page_set_image
+ else False
+ )
+
+ # Check if there are question answer boxes
+ if form_key_value_results_list:
+ page_merged_redaction_bboxes.extend(
+ convert_page_question_answer_to_custom_image_recognizer_results(
+ form_key_value_results_list,
+ page_sizes_df,
+ reported_page_number,
+ )
+ )
+
+ # When "Only extract text (no redaction)" but we have VLM (face/signature) boxes:
+ # add them to annotations and decision table without applying redactions to the PDF.
+ if (
+ pii_identification_method == NO_REDACTION_PII_OPTION
+ and page_merged_redaction_bboxes
+ ):
+ _vlm_boxes_only_no_redaction = True
+ all_image_annotations_boxes = list()
+ for box in page_merged_redaction_bboxes:
+ try:
+ img_annotation_box = {
+ "xmin": box.left,
+ "ymin": box.top,
+ "xmax": box.left + box.width,
+ "ymax": box.top + box.height,
+ "label": getattr(box, "entity_type", "Redaction"),
+ "text": getattr(box, "text", "") or "",
+ "color": CUSTOM_BOX_COLOUR,
+ }
+ all_image_annotations_boxes.append(
+ fill_missing_box_ids(img_annotation_box)
+ )
+ except AttributeError:
+ continue
+ page_image_annotations = {
+ "image": image_path,
+ "boxes": all_image_annotations_boxes,
+ }
+ existing_index = next(
+ (
+ index
+ for index, ann in enumerate(annotations_all_pages)
+ if ann.get("image") == page_image_annotations["image"]
+ ),
+ None,
+ )
+ if existing_index is not None:
+ annotations_all_pages[existing_index] = page_image_annotations
+ else:
+ annotations_all_pages.append(page_image_annotations)
+ decision_process_table = pd.DataFrame(
+ [
+ {
+ "text": result.text,
+ "xmin": result.left,
+ "ymin": result.top,
+ "xmax": result.left + result.width,
+ "ymax": result.top + result.height,
+ "label": result.entity_type,
+ "start": result.start,
+ "end": result.end,
+ "score": result.score,
+ "page": reported_page_number,
+ }
+ for result in page_merged_redaction_bboxes
+ ]
+ )
+ if not decision_process_table.empty:
+ all_pages_decision_process_list.extend(
+ decision_process_table.to_dict("records")
+ )
+ else:
+ # 3. Draw the merged boxes
+ ## Apply annotations to pdf with pymupdf (pass dimensions to skip .loc in redact_page_with_pymupdf)
+ redact_result = redact_page_with_pymupdf(
+ pymupdf_page,
+ page_merged_redaction_bboxes,
+ image_path,
+ redact_whole_page=redact_whole_page,
+ original_cropbox=original_cropbox,
+ page_sizes_df=page_sizes_df,
+ input_folder=input_folder,
+ image_dimensions_override={
+ "image_width": page_width,
+ "image_height": page_height,
+ },
+ )
+
+ # Handle dual page objects if returned
+ if isinstance(redact_result[0], tuple):
+ (
+ pymupdf_page,
+ pymupdf_applied_redaction_page,
+ ), page_image_annotations = redact_result
+ # Store the final page with its original page number for later use
+ if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
+ redact_image_pdf._applied_redaction_pages = list()
+ redact_image_pdf._applied_redaction_pages.append(
+ (pymupdf_applied_redaction_page, page_no)
+ )
+ else:
+ pymupdf_page, page_image_annotations = redact_result
+ # When dual output is requested but this page had no redaction boxes,
+ # we still need an "applied" page entry so the final PDF replace loop
+ # replaces every page. Otherwise only pages with at least one redaction
+ # get replaced, and other pages stay as review-style (annotations only).
+ if RETURN_PDF_FOR_REVIEW and RETURN_REDACTED_PDF:
+ if not hasattr(
+ redact_image_pdf, "_applied_redaction_pages"
+ ):
+ redact_image_pdf._applied_redaction_pages = list()
+ applied_doc = pymupdf.open()
+ applied_doc.insert_pdf(
+ pymupdf_page.parent,
+ from_page=page_no,
+ to_page=page_no,
+ )
+ applied_page_copy = applied_doc[0]
+ redact_image_pdf._applied_redaction_pages.append(
+ (applied_page_copy, page_no)
+ )
+
+ # If an image_path file, draw onto the image_path
+ elif is_pdf(file_path) is False:
+ if isinstance(image_path, str):
+ # Normalise and validate path safety before checking existence
+ normalized_path = os.path.normpath(os.path.abspath(image_path))
+
+ # Check if it's a Gradio temporary file
+ is_gradio_temp = (
+ "gradio" in normalized_path.lower()
+ and "temp" in normalized_path.lower()
+ )
+
+ if is_gradio_temp or validate_path_containment(
+ normalized_path, INPUT_FOLDER
+ ):
+ image = Image.open(normalized_path)
+ else:
+ print(f"Path validation failed for: {normalized_path}")
+ # You might want to handle this case differently
+ continue # or raise an exception
+ elif isinstance(image_path, Image.Image):
+ image = image_path
+ else:
+ # Assume image_path is an image
+ image = image_path
+
+ fill = CUSTOM_BOX_COLOUR # Fill colour for redactions
+ draw = ImageDraw.Draw(image)
+
+ all_image_annotations_boxes = list()
+
+ for box in page_merged_redaction_bboxes:
+
+                        try:
+                            x0 = box.left
+                            y0 = box.top
+                            x1 = x0 + box.width
+                            y1 = y0 + box.height
+                        except AttributeError as e:
+                            print(f"Error accessing box coordinates: {e}")
+                            continue  # Skip this box if its coordinates are unavailable
+
+                        # Fall back to defaults if the label or text attributes are missing
+                        label = getattr(box, "entity_type", "Redaction")
+                        text = getattr(box, "text", "") or ""
+
+                        # Check if coordinates are valid numbers
+                        if any(v is None for v in [x0, y0, x1, y1]):
+                            print(f"Invalid coordinates for box: {box}")
+                            continue  # Skip this box if coordinates are invalid
+
+ img_annotation_box = {
+ "xmin": x0,
+ "ymin": y0,
+ "xmax": x1,
+ "ymax": y1,
+ "label": label,
+ "color": CUSTOM_BOX_COLOUR,
+ "text": text,
+ }
+ img_annotation_box = fill_missing_box_ids(img_annotation_box)
+
+ # Directly append the dictionary with the required keys
+ all_image_annotations_boxes.append(img_annotation_box)
+
+ # Draw the rectangle
+ try:
+ draw.rectangle([x0, y0, x1, y1], fill=fill)
+ except Exception as e:
+ print(f"Error drawing rectangle: {e}")
+
+ # Update redacted_image with the redacted version for image files
+ redacted_image = image
+
+ page_image_annotations = {
+ "image": image_path,
+ "boxes": all_image_annotations_boxes,
+ }
+
+ # Convert decision process to table (skip when already done in NO_REDACTION VLM-only branch)
+ if not _vlm_boxes_only_no_redaction:
+ decision_process_table = pd.DataFrame(
+ [
+ {
+ "text": result.text,
+ "xmin": result.left,
+ "ymin": result.top,
+ "xmax": result.left + result.width,
+ "ymax": result.top + result.height,
+ "label": result.entity_type,
+ "start": result.start,
+ "end": result.end,
+ "score": result.score,
+ "page": reported_page_number,
+ }
+ for result in page_merged_redaction_bboxes
+ ]
+ )
+
+ # all_pages_decision_process_list.append(decision_process_table.to_dict('records'))
+
+ if not decision_process_table.empty: # Ensure there are records to add
+ all_pages_decision_process_list.extend(
+ decision_process_table.to_dict("records")
+ )
+
+ decision_process_table = fill_missing_ids(decision_process_table)
+
+ toc = time.perf_counter()
+
+ time_taken = toc - tic
+
+ # Break if time taken is greater than max_time seconds
+ if time_taken > max_time:
+                print(f"Processing exceeded {max_time} seconds, breaking loop.")
+ page_break_return = True
+ progress.close(_tqdm=pii_progress_bar)
+ tqdm._instances.clear()
+
+ if is_pdf(file_path) is False:
+ # Ensure redacted_image is set before appending (timeout case)
+ if redacted_image is None:
+ # Fallback: try to use image_path or image from page_data
+ if isinstance(image_path, str):
+ try:
+ normalized_path = os.path.normpath(
+ os.path.abspath(image_path)
+ )
+ is_gradio_temp = (
+ "gradio" in normalized_path.lower()
+ and "temp" in normalized_path.lower()
+ )
+ if is_gradio_temp or validate_path_containment(
+ normalized_path, input_folder
+ ):
+ redacted_image = Image.open(normalized_path)
+ else:
+ redacted_image = (
+ image if image is not None else image_path
+ )
+ except Exception as e:
+ print(
+ f"Error loading image for redacted_image timeout fallback: {e}"
+ )
+ redacted_image = (
+ image if image is not None else image_path
+ )
+ elif isinstance(image_path, Image.Image):
+ redacted_image = image_path
+ else:
+ redacted_image = image if image is not None else image_path
+
+ if redacted_image is not None:
+ pdf_image_file_paths.append(
+ redacted_image
+ ) # .append(image_path)
+ pymupdf_doc = pdf_image_file_paths
+ else:
+ print(
+ f"Warning: redacted_image is None for image file {file_path} in timeout case, skipping append"
+ )
+
+ # Check if the image_path already exists in annotations_all_pages
+ existing_index = next(
+ (
+ index
+ for index, ann in enumerate(annotations_all_pages)
+ if ann["image"] == page_image_annotations["image"]
+ ),
+ None,
+ )
+ if existing_index is not None:
+ # Replace the existing annotation
+ annotations_all_pages[existing_index] = page_image_annotations
+ else:
+ # Append new annotation if it doesn't exist
+ annotations_all_pages.append(page_image_annotations)
+
+ # Save word level options
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ if original_textract_data != textract_data:
+ # Write the updated existing textract data back to the JSON file
+ secure_file_write(
+ output_folder,
+ file_name + textract_suffix + "_textract.json",
+ json.dumps(textract_data, separators=(",", ":")),
+ )
+
+ if textract_json_file_path not in log_files_output_paths:
+ log_files_output_paths.append(textract_json_file_path)
+
+ all_pages_decision_process_table = pd.DataFrame(
+ all_pages_decision_process_list
+ )
+
+ all_line_level_ocr_results_df = pd.DataFrame(
+ all_line_level_ocr_results_list
+ )
+ if selection_element_results_list:
+ selection_element_results_list_df = pd.DataFrame(
+ selection_element_results_list
+ )
+ if form_key_value_results_list:
+ form_key_value_results_list_df = (
+ convert_question_answer_to_dataframe(
+ form_key_value_results_list, page_sizes_df
+ )
+ )
+
+ current_loop_page += 1
+
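+                # Early exit on timeout: return the results accumulated so far
+                # (page_break_return has been set to True above).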
+ return (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ log_files_output_paths,
+ textract_request_metadata,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ all_line_level_ocr_results_df,
+ comprehend_query_number,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ selection_element_results_list_df,
+ form_key_value_results_list_df,
+ out_file_paths,
+ )
+
+ # If it's an image file
+ if is_pdf(file_path) is False:
+ # Ensure redacted_image is set before appending
+ if redacted_image is None:
+ # Fallback: try to use image_path or image from page_data
+ if isinstance(image_path, str):
+ try:
+ normalized_path = os.path.normpath(os.path.abspath(image_path))
+ is_gradio_temp = (
+ "gradio" in normalized_path.lower()
+ and "temp" in normalized_path.lower()
+ )
+ if is_gradio_temp or validate_path_containment(
+ normalized_path, input_folder
+ ):
+ redacted_image = Image.open(normalized_path)
+ else:
+ redacted_image = image if image is not None else image_path
+ except Exception as e:
+ print(
+ f"Error loading image for redacted_image final fallback: {e}"
+ )
+ redacted_image = image if image is not None else image_path
+ elif isinstance(image_path, Image.Image):
+ redacted_image = image_path
+ else:
+ redacted_image = image if image is not None else image_path
+
+ if redacted_image is not None:
+ pdf_image_file_paths.append(redacted_image) # .append(image_path)
+ pymupdf_doc = pdf_image_file_paths
+ else:
+ print(
+ f"Warning: redacted_image is None for image file {file_path}, skipping append"
+ )
+
+ # Check if the image_path already exists in annotations_all_pages
+ existing_index = next(
+ (
+ index
+ for index, ann in enumerate(annotations_all_pages)
+ if ann["image"] == page_image_annotations["image"]
+ ),
+ None,
+ )
+ if existing_index is not None:
+ # Replace the existing annotation
+ annotations_all_pages[existing_index] = page_image_annotations
+ else:
+ # Append new annotation if it doesn't exist
+ annotations_all_pages.append(page_image_annotations)
+
+ current_loop_page += 1
+
+ # Break if new page is a multiple of chosen page_break_val
+ if current_loop_page % page_break_val == 0:
+ print(
+ f"current_loop_page: {current_loop_page} is a multiple of page_break_val: {page_break_val}, breaking loop"
+ )
+ page_break_return = True
+ progress.close(_tqdm=pii_progress_bar)
+ tqdm._instances.clear()
+
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ # Write the updated existing textract data back to the JSON file
+ if original_textract_data != textract_data:
+ secure_file_write(
+ output_folder,
+ file_name + textract_suffix + "_textract.json",
+ json.dumps(textract_data, separators=(",", ":")),
+ )
+
+ if textract_json_file_path not in log_files_output_paths:
+ log_files_output_paths.append(textract_json_file_path)
+
+ if text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION:
+ if (
+ original_all_page_line_level_ocr_results_with_words
+ != all_page_line_level_ocr_results_with_words
+ ):
+ # Write the updated existing local OCR data back to the JSON file
+ with open(
+ all_page_line_level_ocr_results_with_words_json_file_path, "w"
+ ) as json_file:
+ json.dump(
+ all_page_line_level_ocr_results_with_words,
+ json_file,
+ separators=(",", ":"),
+ ) # compact separators keep the output JSON small
+
+ if (
+ all_page_line_level_ocr_results_with_words_json_file_path
+ not in log_files_output_paths
+ ):
+ log_files_output_paths.append(
+ all_page_line_level_ocr_results_with_words_json_file_path
+ )
+
+ all_pages_decision_process_table = pd.DataFrame(
+ all_pages_decision_process_list
+ )
+
+ all_line_level_ocr_results_df = pd.DataFrame(
+ all_line_level_ocr_results_list
+ )
+
+ if selection_element_results_list:
+ selection_element_results_list_df = pd.DataFrame(
+ selection_element_results_list
+ )
+ if form_key_value_results_list:
+ form_key_value_results_list_df = convert_question_answer_to_dataframe(
+ form_key_value_results_list, page_sizes_df
+ )
+
+ return (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ log_files_output_paths,
+ textract_request_metadata,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ all_line_level_ocr_results_df,
+ comprehend_query_number,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ selection_element_results_list_df,
+ form_key_value_results_list_df,
+ out_file_paths,
+ )
+
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ # Write the updated existing textract data back to the JSON file
+
+ if overwrite_existing_ocr_results or original_textract_data != textract_data:
+ secure_file_write(
+ output_folder,
+ file_name + textract_suffix + "_textract.json",
+ json.dumps(textract_data, separators=(",", ":")),
+ )
+
+ if textract_json_file_path not in log_files_output_paths:
+ log_files_output_paths.append(textract_json_file_path)
+
+ if text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION:
+ # print(
+ # f"Writing updated existing local OCR data back to the JSON file: {all_page_line_level_ocr_results_with_words_json_file_path}"
+ # )
+ if (
+ overwrite_existing_ocr_results
+ or original_all_page_line_level_ocr_results_with_words
+ != all_page_line_level_ocr_results_with_words
+ ):
+ # Write the updated existing local OCR data back to the JSON file
+ with open(
+ all_page_line_level_ocr_results_with_words_json_file_path, "w"
+ ) as json_file:
+ json.dump(
+ all_page_line_level_ocr_results_with_words,
+ json_file,
+ separators=(",", ":"),
+ ) # compact separators keep the output JSON small
+
+ if (
+ all_page_line_level_ocr_results_with_words_json_file_path
+ not in log_files_output_paths
+ ):
+ log_files_output_paths.append(
+ all_page_line_level_ocr_results_with_words_json_file_path
+ )
+
+ # Build outputs as DataFrames (lists can contain either DataFrames or dict rows).
+ # Some code paths append per-page DataFrames; others append dict records.
+ if all_pages_decision_process_list:
+ _dp_dfs = [
+ x for x in all_pages_decision_process_list if isinstance(x, pd.DataFrame)
+ ]
+ _dp_rows = [x for x in all_pages_decision_process_list if isinstance(x, dict)]
+ if _dp_dfs and _dp_rows:
+ all_pages_decision_process_table = pd.concat(
+ _dp_dfs + [pd.DataFrame(_dp_rows)], ignore_index=True
+ )
+ elif _dp_dfs:
+ all_pages_decision_process_table = pd.concat(_dp_dfs, ignore_index=True)
+ else:
+ all_pages_decision_process_table = pd.DataFrame(_dp_rows)
+ else:
+ all_pages_decision_process_table = pd.DataFrame()
+
+ if all_line_level_ocr_results_list:
+ _ocr_dfs = [
+ x for x in all_line_level_ocr_results_list if isinstance(x, pd.DataFrame)
+ ]
+ _ocr_rows = [x for x in all_line_level_ocr_results_list if isinstance(x, dict)]
+ if _ocr_dfs and _ocr_rows:
+ all_line_level_ocr_results_df = pd.concat(
+ _ocr_dfs + [pd.DataFrame(_ocr_rows)], ignore_index=True
+ )
+ elif _ocr_dfs:
+ all_line_level_ocr_results_df = pd.concat(_ocr_dfs, ignore_index=True)
+ else:
+ all_line_level_ocr_results_df = pd.DataFrame(_ocr_rows)
+ else:
+ all_line_level_ocr_results_df = pd.DataFrame(
+ columns=["page", "text", "left", "top", "width", "height", "line", "conf"]
+ )
+
+ # Convert decision table and ocr results to relative coordinates
+ _pages_pdf_pts = set(pages_in_pdf_points) if pages_in_pdf_points else None
+ all_pages_decision_process_table = divide_coordinates_by_page_sizes(
+ all_pages_decision_process_table,
+ page_sizes_df,
+ xmin="xmin",
+ xmax="xmax",
+ ymin="ymin",
+ ymax="ymax",
+ pages_in_pdf_points=_pages_pdf_pts,
+ )
+
+ all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(
+ all_line_level_ocr_results_df,
+ page_sizes_df,
+ xmin="left",
+ xmax="width",
+ ymin="top",
+ ymax="height",
+ )
+
+ if selection_element_results_list:
+ selection_element_results_list_df = pd.DataFrame(selection_element_results_list)
+ if form_key_value_results_list:
+ form_key_value_results_list_df = convert_question_answer_to_dataframe(
+ form_key_value_results_list, page_sizes_df
+ )
+
+ return (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ log_files_output_paths,
+ textract_request_metadata,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ all_line_level_ocr_results_df,
+ comprehend_query_number,
+ all_page_line_level_ocr_results,
+ all_page_line_level_ocr_results_with_words,
+ selection_element_results_list_df,
+ form_key_value_results_list_df,
+ out_file_paths,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ vlm_model_name,
+ vlm_total_input_tokens,
+ vlm_total_output_tokens,
+ )
+
+
+###
+# PIKEPDF TEXT DETECTION/REDACTION
+###
+
+
+def get_text_container_characters(text_container: LTTextContainer):
+ """Return the individual LTChar/LTAnno objects from a pdfminer text container.
+
+ Characters are collected from every LTTextLine/LTTextLineHorizontal in the
+ container; anything other than an LTTextContainer yields an empty list.
+ """
+ if isinstance(text_container, LTTextContainer):
+ characters = [
+ char
+ for line in text_container
+ if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
+ for char in line
+ ]
+
+ return characters
+ return []
+
+
+def create_line_level_ocr_results_from_characters(
+ char_objects: List, line_number: int
+) -> Tuple[List[OCRResult], List[List]]:
+ """
+ Create line-level OCRResult objects from a list of pdfminer LTChar/LTAnno objects.
+
+ Newline characters carried by LTAnno objects mark line breaks. Returns a tuple of
+ (OCRResult objects, one per line; lists of the character objects that make up each line).
+ """
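+ # Illustrative example (values assumed): characters spelling "Dear Sir\nHello" yield
+ # two OCRResult lines ("Dear Sir" and "Hello"), each with left/top/width/height, plus
+ # the two lists of pdfminer character objects that formed them.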
+ line_level_results_out = list()
+ line_level_characters_out = list()
+ character_objects_out = list()
+
+ full_text = ""
+ # [x0, y0, x1, y1]
+ overall_bbox = [float("inf"), float("inf"), float("-inf"), float("-inf")]
+
+ for char in char_objects:
+ character_objects_out.append(char)
+
+ if isinstance(char, LTAnno):
+ added_text = char.get_text()
+ full_text += added_text
+
+ if "\n" in added_text:
+ if full_text.strip():
+ # Create OCRResult for line
+ line_level_results_out.append(
+ OCRResult(
+ text=full_text.strip(),
+ left=round(overall_bbox[0], 2),
+ top=round(overall_bbox[1], 2),
+ width=round(overall_bbox[2] - overall_bbox[0], 2),
+ height=round(overall_bbox[3] - overall_bbox[1], 2),
+ line=line_number,
+ )
+ )
+ line_level_characters_out.append(character_objects_out)
+
+ # Reset for the next line
+ character_objects_out = list()
+ full_text = ""
+ overall_bbox = [
+ float("inf"),
+ float("inf"),
+ float("-inf"),
+ float("-inf"),
+ ]
+ line_number += 1
+ continue
+
+ # This part handles LTChar objects
+ added_text = clean_unicode_text(
+ char.get_text(), preserve_international_scripts=True
+ )
+ full_text += added_text
+
+ x0, y0, x1, y1 = char.bbox
+ overall_bbox[0] = min(overall_bbox[0], x0)
+ overall_bbox[1] = min(overall_bbox[1], y0)
+ overall_bbox[2] = max(overall_bbox[2], x1)
+ overall_bbox[3] = max(overall_bbox[3], y1)
+
+ # Process the last line
+ if full_text.strip():
+ line_number += 1
+ line_ocr_result = OCRResult(
+ text=full_text.strip(),
+ left=round(overall_bbox[0], 2),
+ top=round(overall_bbox[1], 2),
+ width=round(overall_bbox[2] - overall_bbox[0], 2),
+ height=round(overall_bbox[3] - overall_bbox[1], 2),
+ line=line_number,
+ )
+ line_level_results_out.append(line_ocr_result)
+ line_level_characters_out.append(character_objects_out)
+
+ return line_level_results_out, line_level_characters_out
+
+
+def generate_words_for_line(line_chars: List) -> List[Dict[str, Any]]:
+ """
+ Generates word-level results for a single, pre-defined line of characters.
+
+ This robust version correctly identifies word breaks by:
+ 1. Treating specific punctuation characters as standalone words.
+ 2. Explicitly using space characters (' ') as a primary word separator.
+ 3. Using a geometric gap between characters as a secondary, heuristic separator.
+
+ Args:
+ line_chars: A list of pdfminer.six LTChar/LTAnno objects for one line.
+
+ Returns:
+ A list of dictionaries, where each dictionary represents an individual word.
+ """
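+ # Illustrative example (values assumed): characters spelling "Dr. Smith" produce three
+ # words, "Dr", "." and "Smith"; "high-tech" stays one word because the hyphen is not split.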
+ # We only care about characters with coordinates and text for word building.
+ text_chars = [c for c in line_chars if hasattr(c, "bbox") and c.get_text()]
+
+ if not text_chars:
+ return []
+
+ # Sort characters by horizontal position for correct processing.
+ text_chars.sort(key=lambda c: c.bbox[0])
+
+ # Punctuation that should be split into separate words.
+ # The hyphen '-' is intentionally excluded to keep words like 'high-tech' together.
+ PUNCTUATION_TO_SPLIT = {".", ",", "?", "!", ":", ";", "(", ")", "[", "]", "{", "}"}
+
+ line_words = list()
+ current_word_text = ""
+ current_word_bbox = [float("inf"), float("inf"), -1, -1] # [x0, y0, x1, y1]
+ prev_char = None
+
+ def finalize_word():
+ nonlocal current_word_text, current_word_bbox
+ # Only add the word if it contains non-space text
+ if current_word_text.strip():
+ # Safeguard: avoid emitting y=-1 (e.g. single-char word edge case)
+ if current_word_bbox[1] == -1 and current_word_bbox[3] != -1:
+ current_word_bbox[1] = current_word_bbox[3]
+ # bbox from [x0, y0, x1, y1] to your required format
+ final_bbox = [
+ round(current_word_bbox[0], 2),
+ round(current_word_bbox[3], 2), # Note: using y1 from pdfminer bbox
+ round(current_word_bbox[2], 2),
+ round(current_word_bbox[1], 2), # Note: using y0 from pdfminer bbox
+ ]
+ line_words.append(
+ {
+ "text": current_word_text.strip(),
+ "bounding_box": final_bbox,
+ "conf": 100.0,
+ }
+ )
+ # Reset for the next word
+ current_word_text = ""
+ current_word_bbox = [float("inf"), float("inf"), -1, -1]
+
+ for char in text_chars:
+ char_text = clean_unicode_text(
+ char.get_text(), preserve_international_scripts=True
+ )
+
+ # 1. Check for splitting punctuation first.
+ if char_text in PUNCTUATION_TO_SPLIT:
+ # Finalize any word that came immediately before the punctuation.
+ finalize_word()
+
+ # Treat the punctuation itself as a separate word.
+ px0, py0, px1, py1 = char.bbox
+ punc_bbox = [round(px0, 2), round(py1, 2), round(px1, 2), round(py0, 2)]
+ line_words.append(
+ {"text": char_text, "bounding_box": punc_bbox, "conf": 100.0}
+ )
+
+ prev_char = char
+ continue # Skip to the next character
+
+ # 2. Primary Signal: Is the character a space?
+ if char_text.isspace():
+ finalize_word() # End the preceding word
+ prev_char = char
+ continue # Skip to the next character, do not add the space to any word
+
+ # 3. Secondary Signal: Is there a large geometric gap?
+ if prev_char:
+ # A gap is considered a word break if it's larger than a fraction of the font size.
+ space_threshold = prev_char.size * 0.25 # 25% of the char size
+ min_gap = 1.0 # Or at least 1.0 unit
+ gap = (
+ char.bbox[0] - prev_char.bbox[2]
+ ) # gap = current_char.x0 - prev_char.x1
+
+ if gap > max(space_threshold, min_gap):
+ finalize_word() # Found a gap, so end the previous word.
+
+ # Append the character's text and update the bounding box for the current word
+ current_word_text += char_text
+
+ x0, y0, x1, y1 = char.bbox
+ # First character of this word: set bbox from character (avoids -1 for single-char words)
+ if current_word_bbox[1] == -1 or current_word_bbox[3] == -1:
+ current_word_bbox = [x0, y0, x1, y1]
+ else:
+ current_word_bbox[0] = min(current_word_bbox[0], x0)
+ current_word_bbox[1] = min(
+ current_word_bbox[1], y0
+ ) # pdfminer y0 is bottom
+ current_word_bbox[2] = max(current_word_bbox[2], x1)
+ current_word_bbox[3] = max(current_word_bbox[3], y1) # pdfminer y1 is top
+
+ prev_char = char
+
+ # After the loop, finalize the last word that was being built.
+ finalize_word()
+
+ return line_words
+
+
+def process_page_to_structured_ocr(
+ all_char_objects: List,
+ page_number: int,
+ text_line_number: int, # This will now be treated as the STARTING line number
+) -> Tuple[Dict[str, Any], List[OCRResult], List[List]]:
+ """
+ Orchestrates the OCR process, correctly handling multiple lines.
+
+ Returns:
+ A tuple containing:
+ 1. A dictionary with detailed line/word results for the page.
+ 2. A list of the complete OCRResult objects for each line.
+ 3. A list of lists, containing the character objects for each line.
+ """
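+ # Shape of the returned page_data dict (values illustrative):
+ # {"page": "3", "results": {"text_line_1": {"line": 1, "text": "Hello",
+ # "bounding_box": [x0, y0, x1, y1], "words": [...], "conf": 100.0}}}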
+ page_data = {"page": str(page_number), "results": {}}
+
+ # Step 1: Get definitive lines and their character groups.
+ # This function correctly returns all lines found in the input characters.
+ line_results, lines_char_groups = create_line_level_ocr_results_from_characters(
+ all_char_objects, text_line_number
+ )
+
+ if not line_results:
+ return {}, [], []
+
+ # Step 2: Iterate through each found line and generate its words.
+ for i, (line_info, char_group) in enumerate(zip(line_results, lines_char_groups)):
+
+ current_line_number = line_info.line # text_line_number + i
+
+ word_level_results = generate_words_for_line(char_group)
+
+ # Create a unique, incrementing line number for each iteration.
+
+ line_key = f"text_line_{current_line_number}"
+
+ line_bbox = [
+ line_info.left,
+ line_info.top,
+ line_info.left + line_info.width,
+ line_info.top + line_info.height,
+ ]
+
+ # Now, each line is added to the dictionary with its own unique key.
+ page_data["results"][line_key] = {
+ "line": current_line_number, # Use the unique line number
+ "text": line_info.text,
+ "bounding_box": line_bbox,
+ "words": word_level_results,
+ "conf": 100.0,
+ }
+
+ # The list of OCRResult objects is already correct.
+ line_level_ocr_results_list = line_results
+
+ # Return the structured dictionary, the list of OCRResult objects, and the character groups
+ return page_data, line_level_ocr_results_list, lines_char_groups
+
+
+def _extract_text_from_single_page(file_path: str, page_no: int) -> Tuple[
+ int,
+ List,
+ List,
+ pd.DataFrame,
+ List[Dict[str, Any]],
+]:
+ """
+ Extract selectable text and structured OCR-style results for a single PDF page.
+ Used by redact_text_pdf for parallel text extraction (ThreadPoolExecutor).
+ Only reads file_path; safe to call from multiple threads.
+
+ Returns:
+ Tuple of (page_no, all_page_line_level_text_extraction_results_list,
+ all_page_line_text_extraction_characters, page_text_ocr_outputs,
+ page_ocr_results_with_words).
+ """
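+ # Sketch of intended use (names assumed), one call per page from a thread pool:
+ # with ThreadPoolExecutor(max_workers=4) as pool:
+ # results = list(pool.map(lambda p: _extract_text_from_single_page(path, p), page_nos))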
+ reported_page_number = page_no + 1
+ all_page_line_level_text_extraction_results_list = list()
+ all_page_line_text_extraction_characters = list()
+ page_ocr_results_with_words = list()
+ page_text_ocr_outputs_list = list()
+
+ empty_ocr_df = pd.DataFrame(
+ columns=[
+ "page",
+ "text",
+ "left",
+ "top",
+ "width",
+ "height",
+ "line",
+ "conf",
+ ]
+ )
+
+ text_line_no = 1
+ for page_layout in extract_pages(file_path, page_numbers=[page_no], maxpages=1):
+ for text_container in page_layout:
+ characters = list()
+ if isinstance(text_container, LTTextContainer) or isinstance(
+ text_container, LTAnno
+ ):
+ characters = get_text_container_characters(text_container)
+
+ (
+ line_level_ocr_results_with_words,
+ line_level_text_results_list,
+ line_characters,
+ ) = process_page_to_structured_ocr(
+ characters,
+ page_number=int(reported_page_number),
+ text_line_number=text_line_no,
+ )
+
+ text_line_no += len(line_level_text_results_list)
+
+ if line_level_text_results_list:
+ line_level_text_results_df = pd.DataFrame(
+ [
+ {
+ "page": page_no + 1,
+ "text": (result.text).strip(),
+ "left": result.left,
+ "top": result.top,
+ "width": result.width,
+ "height": result.height,
+ "line": result.line,
+ "conf": 100.0,
+ }
+ for result in line_level_text_results_list
+ ]
+ )
+ page_text_ocr_outputs_list.append(line_level_text_results_df)
+
+ all_page_line_level_text_extraction_results_list.extend(
+ line_level_text_results_list
+ )
+ all_page_line_text_extraction_characters.extend(line_characters)
+ page_ocr_results_with_words.append(line_level_ocr_results_with_words)
+
+ if page_text_ocr_outputs_list:
+ non_empty = [df for df in page_text_ocr_outputs_list if not df.empty]
+ page_text_ocr_outputs = (
+ pd.concat(non_empty, ignore_index=True) if non_empty else empty_ocr_df
+ )
+ else:
+ page_text_ocr_outputs = empty_ocr_df.copy()
+
+ return (
+ page_no,
+ all_page_line_level_text_extraction_results_list,
+ all_page_line_text_extraction_characters,
+ page_text_ocr_outputs,
+ page_ocr_results_with_words,
+ )
+
+
+# Punctuation that should be split into separate words (same as generate_words_for_line).
+_PUNCTUATION_TO_SPLIT = {".", ",", "?", "!", ":", ";", "(", ")", "[", "]", "{", "}"}
+
+
+def _words_from_line_chars_pymupdf(
+ line_chars: List[Dict[str, Any]], off_x: float, off_y: float
+) -> List[Dict[str, Any]]:
+ """
+ Build word-level results from PyMuPDF line characters, splitting punctuation
+ into separate words (same behaviour as generate_words_for_line).
+
+ Args:
+ line_chars: List of dicts with "text", "bbox" (x0,y0,x1,y1), "size".
+ off_x, off_y: Page offset to add to bounding boxes.
+
+ Returns:
+ List of {"text", "boundingBox": [x0,y0,x1,y1], "conf": 100.0}.
+ """
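+ # Illustrative example (values assumed): a char {"text": "A", "bbox": [10, 20, 15, 30], "size": 11}
+ # with off_x=5, off_y=0 starts a word whose boundingBox begins at x0=15.0; punctuation such
+ # as "." becomes its own single-character word.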
+ if not line_chars:
+ return []
+ # Sort by horizontal position
+ sorted_chars = sorted(line_chars, key=lambda c: c["bbox"][0])
+ line_words: List[Dict[str, Any]] = []
+ current_word_text = ""
+ current_word_bbox: List[float] = [float("inf"), float("inf"), -1, -1]
+ prev_char: Optional[Dict[str, Any]] = None
+
+ def finalize_word() -> None:
+ nonlocal current_word_text, current_word_bbox
+ if not current_word_text.strip():
+ current_word_text = ""
+ current_word_bbox = [float("inf"), float("inf"), -1, -1]
+ return
+ if current_word_bbox[1] == -1 and current_word_bbox[3] != -1:
+ current_word_bbox[1] = current_word_bbox[3]
+ if current_word_bbox[3] == -1 and current_word_bbox[1] != -1:
+ current_word_bbox[3] = current_word_bbox[1]
+ x0, y0, x1, y1 = current_word_bbox
+ line_words.append(
+ {
+ "text": current_word_text.strip(),
+ "boundingBox": [
+ round(x0 + off_x, 2),
+ round(y0 + off_y, 2),
+ round(x1 + off_x, 2),
+ round(y1 + off_y, 2),
+ ],
+ "conf": 100.0,
+ }
+ )
+ current_word_text = ""
+ current_word_bbox = [float("inf"), float("inf"), -1, -1]
+
+ for char in sorted_chars:
+ char_text = clean_unicode_text(
+ char["text"], preserve_international_scripts=True
+ )
+ bbox = char["bbox"]
+ x0, y0, x1, y1 = bbox
+
+ if char_text in _PUNCTUATION_TO_SPLIT:
+ finalize_word()
+ line_words.append(
+ {
+ "text": char_text,
+ "boundingBox": [
+ round(x0 + off_x, 2),
+ round(y0 + off_y, 2),
+ round(x1 + off_x, 2),
+ round(y1 + off_y, 2),
+ ],
+ "conf": 100.0,
+ }
+ )
+ prev_char = char
+ continue
+
+ if char_text.isspace():
+ finalize_word()
+ prev_char = char
+ continue
+
+ if prev_char is not None:
+ space_threshold = prev_char.get("size", 12.0) * 0.25
+ min_gap = 1.0
+ gap = x0 - prev_char["bbox"][2]
+ if gap > max(space_threshold, min_gap):
+ finalize_word()
+
+ current_word_text += char_text
+ if current_word_bbox[1] == -1 or current_word_bbox[3] == -1:
+ current_word_bbox = [x0, y0, x1, y1]
+ else:
+ current_word_bbox[0] = min(current_word_bbox[0], x0)
+ current_word_bbox[1] = min(current_word_bbox[1], y0)
+ current_word_bbox[2] = max(current_word_bbox[2], x1)
+ current_word_bbox[3] = max(current_word_bbox[3], y1)
+ prev_char = char
+
+ finalize_word()
+ return line_words
+
+
+def process_page_to_structured_ocr_pymupdf(
+ page: pymupdf.Page, page_number: int, start_line_number: int
+) -> Tuple[Dict[str, Any], List[OCRResult], List[List[Dict]]]:
+
+ # 1. Extract once (no page.get_text("words") needed; words built from chars)
+ raw_dict = page.get_text("rawdict")
+
+ # Coordinates from get_text() are in CropBox-local space (0,0 = CropBox top-left).
+ # Compute the offset needed to convert to MediaBox-local space, which is what the
+ # downstream coordinate division (divide_coordinates_by_page_sizes) and the
+ # image_annotator display expect. When CropBox == MediaBox the offset is 0.
+ off_x = (
+ page.cropbox.x0 - page.mediabox.x0
+ ) # = -page.mediabox.x0 (cropbox.x0 is always 0)
+ off_y = page.cropbox.y0 - page.mediabox.y0 # = -page.mediabox.y0
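+ # Worked example (illustrative): if mediabox.x0 = -9 and cropbox.x0 = 0, then
+ # off_x = 0 - (-9) = 9, shifting every extracted x coordinate right by 9 points.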
+
+ page_data = {"page": str(page_number), "results": {}}
+ line_results = []
+ lines_char_groups = []
+ valid_line_count = 0
+
+ for block in raw_dict["blocks"]:
+ if "lines" not in block:
+ continue
+
+ for line in block["lines"]:
+ # Join text and filter blanks
+ line_text = "".join(
+ ["".join([c["c"] for c in s["chars"]]) for s in line["spans"]]
+ )
+ if not line_text.strip():
+ continue
+
+ current_line_idx = start_line_number + valid_line_count
+ lx0, ly0, lx1, ly1 = line["bbox"]
+
+ # 2. Collect Character Objects in MediaBox-local space (same as lines/words).
+ # Raw get_text("rawdict") char bboxes are CropBox-local; merge_text_bounding_boxes
+ # uses these dicts — without the offset, merged redaction boxes stay CropBox-local
+ # while convert_redaction_boxes_pymupdf_to_pdf assumes MediaBox-local coords.
+ line_chars = []
+ for span in line["spans"]:
+ for char in span["chars"]:
+ cx0, cy0, cx1, cy1 = char["bbox"]
+ line_chars.append(
+ {
+ "text": char["c"],
+ "bbox": [
+ cx0 + off_x,
+ cy0 + off_y,
+ cx1 + off_x,
+ cy1 + off_y,
+ ],
+ "size": span["size"],
+ }
+ )
+
+ # 3. Create OCRResult (offset applied to position only; width/height are distances)
+ line_obj = OCRResult(
+ text=line_text.strip(),
+ left=round(lx0 + off_x, 2), # Position + Offset
+ top=round(ly0 + off_y, 2), # Position + Offset
+ width=round(lx1 - lx0, 2), # Distance (No offset!)
+ height=round(ly1 - ly0, 2), # Distance (No offset!)
+ line=current_line_idx,
+ conf=100.0,
+ model="PyMuPDF",
+ )
+
+ # 4. Build words from line chars with punctuation split into separate words
+ # (char bboxes already include off_x/off_y)
+ word_level_results = _words_from_line_chars_pymupdf(line_chars, 0.0, 0.0)
+
+ # 5. Build final dictionary
+ line_key = f"text_line_{current_line_idx}"
+ page_data["results"][line_key] = {
+ "line": current_line_idx,
+ "text": line_text.strip(),
+ "bounding_box": [
+ round(lx0 + off_x, 2),
+ round(ly0 + off_y, 2),
+ round(lx1 + off_x, 2),
+ round(ly1 + off_y, 2),
+ ],
+ "words": word_level_results,
+ "conf": 100.0,
+ }
+
+ line_results.append(line_obj)
+ lines_char_groups.append(line_chars)
+ valid_line_count += 1
+
+ return page_data, line_results, lines_char_groups
+
+
+def _extract_text_from_single_page_pymupdf(pdf_bytes: bytes, page_no: int) -> Tuple[
+ int,
+ List[Any], # OCRResult objects
+ List[List[Dict]], # Character groups per line
+ pd.DataFrame, # Structured DataFrame
+ List[Dict[str, Any]], # Page OCR results with words
+]:
+ """
+ Optimized version using PyMuPDF.
+ Maintains the same return signature for parallel processing.
+ """
+ reported_page_number = page_no + 1
+
+ # Initialize containers
+ all_page_line_results = [] # List of OCRResult objects
+ all_page_char_groups = [] # List of List of char dicts
+ page_ocr_results_with_words = [] # List of page_data dicts
+
+ # 1. Open the document inside the function for thread safety
+ doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
+ page = doc[page_no]
+
+ # 2. Call our helper to get structured data
+ # We pass 1 as the starting line number for this page
+ page_data, line_results, char_groups = process_page_to_structured_ocr_pymupdf(
+ page=page, page_number=reported_page_number, start_line_number=1
+ )
+
+ # 3. Collect the return objects to match the pdfminer-based structure
+ all_page_line_results.extend(line_results)
+ all_page_char_groups.extend(char_groups)
+
+ # The pdfminer path returns a list of these dicts (one per text container);
+ # PyMuPDF processes the page as a whole, so wrap the single dict in a list.
+ page_ocr_results_with_words.append(page_data)
+
+ # 4. Create the DataFrame
+ if line_results:
+ page_text_ocr_outputs = pd.DataFrame(
+ [
+ {
+ "page": reported_page_number,
+ "text": result.text.strip(),
+ "left": result.left,
+ "top": result.top,
+ "width": result.width,
+ "height": result.height,
+ "line": result.line,
+ "conf": 100.0,
+ }
+ for result in line_results
+ ]
+ )
+ else:
+ page_text_ocr_outputs = pd.DataFrame(
+ columns=["page", "text", "left", "top", "width", "height", "line", "conf"]
+ )
+
+ doc.close() # Always close for memory efficiency
+
+ return (
+ page_no,
+ all_page_line_results,
+ all_page_char_groups,
+ page_text_ocr_outputs,
+ page_ocr_results_with_words,
+ )
+
+
+def page_has_extractable_text(
+ file_path: str, page_no: int, min_words: int = EFFICIENT_OCR_MIN_WORDS
+) -> bool:
+ """
+ Determine if a single PDF page has enough selectable text to use the text-only
+ route (using pdfminer). Used by EFFICIENT_OCR to decide whether to use
+ redact_text_pdf or redact_image_pdf for that page.
+
+ Prefer get_pages_extractable_text_single_pass() when checking many pages:
+ it opens the PDF once instead of once per page.
+
+ Args:
+ file_path: Path to the PDF file.
+ page_no: 0-based page index.
+ min_words: Minimum number of words required on the page to use text-only
+ route; below this the page will use OCR. Defaults to EFFICIENT_OCR_MIN_WORDS.
+
+ Returns:
+ True if the page yields at least min_words of extractable text;
+ False otherwise (page will use OCR).
+ """
+ try:
+ for _, has_text in get_pages_extractable_text_single_pass(
+ file_path, [page_no], min_words
+ ):
+ return has_text
+ return False
+ except Exception:
+ return False
+
+
+def get_pages_extractable_text_single_pass(
+ file_path: str,
+ page_numbers_0based: List[int],
+ min_words: int = EFFICIENT_OCR_MIN_WORDS,
+):
+ """
+ Determine which pages have enough selectable text in a single PDF pass.
+ Opens the file once and iterates over the given pages, avoiding N separate
+ open/parse cycles. Use this instead of calling page_has_extractable_text
+ in a loop or thread pool for the EFFICIENT_OCR initial check.
+
+ Args:
+ file_path: Path to the PDF file.
+ page_numbers_0based: 0-based page indices to check.
+ min_words: Minimum words required to use text-only route.
+
+ Yields:
+ (page_no, has_enough_text) for each page in order.
+ """
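+ # Illustrative usage for the EFFICIENT_OCR pre-check (names assumed):
+ # text_pages = [p for p, ok in get_pages_extractable_text_single_pass(path, all_page_nos) if ok]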
+ if not page_numbers_0based:
+ return
+ try:
+ # Single extract_pages call: one file open for all requested pages
+ for page_no, page_layout in zip(
+ page_numbers_0based,
+ extract_pages(
+ file_path,
+ page_numbers=page_numbers_0based,
+ maxpages=len(page_numbers_0based),
+ ),
+ ):
+ word_count = 0
+ for text_container in page_layout:
+ if not isinstance(text_container, LTTextContainer):
+ continue
+ characters = get_text_container_characters(text_container)
+ if not characters:
+ continue
+ raw_text = "".join(
+ c.get_text() for c in characters if hasattr(c, "get_text")
+ )
+ if raw_text.strip():
+ word_count += len([w for w in raw_text.split() if w.strip()])
+ if word_count >= min_words:
+ break
+ yield (page_no, word_count >= min_words)
+ except Exception:
+ for p in page_numbers_0based:
+ yield (p, False)
+
+
+def create_text_redaction_process_results(
+ analyser_results, analysed_bounding_boxes, page_num
+):
+ """Summarise analysed bounding boxes into a decision-process DataFrame with
+ xmin/ymin/xmax/ymax, label, start, end, score and a 1-based page column."""
+ decision_process_table = pd.DataFrame()
+
+ if len(analyser_results) > 0 and len(analysed_bounding_boxes) > 0:
+ # Create summary df of annotations to be made
+ analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
+
+ # Support both camelCase (merge_text_bounding_boxes) and snake_case
+ bbox_col = (
+ "boundingBox"
+ if "boundingBox" in analysed_bounding_boxes_df_new.columns
+ else "bounding_box"
+ )
+ if bbox_col not in analysed_bounding_boxes_df_new.columns:
+ return decision_process_table
+
+ # Split the bounding box list into four separate columns
+ analysed_bounding_boxes_df_new[["xmin", "ymin", "xmax", "ymax"]] = (
+ analysed_bounding_boxes_df_new[bbox_col].apply(pd.Series)
+ )
+
+ # Convert the new columns to integers (if needed)
+ # analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
+
+ analysed_bounding_boxes_df_text = (
+ analysed_bounding_boxes_df_new["result"]
+ .astype(str)
+ .str.split(",", expand=True)
+ .replace(".*: ", "", regex=True)
+ )
+ analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
+ analysed_bounding_boxes_df_new = pd.concat(
+ [analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis=1
+ )
+ analysed_bounding_boxes_df_new["page"] = page_num + 1
+
+ decision_process_table = pd.concat(
+ [decision_process_table, analysed_bounding_boxes_df_new], axis=0
+ ).drop("result", axis=1)
+
+ return decision_process_table
+
+
+def _build_one_pikepdf_redaction_annotation(analysed_bounding_box: dict):
+ """Build a single pikepdf redaction annotation; safe to run in a thread."""
+ bounding_box = analysed_bounding_box["boundingBox"]
+ return Dictionary(
+ Type=Name.Annot,
+ Subtype=Name.Square,
+ QuadPoints=[
+ round(bounding_box[0], 2),
+ round(bounding_box[3], 2),
+ round(bounding_box[2], 2),
+ round(bounding_box[3], 2),
+ round(bounding_box[0], 2),
+ round(bounding_box[1], 2),
+ round(bounding_box[2], 2),
+ round(bounding_box[1], 2),
+ ],
+ Rect=[
+ round(bounding_box[0], 2),
+ round(bounding_box[1], 2),
+ round(bounding_box[2], 2),
+ round(bounding_box[3], 2),
+ ],
+ C=list(CUSTOM_BOX_COLOUR),
+ IC=[0, 0, 0],
+ CA=1,
+ T=analysed_bounding_box["result"].entity_type,
+ Contents=analysed_bounding_box["text"],
+ BS=Dictionary(W=0, S=Name.S),
+ )
+
+
+def convert_redaction_boxes_pymupdf_to_pdf(
+ analysed_bounding_boxes: List[dict], page_height: float
+) -> List[dict]:
+ """
+ Convert redaction box coordinates from PyMuPDF (top-left origin, y down)
+ to PDF Rect convention (bottom-left origin, y up). Does not mutate inputs.
+ """
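+ # Worked example: with page_height = 842, a PyMuPDF box [100, 50, 200, 70]
+ # (y measured down from the top) becomes [100, 772, 200, 792] in PDF space
+ # (y measured up from the bottom).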
+ if not analysed_bounding_boxes or page_height <= 0:
+ return list(analysed_bounding_boxes) if analysed_bounding_boxes else []
+ out = []
+ for box in analysed_bounding_boxes:
+ box = copy.deepcopy(box)
+ bbox = box.get("boundingBox") or box.get("bounding_box")
+ if not bbox or len(bbox) < 4:
+ out.append(box)
+ continue
+ left, y0, right, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
+ # PyMuPDF: y0 = top (smaller), y1 = bottom (larger). PDF: bottom < top (y up).
+ pdf_y_bottom = page_height - y1
+ pdf_y_top = page_height - y0
+ box["boundingBox"] = [left, pdf_y_bottom, right, pdf_y_top]
+ if "bounding_box" in box:
+ box["bounding_box"] = box["boundingBox"]
+ out.append(box)
+ return out
+
+
+def _run_image_pii_for_one_page(
+ page_no: int,
+ page_data: dict,
+ image_analyser,
+ chosen_redact_comprehend_entities: List[str],
+ chosen_llm_entities: List[str],
+ pii_identification_method: str,
+ comprehend_client,
+ bedrock_runtime,
+ chosen_redact_entities: List[str],
+ language: str,
+ allow_list: List[str],
+ score_threshold: float,
+ nlp_analyser,
+ custom_llm_instructions: str,
+ file_name: str,
+ text_analyzer_kwargs: dict,
+) -> Tuple[int, List, int, str, int, int]:
+ """
+ Run image_analyser.analyze_text for a single page (used for parallel Local/AWS Comprehend in redact_image_pdf).
+ Returns (page_no, page_redaction_bounding_boxes, comprehend_query_number_new, llm_model_name, llm_input_tokens, llm_output_tokens).
+ """
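+ # The early returns below use the sentinel (page_no, [], 0, "", 0, 0) to signal
+ # "no usable OCR results for this page, so nothing to analyse or redact".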
+ page_line_level_ocr_results = page_data.get("page_line_level_ocr_results")
+ page_line_level_ocr_results_with_words = page_data.get(
+ "page_line_level_ocr_results_with_words"
+ )
+ reported_page_number = page_data.get("reported_page_number", str(page_no + 1))
+ if (
+ not page_line_level_ocr_results
+ or not isinstance(page_line_level_ocr_results, dict)
+ or "results" not in page_line_level_ocr_results
+ ):
+ return (page_no, list(), 0, "", 0, 0)
+ if not page_line_level_ocr_results_with_words:
+ return (page_no, list(), 0, "", 0, 0)
+ if isinstance(page_line_level_ocr_results_with_words, list):
+ if not page_line_level_ocr_results_with_words:
+ return (page_no, list(), 0, "", 0, 0)
+ ocr_with_words = page_line_level_ocr_results_with_words[0]
+ else:
+ ocr_with_words = page_line_level_ocr_results_with_words
+ if not isinstance(ocr_with_words, dict) or "results" not in ocr_with_words:
+ return (page_no, list(), 0, "", 0, 0)
+ (
+ page_redaction_bounding_boxes,
+ comprehend_query_number_new,
+ llm_model_name_page,
+ llm_input_tokens_page,
+ llm_output_tokens_page,
+ ) = image_analyser.analyze_text(
+ page_line_level_ocr_results["results"],
+ ocr_with_words["results"],
+ chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
+ chosen_llm_entities=chosen_llm_entities,
+ pii_identification_method=pii_identification_method,
+ comprehend_client=comprehend_client,
+ bedrock_runtime=bedrock_runtime,
+ custom_entities=chosen_redact_entities,
+ language=language,
+ allow_list=allow_list,
+ score_threshold=score_threshold,
+ nlp_analyser=nlp_analyser,
+ custom_llm_instructions=custom_llm_instructions,
+ file_name=file_name,
+ page_number=int(reported_page_number),
+ **text_analyzer_kwargs,
+ )
+ return (
+ page_no,
+ page_redaction_bounding_boxes,
+ comprehend_query_number_new,
+ llm_model_name_page,
+ llm_input_tokens_page,
+ llm_output_tokens_page,
+ )
+
+
+def _run_pii_for_one_page(
+ extraction_result: Tuple,
+ language: str,
+ chosen_redact_entities: List[str],
+ chosen_redact_comprehend_entities: List[str],
+ allow_list: List[str],
+ pii_identification_method: str,
+ nlp_analyser,
+ score_threshold: float,
+ custom_entities: List[str],
+ comprehend_client,
+ comprehend_query_number: int,
+ bedrock_runtime,
+ model_choice: str,
+ custom_llm_instructions: str,
+ chosen_llm_entities: List[str],
+ output_folder: str,
+ file_name: str,
+) -> Tuple[int, List, str, int, int]:
+ """
+ Run PII identification for a single page (used for parallel Local/AWS Comprehend).
+ Returns (page_no, page_redaction_bounding_boxes, llm_model_name, llm_input_tokens, llm_output_tokens).
+ """
+ (
+ page_no,
+ line_level_text_results_list,
+ line_characters,
+ _page_text_ocr_outputs,
+ _page_ocr_results_with_words,
+ ) = extraction_result
+ reported_page_number = page_no + 1
+ page_analyser_results = list()
+ page_redaction_bounding_boxes = list()
+ (
+ page_redaction_bounding_boxes,
+ llm_model_name_page,
+ llm_input_tokens_page,
+ llm_output_tokens_page,
+ ) = run_page_text_redaction(
+ language,
+ chosen_redact_entities,
+ chosen_redact_comprehend_entities,
+ line_level_text_results_list,
+ line_characters,
+ page_analyser_results,
+ page_redaction_bounding_boxes,
+ comprehend_client,
+ allow_list,
+ pii_identification_method,
+ nlp_analyser,
+ score_threshold,
+ custom_entities,
+ comprehend_query_number,
+ bedrock_runtime=bedrock_runtime,
+ model_choice=model_choice,
+ custom_llm_instructions=custom_llm_instructions,
+ chosen_llm_entities=chosen_llm_entities,
+ output_folder=output_folder,
+ file_name=file_name,
+ page_number=reported_page_number,
+ )
+ return (
+ page_no,
+ page_redaction_bounding_boxes,
+ llm_model_name_page,
+ llm_input_tokens_page,
+ llm_output_tokens_page,
+ )
+
+
+def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
+ """Build pikepdf redaction annotations for each bounding box in parallel (bounded by MAX_WORKERS)."""
+ if not analysed_bounding_boxes:
+ return []
+ n = len(analysed_bounding_boxes)
+ max_workers = min(MAX_WORKERS, n)
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ pikepdf_redaction_annotations_on_page = list(
+ executor.map(
+ _build_one_pikepdf_redaction_annotation, analysed_bounding_boxes
+ )
+ )
+ return pikepdf_redaction_annotations_on_page
+
+
+def redact_text_pdf(
+ file_path: str, # Path to the PDF file to be redacted
+ language: str, # Language of the PDF content
+ chosen_redact_entities: List[str], # List of entities to be redacted
+ chosen_redact_comprehend_entities: List[str],
+ allow_list: List[str] = None, # Optional list of allowed entities
+ page_min: int = 0, # Minimum page number to start redaction
+ page_max: int = 0, # Maximum page number to end redaction
+ current_loop_page: int = 0, # Current page being processed in the loop
+ page_break_return: bool = False, # Flag to indicate if a page break should be returned
+ annotations_all_pages: List[dict] = list(), # List of annotations across all pages
+ all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(
+ columns=["page", "text", "left", "top", "width", "height", "line", "conf"]
+ ), # DataFrame for OCR results
+ all_pages_decision_process_table: pd.DataFrame = pd.DataFrame(
+ columns=[
+ "image_path",
+ "page",
+ "label",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "text",
+ "id",
+ ]
+ ), # DataFrame for decision process table
+ pymupdf_doc: List = list(), # List of PyMuPDF documents
+ all_page_line_level_ocr_results_with_words: List = list(),
+ pii_identification_method: str = "Local",
+ comprehend_query_number: int = 0,
+ comprehend_client="",
+ in_deny_list: List[str] = list(),
+ redact_whole_page_list: List[str] = list(),
+ max_fuzzy_spelling_mistakes_num: int = 1,
+ match_fuzzy_whole_phrase_bool: bool = True,
+ page_sizes_df: pd.DataFrame = pd.DataFrame(),
+ original_cropboxes: List[dict] = list(),
+ text_extraction_only: bool = False,
+ output_folder: str = OUTPUT_FOLDER,
+ input_folder: str = INPUT_FOLDER,
+ page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
+ max_time: int = int(MAX_TIME_VALUE),
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
+ progress: Progress = Progress(track_tqdm=True), # Progress tracking object
+ bedrock_runtime=None,
+ model_choice: str = CLOUD_LLM_PII_MODEL_CHOICE,
+ custom_llm_instructions: str = "",
+ chosen_llm_entities: List[str] = None,
+ efficient_ocr: bool = EFFICIENT_OCR,
+ pages_to_process: Optional[List[int]] = None,
+ efficient_ocr_extraction_pass: bool = False,
+ pre_extracted_results: Optional[List[Tuple]] = None,
+):
+ """
+ Redact chosen entities from a PDF that is made up of multiple pages that are not images.
+
+ Input Variables:
+ - file_path: Path to the PDF file to be redacted
+ - language: Language of the PDF content, also used for AWS Comprehend calls
+ - chosen_redact_entities: List of entities to be redacted
+ - chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
+ - allow_list: Optional list of allowed entities
+ - page_min: Minimum page number to start redaction
+ - page_max: Maximum page number to end redaction
+ - current_loop_page: Current page being processed in the loop
+ - page_break_return: Flag to indicate if a page break should be returned
+ - annotations_all_pages: List of annotations across all pages
+ - all_line_level_ocr_results_df: DataFrame for OCR results
+ - all_pages_decision_process_table: DataFrame for decision process table
+ - pymupdf_doc: List of PyMuPDF documents
+ - all_page_line_level_ocr_results_with_words: Line-level OCR results with word detail for each page
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+ - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
+ - in_deny_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
+ - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
+ - page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
+ - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
+ - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
+ - output_folder (str, optional): The output folder for the function
+ - input_folder (str, optional): The folder for file inputs.
+ - page_break_val: Value for page break
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
+ - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
+ - progress: Progress tracking object
+ - bedrock_runtime (optional): A boto3 Bedrock runtime client, used when an LLM-based PII method is selected.
+ - model_choice (str, optional): The LLM model to use for LLM-based PII detection.
+ - custom_llm_instructions (str, optional): Additional instructions passed to the LLM for PII detection.
+ - chosen_llm_entities (List[str], optional): Entities to detect when using an LLM-based PII method.
+ - efficient_ocr (bool, optional): Whether to use efficient OCR. Defaults to EFFICIENT_OCR.
+ - pages_to_process (List[int], optional): 1-based page numbers to process; when provided, only these pages are iterated.
+ - efficient_ocr_extraction_pass (bool, optional): When True, the per-page extraction results are also returned for reuse in a later pass.
+ - pre_extracted_results (List[Tuple], optional): Previously extracted per-page results to reuse instead of re-extracting text.
+ """
+
+ # Initialize LLM token tracking variables
+ llm_total_input_tokens = 0
+ llm_total_output_tokens = 0
+ llm_model_name = ""
+
+ tic = time.perf_counter()
+
+ if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
+ all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
+
+ if isinstance(all_pages_decision_process_table, pd.DataFrame):
+ # Convert decision outputs to list of dataframes:
+ all_pages_decision_process_list = [all_pages_decision_process_table]
+
+ if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
+ out_message = "Connection to AWS Comprehend service not found."
+ raise Exception(out_message)
+
+ # Try updating the supported languages for the spacy analyser
+ try:
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
+ # Check list of nlp_analyser recognisers and languages
+ if language != "en":
+ gr.Info(
+ f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}"
+ )
+
+ except Exception as e:
+ print(f"Error creating nlp_analyser for {language}: {e}")
+ raise Exception(f"Error creating nlp_analyser for {language}: {e}")
+
+ # Update custom word list analyser object with any new words that have been added to the custom deny list
+ if in_deny_list:
+ nlp_analyser.registry.remove_recognizer("CUSTOM")
+ new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+
+ nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(
+ supported_entities=["CUSTOM_FUZZY"],
+ custom_list=in_deny_list,
+ spelling_mistakes_max=max_fuzzy_spelling_mistakes_num,
+ search_whole_phrase=match_fuzzy_whole_phrase_bool,
+ )
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
+
+ # Open with Pikepdf to get text lines
+ pikepdf_pdf = Pdf.open(file_path)
+ number_of_pages = len(pikepdf_pdf.pages)
+
+ file_name = get_file_name_without_type(file_path)
+
+ if not all_page_line_level_ocr_results_with_words:
+ all_page_line_level_ocr_results_with_words = list()
+
+ # Check that page_min and page_max are within expected ranges
+ if page_max > number_of_pages or page_max == 0:
+ page_max = number_of_pages
+
+ if page_min <= 0:
+ page_min = 0
+ else:
+ page_min = page_min - 1
+
+ ###
+ if current_loop_page == 0:
+ page_loop_start = page_min
+ else:
+ page_loop_start = current_loop_page
+
+ page_loop_end = page_max
+
+ # When pages_to_process is provided (e.g. from efficient_ocr), iterate only over those pages (1-based list).
+ if pages_to_process is not None:
+ page_loop_pages = sorted([p - 1 for p in pages_to_process]) # 0-indexed, sorted
+ else:
+ page_loop_pages = None
+
+ print("Page range: ", str(page_loop_start + 1), "to", str(page_loop_end))
+
+ # First pass: parallel text extraction (read-only per page, thread-safe).
+ # When pre_extracted_results is provided (e.g. EFFICIENT_OCR second pass), reuse them.
+ if page_loop_pages is not None:
+ pages_to_iterate = list(page_loop_pages)
+ else:
+ pages_to_iterate = list(range(page_loop_start, page_loop_end))
+
+ if pre_extracted_results is not None and pages_to_process is not None:
+ pages_set = set(p - 1 for p in pages_to_process)
+ extraction_results = [r for r in pre_extracted_results if r[0] in pages_set]
+ extraction_results.sort(key=lambda r: r[0])
+ else:
+ max_workers = min(MAX_WORKERS, len(pages_to_iterate))
+
+ with open(file_path, "rb") as f:
+ pdf_buffer = f.read()
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ extraction_results = list(
+ tqdm(
+ executor.map(
+ lambda p: _extract_text_from_single_page_pymupdf(pdf_buffer, p),
+ pages_to_iterate,
+ ),
+ total=len(pages_to_iterate),
+ unit="pages",
+ desc="Extracting text (simple text extraction)",
+ )
+ )
+ extraction_results.sort(key=lambda r: r[0])
+
+ # Optional: run PII detection in parallel for Local and AWS Comprehend (bounded by MAX_WORKERS).
+ # IMPORTANT: never parallelize PII when using inference-server or local-transformers LLM backends.
+ pii_results_by_page = {}
+ if (
+ pii_identification_method in (LOCAL_PII_OPTION, AWS_PII_OPTION)
+ and pii_identification_method
+ not in (INFERENCE_SERVER_PII_OPTION, LOCAL_TRANSFORMERS_LLM_PII_OPTION)
+ and not text_extraction_only
+ and (chosen_redact_entities or chosen_redact_comprehend_entities)
+ and extraction_results
+ ):
+ num_pages = len(extraction_results)
+ max_workers = min(MAX_WORKERS, num_pages)
+ progress(
+ 0.45,
+ desc=f"Detecting PII in parallel ({pii_identification_method}, {num_pages} pages)",
+ )
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ pii_results_list = list(
+ tqdm(
+ executor.map(
+ lambda ext: _run_pii_for_one_page(
+ ext,
+ language,
+ chosen_redact_entities,
+ chosen_redact_comprehend_entities,
+ allow_list,
+ pii_identification_method,
+ nlp_analyser,
+ score_threshold,
+ custom_entities,
+ comprehend_client,
+ comprehend_query_number,
+ bedrock_runtime,
+ model_choice,
+ custom_llm_instructions,
+ chosen_llm_entities,
+ output_folder,
+ file_name,
+ ),
+ extraction_results,
+ ),
+ total=num_pages,
+ unit="pages",
+ desc="Detecting PII (parallel)",
+ )
+ )
+ for (
+ page_no,
+ page_redaction_bounding_boxes,
+ llm_name,
+ llm_in,
+ llm_out,
+ ) in pii_results_list:
+ pii_results_by_page[page_no] = (
+ page_redaction_bounding_boxes,
+ llm_name,
+ llm_in,
+ llm_out,
+ )
+
+ # Precompute per-page lookups so the loop does O(1) access instead of repeated DataFrame .loc
+ # (Same approach as redaction_review: pass image_dimensions_override to skip per-call .loc in redact_page_with_pymupdf.)
+ image_path_by_page = {}
+ page_to_image_dimensions = {}
+ if not page_sizes_df.empty and "page" in page_sizes_df.columns:
+ if "image_path" in page_sizes_df.columns:
+ for _page_one_based, _row in page_sizes_df.set_index("page")[
+ "image_path"
+ ].items():
+ try:
+ image_path_by_page[int(_page_one_based)] = _row
+ except (TypeError, ValueError):
+ pass
+ if (
+ "image_width" in page_sizes_df.columns
+ and "image_height" in page_sizes_df.columns
+ ):
+ sub = page_sizes_df[
+ ["page", "image_width", "image_height"]
+ ].drop_duplicates("page")
+ for _, row in sub.iterrows():
+ p = row["page"]
+ if pd.notna(p):
+ w, h = row["image_width"], row["image_height"]
+ if pd.notna(w) and pd.notna(h):
+ try:
+ page_to_image_dimensions[int(p)] = {
+ "image_width": float(w),
+ "image_height": float(h),
+ }
+ except (TypeError, ValueError):
+ pass
+ redact_whole_page_set = set()
+ if redact_whole_page_list:
+ for _p in redact_whole_page_list:
+ try:
+ redact_whole_page_set.add(int(_p))
+ except (TypeError, ValueError):
+ redact_whole_page_set.add(_p)
+
+ progress_bar_redact = tqdm(
+ extraction_results,
+ unit="pages",
+ desc=(
+ "Applying redactions to pages"
+ if pii_results_by_page
+ else (
+ "Extracting text (efficient OCR word-count pass)"
+ if text_extraction_only
+ else "Detecting PII (following simple text extraction)"
+ )
+ ),
+ )
+
+ for extraction_result in progress_bar_redact:
+ (
+ page_no,
+ all_page_line_level_text_extraction_results_list,
+ all_page_line_text_extraction_characters,
+ page_text_ocr_outputs,
+ page_ocr_results_with_words,
+ ) = extraction_result
+ reported_page_number = str(page_no + 1)
+ # Create annotations for every page, even if blank.
+
+ # Image path: use precomputed lookup to avoid O(n) .loc per page
+ image_path = image_path_by_page.get(int(reported_page_number), "")
+ if image_path == "" or (isinstance(image_path, float) and pd.isna(image_path)):
+ image_path = ""
+
+ # EFFICIENT_OCR: use placeholder for text-only pages so annotations match page_sizes
+ # and we don't create real images (placeholders have mediabox for coordinate division).
+ if pages_to_process and (
+ not image_path
+ or "placeholder_image" in str(image_path)
+ or "image_placeholder" in str(image_path)
+ ):
+ image_path = f"placeholder_image_{page_no}.png"
+
+ page_image_annotations = {"image": image_path, "boxes": []} # image
+
+ pymupdf_page = pymupdf_doc.load_page(page_no)
+ pymupdf_page.set_cropbox(pymupdf_page.mediabox) # Set CropBox to MediaBox
+
+ page_analyser_results = list()
+ page_redaction_bounding_boxes = list()
+ page_decision_process_table = pd.DataFrame(
+ columns=[
+ "image_path",
+ "page",
+ "label",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "text",
+ "id",
+ ]
+ )
+ pikepdf_redaction_annotations_on_page = list()
+ all_page_line_level_ocr_results_with_words.extend(page_ocr_results_with_words)
+
+ ### REDACTION
+
+ if (
+ not text_extraction_only
+ and pii_identification_method != NO_REDACTION_PII_OPTION
+ ):
+ if chosen_redact_entities or chosen_redact_comprehend_entities:
+ if page_no in pii_results_by_page:
+ (
+ page_redaction_bounding_boxes,
+ llm_model_name_page,
+ llm_input_tokens_page,
+ llm_output_tokens_page,
+ ) = pii_results_by_page[page_no]
+ else:
+ (
+ page_redaction_bounding_boxes,
+ llm_model_name_page,
+ llm_input_tokens_page,
+ llm_output_tokens_page,
+ ) = run_page_text_redaction(
+ language,
+ chosen_redact_entities,
+ chosen_redact_comprehend_entities,
+ all_page_line_level_text_extraction_results_list,
+ all_page_line_text_extraction_characters,
+ page_analyser_results,
+ page_redaction_bounding_boxes,
+ comprehend_client,
+ allow_list,
+ pii_identification_method,
+ nlp_analyser,
+ score_threshold,
+ custom_entities,
+ comprehend_query_number,
+ bedrock_runtime=bedrock_runtime,
+ model_choice=model_choice,
+ custom_llm_instructions=custom_llm_instructions,
+ chosen_llm_entities=chosen_llm_entities,
+ output_folder=output_folder,
+ file_name=file_name,
+ page_number=int(reported_page_number),
+ )
+
+ if (
+ not page_redaction_bounding_boxes
+ and pii_identification_method == AWS_PII_OPTION
+ and (chosen_redact_comprehend_entities or chosen_redact_entities)
+ ):
+ # print(
+ # f"EFFICIENT_OCR (text path): No PII bounding boxes for page {reported_page_number}. "
+ # "Check that the page has text and that selected entity types match Comprehend output."
+ # )
+ pass
+
+ # Accumulate LLM token usage across pages
+ llm_total_input_tokens += llm_input_tokens_page
+ llm_total_output_tokens += llm_output_tokens_page
+ if llm_model_name_page and not llm_model_name:
+ llm_model_name = llm_model_name_page
+
+ # Annotate redactions on page (convert PyMuPDF top-left coords to PDF bottom-left for Rect)
+ page_height = pymupdf_page.mediabox.height
+ boxes_for_pdf = convert_redaction_boxes_pymupdf_to_pdf(
+ page_redaction_bounding_boxes, page_height
+ )
+ pikepdf_redaction_annotations_on_page = (
+ create_pikepdf_annotations_for_bounding_boxes(boxes_for_pdf)
+ )
+
+ else:
+ pikepdf_redaction_annotations_on_page = list()
+
+ # Make pymupdf page redactions (use set for O(1) membership)
+ int_reported_page_number = int(reported_page_number)
+ redact_whole_page = (
+ int_reported_page_number in redact_whole_page_set
+ if redact_whole_page_set
+ else False
+ )
+
+ # page.cropbox is always reported by PyMuPDF in MediaBox-local coordinates,
+ # so the stored original_cropbox is already in the correct coordinate system
+ # for set_cropbox_safely after set_cropbox(mediabox) has been applied.
+ # No offset transformation is needed here.
+ _orig_cb = original_cropboxes[page_no]
+
+ redact_result = redact_page_with_pymupdf(
+ pymupdf_page,
+ pikepdf_redaction_annotations_on_page,
+ image_path,
+ redact_whole_page=redact_whole_page,
+ convert_pikepdf_to_pymupdf_coords=True,
+ original_cropbox=_orig_cb,
+ page_sizes_df=page_sizes_df,
+ input_folder=input_folder,
+ image_dimensions_override=page_to_image_dimensions.get(
+ int(reported_page_number)
+ ),
+ )
+
+ # Handle dual page objects if returned
+ if isinstance(redact_result[0], tuple):
+ (
+ pymupdf_page,
+ pymupdf_applied_redaction_page,
+ ), page_image_annotations = redact_result
+ # Store the final page with its original page number for later use
+ if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
+ redact_text_pdf._applied_redaction_pages = list()
+ redact_text_pdf._applied_redaction_pages.append(
+ (pymupdf_applied_redaction_page, page_no)
+ )
+ else:
+ pymupdf_page, page_image_annotations = redact_result
+ # When dual output is requested but this page had no redaction boxes,
+ # we still need an "applied" page entry so the final PDF replace loop
+ # replaces every page (same fix as in redact_image_pdf).
+ if RETURN_PDF_FOR_REVIEW and RETURN_REDACTED_PDF:
+ if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
+ redact_text_pdf._applied_redaction_pages = list()
+ applied_doc = pymupdf.open()
+ applied_doc.insert_pdf(
+ pymupdf_page.parent,
+ from_page=page_no,
+ to_page=page_no,
+ )
+ applied_page_copy = applied_doc[0]
+ redact_text_pdf._applied_redaction_pages.append(
+ (applied_page_copy, page_no)
+ )
+
+ # Create decision process table
+ page_decision_process_table = create_text_redaction_process_results(
+ page_analyser_results,
+ page_redaction_bounding_boxes,
+ page_no,
+ )
+
+ if not page_decision_process_table.empty:
+ all_pages_decision_process_list.append(page_decision_process_table)
+
+ # Join extracted text outputs for all lines together
+ if not page_text_ocr_outputs.empty:
+ page_text_ocr_outputs = page_text_ocr_outputs.sort_values(
+ ["line"]
+ ).reset_index(drop=True)
+ page_text_ocr_outputs = page_text_ocr_outputs.loc[
+ :,
+ [
+ "page",
+ "text",
+ "left",
+ "top",
+ "width",
+ "height",
+ "line",
+ "conf",
+ ],
+ ]
+ all_line_level_ocr_results_list.append(page_text_ocr_outputs)
+
+ toc = time.perf_counter()
+
+ time_taken = toc - tic
+
+ # Break if time taken is greater than max_time seconds
+ if time_taken > max_time:
+ print("Processing for", max_time, "seconds, breaking.")
+ page_break_return = True
+ progress.close(_tqdm=progress_bar_redact)
+ tqdm._instances.clear()
+
+ # Check if the image already exists in annotations_all_pages
+ existing_index = next(
+ (
+ index
+ for index, ann in enumerate(annotations_all_pages)
+ if ann["image"] == page_image_annotations["image"]
+ ),
+ None,
+ )
+ if existing_index is not None:
+ # Replace the existing annotation
+ annotations_all_pages[existing_index] = page_image_annotations
+ else:
+ # Append new annotation if it doesn't exist
+ annotations_all_pages.append(page_image_annotations)
+
+ # Write logs
+ # Filter out empty DataFrames before concatenation to avoid FutureWarning
+ non_empty_decision_process = [
+ df for df in all_pages_decision_process_list if not df.empty
+ ]
+ if non_empty_decision_process:
+ all_pages_decision_process_table = pd.concat(
+ non_empty_decision_process, ignore_index=True
+ )
+ else:
+ all_pages_decision_process_table = pd.DataFrame(
+ columns=[
+ "text",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "label",
+ "start",
+ "end",
+ "score",
+ "page",
+ "id",
+ ]
+ )
+
+ non_empty_ocr_results = [
+ df for df in all_line_level_ocr_results_list if not df.empty
+ ]
+ if non_empty_ocr_results:
+ all_line_level_ocr_results_df = pd.concat(
+ non_empty_ocr_results, ignore_index=True
+ )
+ else:
+ all_line_level_ocr_results_df = pd.DataFrame(
+ columns=[
+ "page",
+ "text",
+ "left",
+ "top",
+ "width",
+ "height",
+ "line",
+ "conf",
+ ]
+ )
+
+ current_loop_page += 1
+
+ early_result = (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ all_line_level_ocr_results_df,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ comprehend_query_number,
+ all_page_line_level_ocr_results_with_words,
+ )
+ if efficient_ocr_extraction_pass:
+ return early_result + (
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ extraction_results,
+ )
+ return early_result
+
+ # Check if the image already exists in annotations_all_pages
+ existing_index = next(
+ (
+ index
+ for index, ann in enumerate(annotations_all_pages)
+ if ann["image"] == page_image_annotations["image"]
+ ),
+ None,
+ )
+ if existing_index is not None:
+ # Replace the existing annotation
+ annotations_all_pages[existing_index] = page_image_annotations
+ else:
+ # Append new annotation if it doesn't exist
+ annotations_all_pages.append(page_image_annotations)
+
+ current_loop_page += 1
+
+ # Break if new page is a multiple of page_break_val
+ if current_loop_page % page_break_val == 0:
+ page_break_return = True
+ progress.close(_tqdm=progress_bar_redact)
+
+ # Write logs
+ # Filter out empty DataFrames before concatenation to avoid FutureWarning
+ non_empty_decision_process = [
+ df for df in all_pages_decision_process_list if not df.empty
+ ]
+ if non_empty_decision_process:
+ all_pages_decision_process_table = pd.concat(
+ non_empty_decision_process, ignore_index=True
+ )
+ else:
+ all_pages_decision_process_table = pd.DataFrame(
+ columns=[
+ "text",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "label",
+ "start",
+ "end",
+ "score",
+ "page",
+ "id",
+ ]
+ )
+
+ page_break_result = (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ all_line_level_ocr_results_df,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ comprehend_query_number,
+ all_page_line_level_ocr_results_with_words,
+ )
+ if efficient_ocr_extraction_pass:
+ return page_break_result + (
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ extraction_results,
+ )
+ return page_break_result
+
+ # Write all page outputs
+ # Filter out empty DataFrames before concatenation to avoid FutureWarning
+ non_empty_decision_process = [
+ df for df in all_pages_decision_process_list if not df.empty
+ ]
+ if non_empty_decision_process:
+ all_pages_decision_process_table = pd.concat(
+ non_empty_decision_process, ignore_index=True
+ )
+ else:
+ all_pages_decision_process_table = pd.DataFrame(
+ columns=[
+ "text",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "label",
+ "start",
+ "end",
+ "score",
+ "page",
+ "id",
+ ]
+ )
+
+ non_empty_ocr_results = [
+ df for df in all_line_level_ocr_results_list if not df.empty
+ ]
+ if non_empty_ocr_results:
+ all_line_level_ocr_results_df = pd.concat(
+ non_empty_ocr_results, ignore_index=True
+ )
+ else:
+ all_line_level_ocr_results_df = pd.DataFrame(
+ columns=["page", "text", "left", "top", "width", "height", "line", "conf"]
+ )
+
+ if not all_pages_decision_process_table.empty:
+
+ # Convert decision table to relative coordinates
+ all_pages_decision_process_table = divide_coordinates_by_page_sizes(
+ all_pages_decision_process_table,
+ page_sizes_df,
+ xmin="xmin",
+ xmax="xmax",
+ ymin="ymin",
+ ymax="ymax",
+ )
+
+    # Convert OCR results table to relative coordinates
+ if not all_line_level_ocr_results_df.empty:
+
+ all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(
+ all_line_level_ocr_results_df,
+ page_sizes_df,
+ xmin="left",
+ xmax="width",
+ ymin="top",
+ ymax="height",
+ coordinates_in_pdf_points=True,
+ )
+
+ # Remove empty dictionary items from ocr results with words
+ all_page_line_level_ocr_results_with_words = [
+ d for d in all_page_line_level_ocr_results_with_words if d
+ ]
+
+ result = (
+ pymupdf_doc,
+ all_pages_decision_process_table,
+ all_line_level_ocr_results_df,
+ annotations_all_pages,
+ current_loop_page,
+ page_break_return,
+ comprehend_query_number,
+ all_page_line_level_ocr_results_with_words,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ )
+ if efficient_ocr_extraction_pass:
+ return result + (extraction_results,)
+ return result
+
+
+def _pil_ocr_viz_text_size(font: ImageFont.ImageFont, text: str) -> Tuple[int, int]:
+ if not text:
+ text = " "
+ if hasattr(font, "getbbox"):
+ left, top, right, bottom = font.getbbox(text)
+ return max(1, right - left), max(1, bottom - top)
+ dr = ImageDraw.Draw(Image.new("RGB", (1, 1)))
+ bb = dr.textbbox((0, 0), text, font=font)
+ return max(1, bb[2] - bb[0]), max(1, bb[3] - bb[1])
+
+
+def _pil_ocr_viz_text_bbox(
+ font: ImageFont.ImageFont, text: str
+) -> Tuple[int, int, int, int]:
+ if not text:
+ text = " "
+ if hasattr(font, "getbbox"):
+ return font.getbbox(text)
+ dr = ImageDraw.Draw(Image.new("RGB", (1, 1)))
+ return dr.textbbox((0, 0), text, font=font)
+
+
+def _pil_ocr_viz_load_font(font_path: Optional[str], size: int) -> ImageFont.ImageFont:
+ if font_path and os.path.isfile(font_path):
+ try:
+ return ImageFont.truetype(font_path, size)
+ except OSError:
+ pass
+ return ImageFont.load_default()
+
+
+def _ocr_viz_bgr_to_rgb(bgr: Tuple[int, int, int]) -> Tuple[int, int, int]:
+ return (int(bgr[2]), int(bgr[1]), int(bgr[0]))
+
+
+def _ocr_viz_is_punctuation_only_token(text: str) -> bool:
+ s = text.strip()
+ if not s:
+ return False
+ return not any(ch.isalpha() or ch.isdigit() for ch in s)
+
+
+def _ocr_viz_punctuation_only_vertical_fraction(text: str) -> float:
+ """
+ Vertical position within the OCR word box (0 = top, 1 = bottom of box).
+ Dash-like and symmetric separators are centred; comma / full stop class sit lower.
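+
+    Example (illustrative): "-" or ":" -> 0.5 (centred); "." or "," -> 2/3 (lower in the box).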
+ """
+ compact = "".join(ch for ch in text if not ch.isspace())
+ if not compact:
+ return 0.5
+ middle_chars = frozenset("-‐−–—‾_·∙•‧;:|/\\+=*()[]{}\"'«»“”‘’")
+ if all(c in middle_chars for c in compact):
+ return 0.5
+ return 2.0 / 3.0
+
+
+def _ocr_viz_overlay_vertical_ref_y(
+ y1: int, y2: int, text: str, line_y_c: float
+) -> float:
+ if y2 <= y1:
+ return float(line_y_c)
+ if not _ocr_viz_is_punctuation_only_token(text):
+ return float(line_y_c)
+ frac = _ocr_viz_punctuation_only_vertical_fraction(text)
+ return float(y1) + (y2 - y1) * frac
+
+
+def _ocr_viz_draw_position_from_left_edge(
+ text: str,
+ font: ImageFont.ImageFont,
+ left_edge_x: float,
+ ref_y: float,
+ img_width: int,
+ img_height: int,
+ box_y1: Optional[int] = None,
+ box_y2: Optional[int] = None,
+) -> Tuple[int, int]:
+ """Return top-left draw coordinates that place the ink bbox at the requested position.
+
+ If box_y1/box_y2 are provided the rendered ink is also clamped so it never
+ escapes the word's own OCR bounding box vertically.
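+
+    Illustrative example (hypothetical numbers): if the measured text bbox
+    (left, top, right, bottom) is (0, 3, 40, 14), left_edge_x=50 and ref_y=100, then
+    draw_x = 50 and draw_y = round(100 - (3 + 14) / 2) = 92; with box_y1=95 and box_y2=115
+    the clamp range is [95 - 3, 115 - 14] = [92, 101], so draw_y stays at 92.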
+ """
+ try:
+ left, top, right, bottom = _pil_ocr_viz_text_bbox(font, text)
+ except (TypeError, ValueError, OSError):
+ return int(left_edge_x), int(ref_y)
+
+ draw_x = int(round(left_edge_x - left))
+ draw_y = int(round(ref_y - (top + bottom) / 2.0))
+
+ # Clamp vertically to the word's own OCR box so text never escapes it.
+ if box_y1 is not None and box_y2 is not None:
+ # Ideal: ink top ≥ box_y1 and ink bottom ≤ box_y2.
+ min_draw_y = box_y1 - top # ink top lands on box_y1
+ max_draw_y = box_y2 - bottom # ink bottom lands on box_y2
+ if min_draw_y <= max_draw_y:
+ draw_y = max(min_draw_y, min(max_draw_y, draw_y))
+ else:
+ # Ink is taller than the box - centre it (should be rare after height cap).
+ draw_y = int(round((box_y1 + box_y2) / 2.0 - (top + bottom) / 2.0))
+
+ # Finally clamp to image canvas.
+ if draw_x + left < 0:
+ draw_x -= draw_x + left
+ if draw_x + right > img_width:
+ draw_x -= draw_x + right - img_width
+ if draw_y + top < 0:
+ draw_y -= draw_y + top
+ if draw_y + bottom > img_height:
+ draw_y -= draw_y + bottom - img_height
+
+ return draw_x, draw_y
+
+
+def visualise_ocr_words_bounding_boxes(
+ image: Union[str, Image.Image],
+ ocr_results: Dict[str, Any],
+ image_name: str = None,
+ output_folder: str = OUTPUT_FOLDER,
+ text_extraction_method: str = None,
+ visualisation_folder: str = None,
+ add_legend: bool = True,
+ chosen_local_ocr_model: str = None,
+ log_files_output_paths: List[str] = list(),
+ textract_hybrid_bedrock_used: bool = False,
+) -> List[str]:
+ """
+ Visualizes OCR bounding boxes with confidence-based colors and a legend.
+    Handles word-level OCR results from Textract, Tesseract, Paddle, and VLM-based extraction methods.
+
+ Args:
+ image: The PIL Image object or image path
+ ocr_results: Dictionary containing word-level OCR results
+ image_name: Optional name for the saved image file
+ output_folder: Output folder path
+ text_extraction_method: The text extraction method being used (determines folder name)
+ visualisation_folder: Subfolder name for visualizations (auto-determined if not provided)
+ add_legend: Whether to add a legend to the visualization
+ log_files_output_paths: List of file paths used for saving redaction process logging results.
+ textract_hybrid_bedrock_used: When True and Textract is the method, use hybrid Textract+Bedrock
+ folder and label for the saved visualization.
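+
+    Returns:
+        The updated list of log file output paths, with the path of the saved
+        visualisation image appended.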
+ """
+ # Determine visualization folder based on text extraction method
+ # Initialize base_model_name with a default value
+ base_model_name = "OCR" # Default fallback value
+
+ if visualisation_folder is None:
+ if (
+ text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and textract_hybrid_bedrock_used
+ ):
+ base_model_name = "Textract + Bedrock VLM (hybrid)"
+ visualisation_folder = "hybrid_textract_bedrock_visualisations"
+ elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ base_model_name = "Textract"
+ visualisation_folder = "textract_visualisations"
+ elif (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and chosen_local_ocr_model == "tesseract"
+ ):
+ base_model_name = "Tesseract"
+ visualisation_folder = "tesseract_visualisations"
+ elif (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and chosen_local_ocr_model == "hybrid-paddle"
+ ):
+ base_model_name = "Tesseract"
+ visualisation_folder = "hybrid_paddle_visualisations"
+ elif (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and chosen_local_ocr_model == "paddle"
+ ):
+ base_model_name = "Paddle"
+ visualisation_folder = "paddle_visualisations"
+ elif (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and chosen_local_ocr_model == "hybrid-vlm"
+ ):
+ base_model_name = "Tesseract"
+ visualisation_folder = "hybrid_vlm_visualisations"
+ elif (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and chosen_local_ocr_model == "hybrid-paddle-vlm"
+ ):
+ base_model_name = "Paddle"
+ visualisation_folder = "hybrid_paddle_vlm_visualisations"
+ elif (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and chosen_local_ocr_model == "hybrid-paddle-inference-server"
+ ):
+ base_model_name = "Paddle"
+ visualisation_folder = "hybrid_paddle_inference_server_visualisations"
+ elif (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and chosen_local_ocr_model == "vlm"
+ ):
+ base_model_name = "VLM"
+ visualisation_folder = "vlm_visualisations"
+ elif (
+ text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and chosen_local_ocr_model == "inference-server"
+ ):
+ base_model_name = "Inference Server"
+ visualisation_folder = "inference_server_visualisations"
+ elif text_extraction_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION:
+ # Matches word-level model string from _parse_vlm_page_ocr_response (model_name="Bedrock")
+ base_model_name = "Bedrock"
+ visualisation_folder = "bedrock_visualisations"
+ elif text_extraction_method == GEMINI_VLM_TEXT_EXTRACT_OPTION:
+ base_model_name = "Gemini"
+ visualisation_folder = "gemini_visualisations"
+ elif text_extraction_method == AZURE_OPENAI_VLM_TEXT_EXTRACT_OPTION:
+ base_model_name = "Azure/OpenAI"
+ visualisation_folder = "azure_openai_visualisations"
+ else:
+ base_model_name = "OCR"
+ visualisation_folder = "ocr_visualisations"
+
+ if not ocr_results:
+ return log_files_output_paths
+
+ if isinstance(image, str):
+ image = Image.open(image)
+ # Convert PIL image to OpenCV format
+ image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+ # Get image dimensions
+ height, width = image_cv.shape[:2]
+
+ # Detect if coordinates need conversion from PyMuPDF to image space
+ # This happens when Textract uses mediabox dimensions (PyMuPDF coordinates)
+ # instead of image pixel dimensions
+ # For non-Textract methods (VLM/inference-server), coordinates should already be in image pixel space,
+ # but we need to check if there's a size mismatch between coordinate space and visualization image
+ needs_coordinate_conversion = False
+ source_width = width
+ source_height = height
+
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ # Collect all bounding box coordinates to detect coordinate system
+ all_x_coords = []
+ all_y_coords = []
+
+ for line_key, line_data in ocr_results.items():
+ if not isinstance(line_data, dict) or "words" not in line_data:
+ continue
+ words = line_data.get("words", [])
+ for word_data in words:
+ if not isinstance(word_data, dict):
+ continue
+ bbox = word_data.get("bounding_box", (0, 0, 0, 0))
+ if len(bbox) == 4:
+ x1, y1, x2, y2 = bbox
+ all_x_coords.extend([x1, x2])
+ all_y_coords.extend([y1, y2])
+ else:
+ # For non-Textract methods (Tesseract, VLM, inference-server, etc.),
+ # coordinates should be in image pixel space, but check if there's a size mismatch
+ # Collect all bounding box coordinates to detect coordinate space
+ all_x_coords = []
+ all_y_coords = []
+
+ for line_key, line_data in ocr_results.items():
+ if not isinstance(line_data, dict) or "words" not in line_data:
+ continue
+ words = line_data.get("words", [])
+ for word_data in words:
+ if not isinstance(word_data, dict):
+ continue
+ bbox = word_data.get("bounding_box", (0, 0, 0, 0))
+ if len(bbox) == 4:
+ x1, y1, x2, y2 = bbox
+ all_x_coords.extend([x1, x2])
+ all_y_coords.extend([y1, y2])
+
+ # Calculate scaling factors if conversion is needed
+ if needs_coordinate_conversion:
+ scale_x = width / source_width
+ scale_y = height / source_height
+ else:
+ scale_x = 1.0
+ scale_y = 1.0
+
+ # Define confidence ranges and colors for bounding boxes (bright colors)
+ confidence_ranges = [
+ (80, 100, (0, 255, 0), "High (80-100%)"), # Green
+ (50, 79, (0, 165, 255), "Medium (50-79%)"), # Orange
+ (0, 49, (0, 0, 255), "Low (0-49%)"), # Red
+ ]
+
+ # Define darker colors for text on white background
+ text_confidence_ranges = [
+ (80, 100, (0, 150, 0), "High (80-100%)"), # Dark Green
+ (50, 79, (0, 100, 200), "Medium (50-79%)"), # Dark Orange
+ (0, 49, (0, 0, 180), "Low (0-49%)"), # Dark Red
+ ]
+
+ # Process each line's words
+ for line_key, line_data in ocr_results.items():
+ if not isinstance(line_data, dict) or "words" not in line_data:
+ continue
+
+ words = line_data.get("words", [])
+
+ # Process each word in the line
+ for word_data in words:
+ if not isinstance(word_data, dict):
+ continue
+
+ text = word_data.get("text", "")
+ # Handle both 'conf' and 'confidence' field names for compatibility
+ conf = int(word_data.get("conf", word_data.get("confidence", 0)))
+
+ # Skip empty text or invalid confidence
+ if not text.strip() or conf == -1:
+ continue
+
+ # Get bounding box coordinates
+ bbox = word_data.get("bounding_box", (0, 0, 0, 0))
+ if len(bbox) != 4:
+ continue
+
+ x1, y1, x2, y2 = bbox
+
+ # Convert coordinates if needed (from PyMuPDF to image space)
+ if needs_coordinate_conversion:
+ x1 = x1 * scale_x
+ y1 = y1 * scale_y
+ x2 = x2 * scale_x
+ y2 = y2 * scale_y
+
+ # Ensure coordinates are within image bounds
+ x1 = max(0, min(int(x1), width))
+ y1 = max(0, min(int(y1), height))
+ x2 = max(0, min(int(x2), width))
+ y2 = max(0, min(int(y2), height))
+
+ # Skip if bounding box is invalid
+ if x2 <= x1 or y2 <= y1:
+ continue
+
+ # Check if word was replaced by a different model (e.g. Bedrock VLM in hybrid Textract route)
+ model = word_data.get("model", None)
+ is_replaced = model and model.lower() != base_model_name.lower()
+
+ # Determine bounding box color: grey for replaced words, otherwise by confidence
+ box_color = (0, 0, 255) # Default to red
+ if is_replaced:
+ box_color = (128, 128, 128) # Grey for model-replaced words
+ else:
+ for min_conf, max_conf, conf_color, _ in confidence_ranges:
+ if min_conf <= conf <= max_conf:
+ box_color = conf_color
+ break
+ cv2.rectangle(image_cv, (x1, y1), (x2, y2), box_color, 1)
+
+ # Show model replacement in legend when using a model that can have VLM/inference-server replacements
+ # or when hybrid Textract + Bedrock VLM was used (some lines/words may be from Bedrock VLM)
+ show_model_replacement_legend = (
+ textract_hybrid_bedrock_used
+ or chosen_local_ocr_model
+ in (
+ "hybrid-paddle-inference-server",
+ "hybrid-paddle-vlm",
+ "hybrid-vlm",
+ "inference-server",
+ )
+ )
+ # Add legend
+ if add_legend:
+ add_confidence_legend(
+ image_cv,
+ confidence_ranges,
+ show_model_replacement=show_model_replacement_legend,
+ )
+
+ # Second page: recognised text (PIL + system TTF — OpenCV putText is Latin-only → '?' for Cyrillic/CJK/etc.)
+ viz_font_path = get_ocr_visualisation_font_path()
+ text_page_rgb = Image.new("RGB", (width, height), (255, 255, 255))
+ draw_pil = ImageDraw.Draw(text_page_rgb)
+
+ # Pre-pass: compute a per-line representative font size and vertical centre so that
+ # all words in the same line share the same font size and are aligned on the same
+ # baseline, regardless of how tall their individual bounding boxes are.
+ absolute_max_font_pt = 56
+ max_text_height_fraction = 0.55 # normal cap: text ≤ 55% of box height
+ max_text_height_fraction_small = (
+ 0.95 # relaxed cap for small boxes where strict cap would be unreadable
+ )
+ min_readable_px = (
+ 10 # if strict cap gives fewer pixels than this, use the relaxed cap
+ )
+ line_font_sizes: dict = {}
+ line_y_centres: dict = {}
+ all_word_box_heights: List[int] = []
+
+ for _lk, _ld in ocr_results.items():
+ if not isinstance(_ld, dict) or "words" not in _ld:
+ continue
+ _valid_bboxes = []
+ for _wd in _ld["words"]:
+ if not isinstance(_wd, dict):
+ continue
+ if not _wd.get("text", "").strip():
+ continue
+ if int(_wd.get("conf", _wd.get("confidence", 0))) == -1:
+ continue
+ _bb = _wd.get("bounding_box", (0, 0, 0, 0))
+ if len(_bb) != 4:
+ continue
+ _bx1, _by1, _bx2, _by2 = _bb
+ if needs_coordinate_conversion:
+ _bx1, _by1 = _bx1 * scale_x, _by1 * scale_y
+ _bx2, _by2 = _bx2 * scale_x, _by2 * scale_y
+ _bx1 = max(0, min(int(_bx1), width))
+ _by1 = max(0, min(int(_by1), height))
+ _bx2 = max(0, min(int(_bx2), width))
+ _by2 = max(0, min(int(_by2), height))
+ if _bx2 > _bx1 and _by2 > _by1:
+ _valid_bboxes.append((_bx1, _by1, _bx2, _by2))
+ all_word_box_heights.append(_by2 - _by1)
+ if not _valid_bboxes:
+ continue
+ _line_y1 = min(b[1] for b in _valid_bboxes)
+ _line_y2 = max(b[3] for b in _valid_bboxes)
+ line_y_centres[_lk] = (_line_y1 + _line_y2) / 2
+ # Use the median box height so outliers (e.g. very tall or very short boxes)
+ # don't skew the font size for the whole line.
+ _heights = sorted(b[3] - b[1] for b in _valid_bboxes)
+ _representative_h = _heights[len(_heights) // 2]
+ line_font_sizes[_lk] = min(160, max(8, int(_representative_h * 1.8)))
+
+ # Page-wide ceiling so a few mistaken huge boxes cannot drive oversized type.
+ if all_word_box_heights:
+ viz_global_max_font_pt = min(
+ 160,
+ max(8, int(statistics.median(all_word_box_heights) * 1.8)),
+ )
+ else:
+ viz_global_max_font_pt = 160
+ viz_global_max_font_pt = min(viz_global_max_font_pt, absolute_max_font_pt)
+
+ # Process each line's words for text overlay
+ for line_key, line_data in ocr_results.items():
+ if not isinstance(line_data, dict) or "words" not in line_data:
+ continue
+
+ words = line_data.get("words", [])
+
+ # Group words by bounding box (to handle cases where multiple words share the same box)
+ # Use a small tolerance to consider boxes as "the same" if they're very close
+ bbox_tolerance = 5 # pixels
+ bbox_groups = {} # Maps (x1, y1, x2, y2) to list of word_data
+
+ for word_data in words:
+ if not isinstance(word_data, dict):
+ continue
+
+ text = word_data.get("text", "")
+ # Handle both 'conf' and 'confidence' field names for compatibility
+ conf = int(word_data.get("conf", word_data.get("confidence", 0)))
+
+ # Skip empty text or invalid confidence
+ if not text.strip() or conf == -1:
+ continue
+
+ # Get bounding box coordinates
+ bbox = word_data.get("bounding_box", (0, 0, 0, 0))
+ if len(bbox) != 4:
+ continue
+
+ x1, y1, x2, y2 = bbox
+
+ # Convert coordinates if needed (from PyMuPDF to image space)
+ if needs_coordinate_conversion:
+ x1 = x1 * scale_x
+ y1 = y1 * scale_y
+ x2 = x2 * scale_x
+ y2 = y2 * scale_y
+
+ # Ensure coordinates are within image bounds
+ x1 = max(0, min(int(x1), width))
+ y1 = max(0, min(int(y1), height))
+ x2 = max(0, min(int(x2), width))
+ y2 = max(0, min(int(y2), height))
+
+ # Skip if bounding box is invalid
+ if x2 <= x1 or y2 <= y1:
+ continue
+
+ # Round coordinates to nearest tolerance to group similar boxes
+ x1_rounded = (x1 // bbox_tolerance) * bbox_tolerance
+ y1_rounded = (y1 // bbox_tolerance) * bbox_tolerance
+ x2_rounded = (x2 // bbox_tolerance) * bbox_tolerance
+ y2_rounded = (y2 // bbox_tolerance) * bbox_tolerance
+
+ bbox_key = (x1_rounded, y1_rounded, x2_rounded, y2_rounded)
+
+ if bbox_key not in bbox_groups:
+ bbox_groups[bbox_key] = []
+ bbox_groups[bbox_key].append(
+ {"word_data": word_data, "original_bbox": (x1, y1, x2, y2)}
+ )
+
+ # Process each group of words
+ for bbox_key, word_group in bbox_groups.items():
+ if not word_group:
+ continue
+
+ # Use the first word's bounding box as the reference (they should all be similar)
+ x1, y1, x2, y2 = word_group[0]["original_bbox"]
+ box_width = x2 - x1
+ box_height = y2 - y1
+
+ # If only one word in the box, process it normally
+ if len(word_group) == 1:
+ word_data = word_group[0]["word_data"]
+ text = word_data.get("text", "")
+ conf = int(word_data.get("conf", word_data.get("confidence", 0)))
+
+ # Check if word was replaced by a different model
+ model = word_data.get("model", None)
+ is_replaced = model and model.lower() != base_model_name.lower()
+
+ # Text color always by confidence (replaced words still show confidence; box stays grey)
+ text_color = (0, 0, 180) # Default to dark red (BGR)
+ for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
+ if min_conf <= conf <= max_conf:
+ text_color = conf_color
+ break
+
+ # Use the line-level font size cap so all words in a line stay
+ # visually consistent; only reduce further if the word is too wide.
+ max_pt = min(
+ line_font_sizes.get(
+ line_key, min(160, max(8, int(box_height * 1.8)))
+ ),
+ viz_global_max_font_pt,
+ )
+ strict_h = int(box_height * max_text_height_fraction)
+ allowed_text_height = max(
+ 1,
+ (
+ int(box_height * max_text_height_fraction_small)
+ if strict_h < min_readable_px
+ else strict_h
+ ),
+ )
+ min_pt = 1
+ pil_font = _pil_ocr_viz_load_font(viz_font_path, min_pt)
+ tw = 1
+ th = 1
+ for pt in range(max_pt, min_pt - 1, -1):
+ pil_font = _pil_ocr_viz_load_font(viz_font_path, pt)
+ tw, th = _pil_ocr_viz_text_size(pil_font, text)
+ if tw <= box_width * 0.95 and th <= allowed_text_height:
+ break
+ text_left_edge = x1 + (box_width - tw) / 2.0
+ # Word-like tokens: align to line vertical centre. Punctuation-only tokens use a
+ # fraction of the OCR box height so glyphs are not pushed to the bottom when the
+ # font bbox is much taller than a tight comma/period box.
+ line_y_c = line_y_centres.get(line_key, y1 + box_height / 2)
+ ref_y = _ocr_viz_overlay_vertical_ref_y(y1, y2, text, line_y_c)
+ draw_x, draw_y = _ocr_viz_draw_position_from_left_edge(
+ text, pil_font, text_left_edge, ref_y, width, height, y1, y2
+ )
+ draw_pil.text(
+ (draw_x, draw_y),
+ text,
+ font=pil_font,
+ fill=_ocr_viz_bgr_to_rgb(text_color),
+ )
+
+ if is_replaced:
+ draw_pil.rectangle(
+ [x1, y1, x2, y2],
+ outline=(128, 128, 128),
+ width=1,
+ )
+
+ else:
+ # Multiple words in the same box - arrange them side by side
+ # Extract texts and determine colors for each word
+ word_texts = []
+ word_colors = []
+ word_is_replaced = []
+
+ for item in word_group:
+ word_data = item["word_data"]
+ text = word_data.get("text", "")
+ conf = int(word_data.get("conf", word_data.get("confidence", 0)))
+ model = word_data.get("model", None)
+ is_replaced = model and model.lower() != base_model_name.lower()
+
+ # Text color always by confidence (replaced words still show confidence; box stays grey)
+ text_color = (0, 0, 180) # Default to dark red
+ for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
+ if min_conf <= conf <= max_conf:
+ text_color = conf_color
+ break
+
+ word_texts.append(text)
+ word_colors.append(text_color)
+ word_is_replaced.append(is_replaced)
+
+ n = len(word_texts)
+ # Use the line-level font size cap so all words in a line stay
+ # visually consistent; only reduce further if the combined text is too wide.
+ max_pt = min(
+ line_font_sizes.get(
+ line_key, min(160, max(8, int(box_height * 1.8)))
+ ),
+ viz_global_max_font_pt,
+ )
+ strict_h = int(box_height * max_text_height_fraction)
+ allowed_text_height = max(
+ 1,
+ (
+ int(box_height * max_text_height_fraction_small)
+ if strict_h < min_readable_px
+ else strict_h
+ ),
+ )
+ min_pt = 1
+ pil_font = _pil_ocr_viz_load_font(viz_font_path, min_pt)
+ total_width = 0
+ max_text_height = 0
+ space_w = 1
+ for pt in range(max_pt, min_pt - 1, -1):
+ pil_font = _pil_ocr_viz_load_font(viz_font_path, pt)
+ total_width = 0
+ max_text_height = 0
+ space_w, _ = _pil_ocr_viz_text_size(pil_font, " ")
+ for i, wtext in enumerate(word_texts):
+ tw, th = _pil_ocr_viz_text_size(pil_font, wtext)
+ total_width += tw
+ max_text_height = max(max_text_height, th)
+ if i < n - 1:
+ total_width += space_w
+ if (
+ total_width <= box_width * 0.95
+ and max_text_height <= allowed_text_height
+ ):
+ break
+
+ current_x = x1 + (box_width - total_width) // 2
+ line_y_c = line_y_centres.get(line_key, y1 + box_height / 2)
+ if all(_ocr_viz_is_punctuation_only_token(wt) for wt in word_texts):
+ ref_y = _ocr_viz_overlay_vertical_ref_y(
+ y1, y2, "".join(word_texts), line_y_c
+ )
+ else:
+ ref_y = float(line_y_c)
+ for i, (wtext, text_color) in enumerate(zip(word_texts, word_colors)):
+ draw_x, draw_y = _ocr_viz_draw_position_from_left_edge(
+ wtext, pil_font, float(current_x), ref_y, width, height, y1, y2
+ )
+ draw_pil.text(
+ (draw_x, draw_y),
+ wtext,
+ font=pil_font,
+ fill=_ocr_viz_bgr_to_rgb(text_color),
+ )
+ tw, _ = _pil_ocr_viz_text_size(pil_font, wtext)
+ current_x += tw
+ if i < n - 1:
+ current_x += space_w
+
+ if any(word_is_replaced):
+ draw_pil.rectangle(
+ [x1, y1, x2, y2],
+ outline=(128, 128, 128),
+ width=1,
+ )
+
+ text_page = cv2.cvtColor(np.asarray(text_page_rgb), cv2.COLOR_RGB2BGR)
+
+ # Add legend to second page
+ if add_legend:
+ add_confidence_legend(
+ text_page,
+ text_confidence_ranges,
+ show_model_replacement=show_model_replacement_legend,
+ )
+
+ # Concatenate images horizontally
+ combined_image = np.hstack([image_cv, text_page])
+
+ # Save the visualization
+ if output_folder:
+ textract_viz_folder = os.path.join(output_folder, visualisation_folder)
+
+ # Double-check the constructed path is safe
+ if not validate_folder_containment(textract_viz_folder, OUTPUT_FOLDER):
+ raise ValueError(
+ f"Unsafe textract visualisations folder path: {textract_viz_folder}"
+ )
+
+ os.makedirs(textract_viz_folder, exist_ok=True)
+
+ # Generate filename
+ if image_name:
+            # Extract the page number if image_name ends with an "_<number>" suffix
+ # This handles cases like "document_1", "document.pdf_1", etc.
+ page_number = None
+ page_match = re.search(r"_(\d+)$", image_name)
+ if page_match:
+ page_number = page_match.group(1)
+ # Remove the page number suffix from image_name for base_name extraction
+ image_name_without_page = image_name[: page_match.start()]
+ else:
+ image_name_without_page = image_name
+
+ # Remove file extension if present
+ base_name = os.path.splitext(image_name_without_page)[0]
+
+ # Include page number in filename if it was found
+ if page_number:
+ filename = f"{base_name}_page_{page_number}_{visualisation_folder}.jpg"
+ else:
+ filename = f"{base_name}_{visualisation_folder}.jpg"
+ else:
+ timestamp = int(time.time())
+ filename = f"{visualisation_folder}_{timestamp}.jpg"
+
+ output_path = os.path.join(textract_viz_folder, filename)
+
+ # Save the combined image. Ensure that image file size is 500kb or less
+
+ max_filesize = 500 * 1024 # 500kb in bytes
+ quality = 95 # Start high, OpenCV JPEG quality range is 0-100
+
+ # Try lowering JPEG quality until file is below size limit
+ is_saved = False
+ while quality >= 10:
+ cv2.imwrite(
+ output_path, combined_image, [int(cv2.IMWRITE_JPEG_QUALITY), quality]
+ )
+ if (
+ os.path.exists(output_path)
+ and os.path.getsize(output_path) <= max_filesize
+ ):
+ is_saved = True
+ break
+ quality -= 5
+
+ if not is_saved:
+            # Save at the lowest quality if the file cannot be compressed under 500kb
+ cv2.imwrite(
+ output_path, combined_image, [int(cv2.IMWRITE_JPEG_QUALITY), 10]
+ )
+ # Optionally log warning here that file could not be compressed below 500kb
+
+ log_files_output_paths.append(output_path)
+
+ return log_files_output_paths
+
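+# Illustrative usage sketch (variable names such as page_ocr_results_with_words are
+# hypothetical placeholders, not part of this module's API):
+# log_paths = visualise_ocr_words_bounding_boxes(
+#     image="page_1.png",
+#     ocr_results=page_ocr_results_with_words,
+#     image_name="document_1",
+#     text_extraction_method=TEXTRACT_TEXT_EXTRACT_OPTION,
+#     log_files_output_paths=[],
+# )
+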
+
+def add_confidence_legend(
+ image_cv: np.ndarray,
+ confidence_ranges: List[Tuple],
+ show_model_replacement: bool = False,
+) -> None:
+ """
+ Adds a confidence legend to the visualization image.
+
+ Args:
+ image_cv: OpenCV image array
+ confidence_ranges: List of tuples containing (min_conf, max_conf, color, label)
+ show_model_replacement: Whether to include a legend entry for model replacements (grey)
+ """
+ height, width = image_cv.shape[:2]
+
+ # Calculate legend height based on number of items
+ num_items = len(confidence_ranges)
+ if show_model_replacement:
+ num_items += 1
+
+ # Scale the entire legend to ~13% of image width so it never dominates.
+ legend_width = max(90, min(200, int(width * 0.13)))
+ scale = legend_width / 200.0 # proportional to original 200px baseline
+
+ font_scale_title = max(0.28, round(0.55 * scale, 2))
+ font_scale_label = max(0.22, round(0.45 * scale, 2))
+ item_spacing = max(12, int(22 * scale))
+ box_size = max(7, int(13 * scale))
+ margin = max(4, int(8 * scale))
+
+ (_, title_h), _ = cv2.getTextSize(
+ "Confidence Levels", cv2.FONT_HERSHEY_SIMPLEX, font_scale_title, 1
+ )
+ legend_height = title_h + margin * 3 + num_items * item_spacing + margin
+
+ outer_pad = max(4, int(14 * scale))
+ legend_x = width - legend_width - outer_pad
+ legend_y = outer_pad
+
+ # Translucent white background
+ overlay = image_cv.copy()
+ cv2.rectangle(
+ overlay,
+ (legend_x, legend_y),
+ (legend_x + legend_width, legend_y + legend_height),
+ (255, 255, 255),
+ -1,
+ )
+ cv2.addWeighted(overlay, 0.5, image_cv, 0.5, 0, image_cv)
+
+ # Title
+ title_text = "Confidence Levels"
+ (title_w, title_h), _ = cv2.getTextSize(
+ title_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale_title, 1
+ )
+ title_x = legend_x + max(0, (legend_width - title_w) // 2)
+ title_y = legend_y + title_h + margin
+ cv2.putText(
+ image_cv,
+ title_text,
+ (title_x, title_y),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ font_scale_title,
+ (0, 0, 0),
+ 1,
+ )
+
+ start_y = title_y + item_spacing
+ item_index = 0
+
+ def _draw_legend_item(img, lx, iy, bsz, col, lbl, lbl_scale, mgn):
+ bx = lx + mgn
+ by = iy - bsz
+ cv2.rectangle(img, (bx, by), (bx + bsz, by + bsz), col, -1)
+ cv2.rectangle(img, (bx, by), (bx + bsz, by + bsz), (0, 0, 0), 1)
+ cv2.putText(
+ img,
+ lbl,
+ (bx + bsz + mgn, iy - max(1, mgn // 3)),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ lbl_scale,
+ (0, 0, 0),
+ 1,
+ )
+
+ if show_model_replacement:
+ _draw_legend_item(
+ image_cv,
+ legend_x,
+ start_y + item_index * item_spacing,
+ box_size,
+ (128, 128, 128),
+ "Model Replacement",
+ font_scale_label,
+ margin,
+ )
+ item_index += 1
+
+ for i, (min_conf, max_conf, color, label) in enumerate(confidence_ranges):
+ _draw_legend_item(
+ image_cv,
+ legend_x,
+ start_y + (item_index + i) * item_spacing,
+ box_size,
+ color,
+ label,
+ font_scale_label,
+ margin,
+ )
diff --git a/tools/find_duplicate_pages.py b/tools/find_duplicate_pages.py
new file mode 100644
index 0000000000000000000000000000000000000000..b62d2742fd33da0139ad803d22b670cba30422a9
--- /dev/null
+++ b/tools/find_duplicate_pages.py
@@ -0,0 +1,2190 @@
+import os
+import re
+import time
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import gradio as gr
+import pandas as pd
+import pymupdf
+from gradio import Progress
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+from tools.config import MAX_SIMULTANEOUS_FILES
+from tools.file_conversion import (
+ convert_annotation_data_to_dataframe,
+ fill_missing_box_ids_each_box,
+ word_level_ocr_df_to_line_level_ocr_df,
+)
+from tools.file_redaction import redact_whole_pymupdf_page
+from tools.helper_functions import OUTPUT_FOLDER
+from tools.load_spacy_model_custom_recognisers import nlp
+from tools.secure_path_utils import (
+ secure_path_join,
+ validate_folder_containment,
+ validate_path_safety,
+)
+
+number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
+ID_MULTIPLIER = 100000
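+# Pages and line numbers are packed into one sortable integer as page * ID_MULTIPLIER + line
+# (illustrative example: page 17, line 42 -> 17 * 100000 + 42 = 1700042). This assumes fewer
+# than ID_MULTIPLIER lines per page; the inverse split is integer division and modulo by
+# ID_MULTIPLIER.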
+# Define the set of punctuation characters for efficient lookup
+PUNCTUATION_TO_STRIP = {".", ",", "?", "!", ":", ";"}
+
+
+def _normalize_page_to_int(page: Any) -> Optional[int]:
+ """
+ Convert a page identifier to an integer page number.
+ Handles: int, numeric string, 'PageN_...' labels, and path/filenames.
+ Returns None if no page number can be determined.
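+
+    Examples (illustrative): 3 -> 3, "7" -> 7, "Page12_report" -> 12,
+    "A17_FlightPlan_2023" -> 17.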
+ """
+ if page is None:
+ return None
+ try:
+ return int(page)
+ except (ValueError, TypeError):
+ pass
+ s = str(page).strip()
+ # "Page1_File" style label
+ m = re.search(r"Page(\d+)_", s, re.IGNORECASE)
+ if m:
+ return int(m.group(1))
+ # Path/filename with _N.png
+ from tools.secure_regex_utils import safe_extract_page_number_from_path
+
+ n = safe_extract_page_number_from_path(s)
+ if n is not None:
+ return n
+ # First contiguous digit sequence (e.g. "A17_FlightPlan_..." -> 17)
+ m = re.search(r"(\d{1,10})", s)
+ if m:
+ return int(m.group(1))
+ return None
+
+
+def split_text_with_punctuation(text: str) -> List[str]:
+ """
+ A more concise version of the tokenization function using a single
+ powerful regex with re.findall.
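+
+    Example (illustrative): "Hello, world!" -> ["Hello", ",", "world", "!"].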
+ """
+ # This single regex pattern finds either:
+ # 1. A sequence of one or more punctuation marks `[.,?!:;]+`
+ # 2. OR a sequence of one or more characters that are NOT punctuation or whitespace `[^.,?!:;\s]+`
+ pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")
+
+ final_list = list()
+ # We first split by whitespace to handle sentences correctly
+ for word in text.split():
+ # Then, for each whitespace-separated word, we tokenize it further
+ final_list.extend(pattern.findall(word))
+
+ return final_list
+
+
+def extract_indices_from_page_ranges(
+ results_df: pd.DataFrame,
+ start_col: str = "Page2_Start_Page",
+ end_col: str = "Page2_End_Page",
+    modulo_divisor_number_of_zeros: int = number_of_zeros_to_add_to_index,  # Number of zeros used when the page/line index was encoded
+    converted_index: bool = False,  # True if the index uses the encoded page/line format and needs decoding back to original indices
+) -> List[int]:
+    all_indices = set()
+
+ for _, row in results_df.iterrows():
+ start_page = row[start_col]
+ end_page = row[end_col]
+ for encoded_page_id in range(start_page, end_page + 1):
+ if converted_index is True:
+ original_page, original_index = _parse_page_line_id(
+ encoded_page_id
+                )
+ else:
+ original_index = encoded_page_id
+
+ all_indices.add(original_index)
+ return sorted(list(all_indices))
+
+
+def punctuation_at_word_text_end(word_level_df_orig: pd.DataFrame) -> bool:
+ """
+ Check the first 1000 rows of word_level_df_orig to see if any of the strings
+ in 'word_text' end with a full stop '.', exclamation mark '!', or question mark '?',
+ for strings that do not contain these characters alone.
+
+ Args:
+ word_level_df_orig (pd.DataFrame): DataFrame containing word-level OCR data with 'word_text' column
+
+ Returns:
+ bool: True if any strings end with punctuation marks, False otherwise
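+
+    Example (illustrative): a sample containing the token "applying." returns True, while a
+    sample containing only "applying" and standalone "." tokens returns False.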
+ """
+ # Get the first 1000 rows or all rows if less than 1000
+ sample_df = word_level_df_orig.head(1000)
+
+ # Check if 'word_text' column exists
+ if "word_text" not in sample_df.columns:
+ return False
+
+ # Define punctuation marks to check for
+ punctuation_marks = [".", "!", "?"]
+
+ # Check each word_text string
+ for word_text in sample_df["word_text"]:
+ if pd.isna(word_text) or not isinstance(word_text, str):
+ continue
+
+ # Skip strings that contain only punctuation marks
+ if word_text.strip() in punctuation_marks:
+ continue
+
+ # Check if the string ends with any of the punctuation marks
+ if any(word_text.rstrip().endswith(punct) for punct in punctuation_marks):
+ return True
+
+ return False
+
+
+def run_full_search_and_analysis(
+ search_query_text: str,
+ word_level_df_orig: pd.DataFrame,
+ similarity_threshold: float = 1,
+ combine_pages: bool = False,
+ min_word_count: int = 1,
+ min_consecutive_pages: int = 1,
+ greedy_match: bool = True,
+ remake_index: bool = False,
+ use_regex: bool = False,
+ progress=gr.Progress(track_tqdm=True),
+):
+ """
+ This function orchestrates the entire pipeline for finding duplicate pages based on a user's search query. It takes in the search query text, the original word-level OCR data, and various parameters to control the analysis. The function then:
+
+ 1. Converts the user's search query into a DataFrame format suitable for analysis.
+ 2. Prepares the main word-level OCR data for processing by converting it into the required format.
+ 3. Combines the search query DataFrame with the prepared OCR data DataFrame.
+ 4. Executes the similarity analysis on the combined data using the specified parameters such as similarity threshold, minimum word count, minimum consecutive pages, and greedy match strategy.
+
+ Parameters:
+ - search_query_text (str): The text entered by the user to search for in the OCR data. If use_regex=True, this is treated as a regex pattern.
+ - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
+ - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
+ - combine_pages (bool, optional): A flag indicating whether to combine text from the same page number within a file. Defaults to False.
+ - min_word_count (int, optional): The minimum number of words required for a page to be considered in the analysis. Defaults to 1.
+ - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
+ - greedy_match (bool, optional): A flag indicating whether to use a greedy strategy for matching consecutive pages. Defaults to True.
+ - remake_index (bool, optional): A flag indicating whether to remake the index of the DataFrame during processing. Defaults to False.
+ - use_regex (bool, optional): If True, treats search_query_text as a regex pattern instead of literal text. Defaults to False.
+ - progress (gr.Progress, optional): A Progress object to track the progress of the operation. Defaults to a Progress object with track_tqdm set to True.
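+
+    Returns:
+    - The word-level rows from word_level_df_orig that matched the search query, along with the
+      duplicate file outputs and the full match data produced by identify_similar_text_sequences.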
+ """
+
+ if len(search_query_text) < 3:
+ raise Warning("Please use a search query with at least three letters.")
+ if len(search_query_text) > 100:
+ raise Warning("Please use a search query with at less than 100 characters.")
+
+ # For regex mode, we handle the query differently
+ if use_regex:
+ # Validate regex pattern
+ try:
+ re.compile(search_query_text)
+ except re.error as e:
+ raise Warning(f"Invalid regex pattern: {e}")
+
+ # For regex, we don't split into words - treat as single pattern
+ # Create a minimal DataFrame structure for the regex pattern
+ search_query_data = [
+ (
+ "user_search_query",
+ pd.DataFrame({"page": [1], "text": [search_query_text], "line": [1]}),
+ )
+ ]
+ query_word_length = 1 # For regex, we'll handle matching differently
+ min_consecutive_pages = 1 # Regex matches can be variable length
+ else:
+ # Original literal text matching logic
+ if punctuation_at_word_text_end(word_level_df_orig) is True:
+ do_punctuation_split = False
+ else:
+ do_punctuation_split = True
+
+ # Step 1: Process the user's search query string
+ search_query_data, query_word_length = create_dataframe_from_string(
+ search_query_text,
+ file_name="user_search_query",
+ split_words=True,
+ split_punctuation=do_punctuation_split,
+ )
+ if not search_query_data:
+ # Handle case where user submits an empty search string
+ raise Warning("Could not convert search string to required format")
+
+ if query_word_length > 25:
+            # Handle case where the search query is too long
+            raise Warning("Please use a query with no more than 25 words")
+
+ # Overwrite min_consecutive_pages with the search string length
+ min_consecutive_pages = query_word_length
+
+ # Create word index from reference table
+
+ if word_level_df_orig.empty:
+ raise gr.Error(
+ "No word-level data to process. Please check that you have loaded in OCR data."
+ )
+
+ word_level_df_orig["index"] = word_level_df_orig.index
+ word_level_df = word_level_df_orig.copy()
+
+ # Step 2: Process the main word-level OCR DataFrame
+ word_level_data = convert_word_level_df(word_level_df, file_name="source_document")
+
+ # Step 3: Combine both data sources into one list
+ all_data_to_process = search_query_data + word_level_data
+ if not all_data_to_process:
+ raise gr.Error("No data to process. Please check your inputs.")
+
+ # Step 4: Run the combination logic
+ combined_df, _, full_out_ocr_df = combine_ocr_dataframes(
+ input_data=all_data_to_process,
+ combine_pages=combine_pages,
+ output_folder=None, # No need to save this intermediate file
+ remake_index=remake_index,
+ )
+
+ # Step 5: Run the final similarity analysis on the combined data
+ results_df, duplicate_files, full_data = identify_similar_text_sequences(
+ df_combined=combined_df,
+ similarity_threshold=similarity_threshold,
+ min_word_count=min_word_count,
+ min_consecutive_pages=min_consecutive_pages,
+ greedy_match=greedy_match,
+ combine_pages=combine_pages,
+ inter_file_only=True,
+ do_text_clean=False,
+ file1_name="user_search_query",
+ file2_name="source_document",
+ use_regex=use_regex,
+ progress=progress,
+ )
+
+ print("Finished text search")
+
+ # Map the results back to the reference data file
+ if remake_index is True:
+ results_df_index_list = extract_indices_from_page_ranges(
+ results_df, converted_index=True
+ )
+ else:
+ results_df_index_list = extract_indices_from_page_ranges(
+ results_df, converted_index=False
+ )
+
+ word_level_df_out = word_level_df_orig.loc[
+ word_level_df_orig["index"].isin(results_df_index_list),
+ ["page", "line", "word_text", "index"],
+ ]
+
+ return word_level_df_out, duplicate_files, full_data
+
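+# Illustrative call sketch ("word_df" is a hypothetical word-level OCR DataFrame with
+# "page", "line" and "word_text" columns loaded elsewhere):
+# matches, duplicate_files, full_data = run_full_search_and_analysis(
+#     "John Smith", word_df, similarity_threshold=1, min_word_count=1
+# )
+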
+
+def create_all_data_to_process(
+ converted_data: pd.DataFrame, other_data_list: List[Tuple]
+):
+ all_data_to_process = converted_data + other_data_list
+ return all_data_to_process
+
+
+def convert_word_level_df(
+ word_level_df: pd.DataFrame, file_name: str = "converted_dataframe"
+) -> List[Tuple[str, pd.DataFrame]]:
+ """
+ Converts a word-level OCR DataFrame to the format for
+ combine_ocr_dataframes.
+
+ A simple renaming and selection of relevant columns
+
+ Args:
+ word_level_df (pd.DataFrame):
+ A DataFrame containing detailed OCR output. Must include at least
+ the columns: 'page', 'line', and 'word_text'.
+ file_name (str, optional):
+ A unique identifier or "dummy" filename to assign to the resulting
+ data. Defaults to "converted_dataframe".
+
+ Returns:
+ List[Tuple[str, pd.DataFrame]]:
+ A list containing a single tuple of (file_name, DataFrame), ready
+ to be used as input for the combine_ocr_dataframes function. The
+ DataFrame will have 'page' and 'text' columns.
+ """
+ # --- 1. Validate Input ---
+ required_columns = ["page", "line", "word_text"]
+ if not all(col in word_level_df.columns for col in required_columns):
+ raise ValueError(
+ f"Input DataFrame must contain all of the following columns: {required_columns}"
+ )
+
+ df = word_level_df.copy()
+
+ # --- 2. Process the DataFrame ---
+ # Ensure word_text is a string to allow for joining
+ df["word_text"] = df["word_text"].astype(str)
+
+ # Group by page and line number, then join the words with a space (not needed for word level search)
+ # The result is a Series with a MultiIndex (page, line)
+ # line_text_series = df.groupby(['page', 'line'])['word_text'].apply(' '.join)
+
+ # Convert the Series back to a DataFrame and reset the index
+ # line_level_df = line_text_series.reset_index()
+
+ # Rename the aggregated column from 'word_text' to the required 'text'
+ df = df.rename(columns={"word_text": "text"})
+
+ # --- 3. Finalise the structure ---
+ # We now have a DataFrame with columns [page, line, text].
+ final_df = df[["page", "text"]]
+
+ # --- 4. Package for output ---
+ # Return in the required List[Tuple[str, DataFrame]] format
+ return [(file_name, final_df)]
+
+
+def create_dataframe_from_string(
+ text_string: str,
+ file_name: str = "user_search_query",
+ page_number: int = 1,
+ split_words: bool = False,
+ split_punctuation: bool = True,
+) -> Tuple[List[Tuple[str, pd.DataFrame]], int]:
+ """
+ Converts a string into a DataFrame compatible with combine_ocr_dataframes.
+
+ Can operate in two modes:
+ 1. As a single-line document (default).
+ 2. As a multi-line document where each word from the string is a separate line.
+
+ Args:
+ text_string (str): The input text to be placed in the DataFrame.
+ file_name (str, optional): A dummy filename to assign to this text.
+ Defaults to "user_search_query".
+ page_number (int, optional): A dummy page number to assign. Defaults to 1.
+ split_words (bool, optional): If True, splits the input string by
+ whitespace and creates a row for each word.
+ If False (default), the entire string is
+ treated as a single text entry.
+ split_punctuation (bool, optional): If True, splits the 'end of sentence' punctuation off the end
+ of the search query to match the reference data.
+
+ Returns:
+ Tuple[List[Tuple[str, pd.DataFrame]], int]:
+ A list containing a single tuple: (file_name, DataFrame).
+ The DataFrame has 'page' and 'text' columns. Also, an integer value indicating the number of words in the search string.
+ Returns an empty list if the input string is empty or whitespace.
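+
+            Example (illustrative): create_dataframe_from_string("John Smith", split_words=True)
+            returns ([("user_search_query", df)], 2), where df has one row per word.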
+ """
+ # Handle empty input gracefully, this works for both modes.
+ if not text_string or not text_string.strip():
+ print("Warning: Input string is empty. Returning an empty list.")
+ return [], 0
+
+ if split_words:
+ # --- Split string into words, one per row, based on similar punctuation split technique used to create ocr_results_with_words objects ---
+ if split_punctuation is True:
+ words = split_text_with_punctuation(text_string)
+ else:
+ words = text_string.split()
+
+ len_words = len(words)
+ data = {
+ "page": [page_number]
+ * len_words, # Assign the same page number to every word
+ "text": words, # The list of words becomes the text column
+ }
+ else:
+ # --- Entire string in one row ---
+ len_words = 1
+ data = {"page": [page_number], "text": [text_string]}
+
+ # Create the DataFrame from the prepared data
+ df = pd.DataFrame(data)
+
+ df["line"] = df.index + 1
+
+ # Return it in the required format: a list containing one (name, df) tuple
+ return [(file_name, df)], len_words
+
+
+def combine_ocr_dataframes(
+ input_data: List[Tuple[str, pd.DataFrame]],
+ combine_pages: bool = True,
+ output_folder: str = OUTPUT_FOLDER,
+ output_filename: str = "combined_ocr_output.csv",
+ number_of_added_zeros: int = number_of_zeros_to_add_to_index,
+ remake_index: bool = True,
+) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
+ """
+ Combines text from multiple pandas DataFrames containing page and text columns.
+
+ This function takes a list of (name, DataFrame) tuples, processes each DataFrame
+ by grouping and concatenating text, and then combines them into a single DataFrame.
+
+ Args:
+ input_data (List[Tuple[str, pd.DataFrame]]):
+ A list of tuples, where each tuple contains a unique identifier (like a filename)
+ and a pandas DataFrame. Each DataFrame must have 'page' and 'text' columns.
+ combine_pages (bool, optional):
+ If True, text from the same page number within a file is joined into a
+ single row. If False, each line of text gets its own row with a unique
+ page identifier. Defaults to True.
+ output_folder (str, optional):
+ The folder where the combined CSV file will be saved. Defaults to OUTPUT_FOLDER.
+ output_filename (str, optional):
+ The name of the output CSV file. Defaults to "combined_ocr_output.csv".
+
+ Returns:
+        Tuple[pd.DataFrame, List[str], pd.DataFrame]:
+            A tuple containing:
+            - The final combined and processed DataFrame.
+            - A list containing the path to the saved output CSV file.
+            - The full combined DataFrame before the final column selection.
+ """
+ all_data = list()
+
+ for file_identifier, df_initial in input_data:
+ df = df_initial.copy() # Work on a copy to avoid side effects
+
+ # --- Validation ---
+ if "page" not in df.columns or "text" not in df.columns:
+ print(
+ f"Warning: Skipping data for '{file_identifier}' - missing required columns 'page' and 'text'."
+ )
+ continue
+
+ # --- Processing ---
+ df["text"] = df["text"].fillna("").astype(str)
+
+ if combine_pages:
+ # Group by page and concatenate text into a single string
+ processed_df = df.groupby("page")["text"].apply(" ".join).reset_index()
+ else:
+ if remake_index is True:
+ # # Create a unique, sortable page ID for each line without combining
+ # df['line_number_by_page'] = df.groupby('page').cumcount() + 1
+ # df['original_page'] = df['page']
+ # # Create a new page ID that combines page and line number for uniqueness
+ # df['page'] = (
+ # df['page'].astype(str).str.zfill(number_of_added_zeros) +
+ # df['line_number_by_page'].astype(str).str.zfill(number_of_added_zeros)
+ # ).astype(int)
+
+ # Define the multiplier based on the max expected lines per page.
+ # If you expect up to 99,999 lines, use 100,000.
+
+ df["line_number_by_page"] = df.groupby("page").cumcount() + 1
+ df["original_page"] = df["page"]
+
+ # Create the new combined ID using arithmetic
+ df["page"] = (df["original_page"] * ID_MULTIPLIER) + df[
+ "line_number_by_page"
+ ]
+
+ else:
+ if "index" not in df.columns:
+ df["index"] = df.index
+ df["page"] = df["index"]
+
+ processed_df = df
+
+ # Add the file identifier column
+ processed_df["file"] = file_identifier
+ all_data.append(processed_df)
+
+ if not all_data:
+ raise ValueError(
+ "No valid DataFrames were processed. Ensure input data is not empty and DataFrames have 'page' and 'text' columns."
+ )
+
+ # --- Final Combination ---
+ combined_df = pd.concat(all_data, ignore_index=True)
+
+ # Reorder columns to a standard format, dropping intermediate columns
+ final_columns = ["file", "page", "text"]
+ if "original_page" in combined_df.columns:
+ final_columns.append("original_page") # Keep for context if created
+
+ # Ensure all final columns exist before trying to select them
+ existing_final_columns = [
+ col for col in final_columns if col in combined_df.columns
+ ]
+
+ full_out_ocr_df = combined_df
+ combined_df = combined_df.copy()[existing_final_columns]
+
+ # --- Save Output ---
+ output_files = list()
+ if output_folder and output_filename:
+ # Validate path safety before creating directories and files
+ if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
+ raise ValueError(f"Unsafe output folder path: {output_folder}")
+ if not validate_path_safety(output_filename):
+ raise ValueError(f"Unsafe output filename: {output_filename}")
+
+ # Normalize and validate the output folder path before using in os.makedirs
+ normalized_output_folder = os.path.normpath(os.path.abspath(output_folder))
+ # Double-check containment after normalization
+ if not validate_folder_containment(normalized_output_folder, OUTPUT_FOLDER):
+ raise ValueError(
+ f"Unsafe normalized output folder path: {normalized_output_folder}"
+ )
+
+ # Assign the validated path back to output_folder to ensure all subsequent
+ # operations use the secure, validated value
+ output_folder = normalized_output_folder
+
+ os.makedirs(output_folder, exist_ok=True)
+ output_path = secure_path_join(output_folder, output_filename)
+ combined_df.to_csv(output_path, index=False)
+ output_files.append(output_path)
+ print(f"Successfully combined data and saved to: {output_path}")
+
+ return combined_df, output_files, full_out_ocr_df
+
+
+def combine_ocr_output_text(
+ input_files: Union[str, List[str]],
+ combine_pages: bool = True,
+ remake_index: bool = True,
+ output_folder: str = OUTPUT_FOLDER,
+) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
+ """
+ Reads multiple OCR CSV files, combines them, and saves the result.
+
+ This function serves as a wrapper that reads CSV files from paths and then
+ uses the `combine_ocr_dataframes` function to perform the combination logic.
+
+ Args:
+ input_files (Union[str, List[str]]): A single file path or a list of file paths.
+ combine_pages (bool, optional): See `combine_ocr_dataframes`. Defaults to True.
+ remake_index (bool, optional): See `combine_ocr_dataframes`. Defaults to True.
+ output_folder (str, optional): See `combine_ocr_dataframes`. Defaults to OUTPUT_FOLDER.
+
+ Returns:
+        Tuple[pd.DataFrame, pd.DataFrame, str]: The combined DataFrame, the first OCR DataFrame
+            that was read, and the path to that first file.
+ """
+ if isinstance(input_files, str):
+ file_paths_list = [input_files]
+ else:
+ file_paths_list = input_files
+
+ data_to_process = list()
+ i = 0
+ first_ocr_df = pd.DataFrame()
+ for file_path in file_paths_list:
+ try:
+ df = pd.read_csv(file_path)
+ # Convert word-level OCR to line-level if user uploaded word-level file
+ if "ocr_results_with_words" in os.path.basename(file_path) and (
+ "word_text" in df.columns and "text" not in df.columns
+ ):
+ df = word_level_ocr_df_to_line_level_ocr_df(df)
+ # Save the first OCR dataframe to save to the GUI
+ if i == 0:
+ first_ocr_df = df
+ first_ocr_df_path = file_path
+ i += 1
+ # Use the base filename as the identifier
+ file_identifier = os.path.basename(file_path)
+ data_to_process.append((file_identifier, df))
+ except FileNotFoundError:
+ print(f"Warning: File not found, skipping: {file_path}")
+ except Exception as e:
+ print(f"Warning: Failed to read or process {file_path}. Error: {e}")
+
+ if not data_to_process:
+ raise ValueError("No valid CSV files could be read or processed.")
+
+ df_combined, _, full_out_ocr_df = combine_ocr_dataframes(
+ input_data=data_to_process,
+ combine_pages=combine_pages,
+ output_folder=output_folder,
+ output_filename="combined_ocr_from_files.csv", # Specific name for this path
+ remake_index=remake_index,
+ )
+
+ # Return the combined data, plus the first OCR dataframe and its path for the GUI
+ return df_combined, first_ocr_df, first_ocr_df_path
+
+
+def clean_and_stem_text_series(df: pd.DataFrame, column: str):
+ """
+ Clean and stem text columns in a data frame
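+
+ Illustrative example (output depends on the loaded spaCy model): a cell
+ containing "The <b>cats</b> were running!" would typically yield "cat run"
+ in the new 'text_clean' column after HTML removal, stopword removal and
+ lemmatisation.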
+ """
+
+ def _clean_text(raw_text):
+ from tools.secure_regex_utils import safe_clean_text
+
+ clean = safe_clean_text(raw_text, remove_html=True)
+ clean = " ".join(clean.split())
+ # Join the cleaned words back into a string
+ return clean
+
+ # Function to apply lemmatisation and remove stopwords
+ def _apply_lemmatization(text):
+ doc = nlp(text)
+ # Keep only alphabetic tokens and remove stopwords
+ lemmatized_words = [
+ token.lemma_ for token in doc if token.is_alpha and not token.is_stop
+ ]
+ return " ".join(lemmatized_words)
+
+ df["text_clean"] = df[column].apply(_clean_text)
+
+ df["text_clean"] = df["text_clean"].apply(_apply_lemmatization)
+
+ return df
+
+
+def map_metadata_single_page(
+ similarity_df: pd.DataFrame,
+ metadata_source_df: pd.DataFrame,
+ preview_length: int = 200,
+):
+ """Helper to map metadata for single page results."""
+ metadata_df = metadata_source_df[["file", "page", "text"]]
+ results_df = similarity_df.merge(
+ metadata_df, left_on="Page1_Index", right_index=True
+ ).rename(columns={"file": "Page1_File", "page": "Page1_Page", "text": "Page1_Text"})
+ results_df = results_df.merge(
+ metadata_df, left_on="Page2_Index", right_index=True, suffixes=("_1", "_2")
+ ).rename(columns={"file": "Page2_File", "page": "Page2_Page", "text": "Page2_Text"})
+ results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)
+ final_df = results_df[
+ [
+ "Page1_File",
+ "Page1_Page",
+ "Page2_File",
+ "Page2_Page",
+ "Similarity_Score",
+ "Page1_Text",
+ "Page2_Text",
+ ]
+ ]
+ final_df = final_df.sort_values(
+ ["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"]
+ )
+ final_df["Page1_Text"] = final_df["Page1_Text"].str[:preview_length]
+ final_df["Page2_Text"] = final_df["Page2_Text"].str[:preview_length]
+ return final_df
+
+
+def map_metadata_subdocument(
+ subdocument_df: pd.DataFrame,
+ metadata_source_df: pd.DataFrame,
+ preview_length: int = 200,
+):
+ """Helper to map metadata for subdocument results."""
+ metadata_df = metadata_source_df[["file", "page", "text"]]
+
+ subdocument_df = subdocument_df.merge(
+ metadata_df, left_on="Page1_Start_Index", right_index=True
+ ).rename(
+ columns={"file": "Page1_File", "page": "Page1_Start_Page", "text": "Page1_Text"}
+ )
+ subdocument_df = subdocument_df.merge(
+ metadata_df[["page"]], left_on="Page1_End_Index", right_index=True
+ ).rename(columns={"page": "Page1_End_Page"})
+ subdocument_df = subdocument_df.merge(
+ metadata_df, left_on="Page2_Start_Index", right_index=True
+ ).rename(
+ columns={"file": "Page2_File", "page": "Page2_Start_Page", "text": "Page2_Text"}
+ )
+ subdocument_df = subdocument_df.merge(
+ metadata_df[["page"]], left_on="Page2_End_Index", right_index=True
+ ).rename(columns={"page": "Page2_End_Page"})
+
+ cols = [
+ "Page1_File",
+ "Page1_Start_Page",
+ "Page1_End_Page",
+ "Page2_File",
+ "Page2_Start_Page",
+ "Page2_End_Page",
+ "Match_Length",
+ "Page1_Text",
+ "Page2_Text",
+ ]
+
+ # Add Avg_Similarity to the output if it is present in the aggregated results
+ if "Avg_Similarity" in subdocument_df.columns:
+ subdocument_df["Avg_Similarity"] = subdocument_df["Avg_Similarity"].round(3)
+ cols.insert(7, "Avg_Similarity")
+
+ final_df = subdocument_df[cols]
+ final_df = final_df.sort_values(
+ ["Page1_File", "Page1_Start_Page", "Page2_File", "Page2_Start_Page"]
+ )
+ final_df["Page1_Text"] = final_df["Page1_Text"].str[:preview_length]
+ final_df["Page2_Text"] = final_df["Page2_Text"].str[:preview_length]
+
+ return final_df
+
+
+def save_results_and_redaction_lists(
+ final_df: pd.DataFrame, output_folder: str, combine_pages: bool = True
+) -> list:
+ """
+ Saves the main results DataFrame and generates per-file redaction lists.
+ This function is extracted to be reusable.
+
+ Args:
+ final_df (pd.DataFrame): The DataFrame containing the final match results.
+ output_folder (str): The folder to save the output files.
+ combine_pages (bool, optional): Whether the text from each page was combined into one block before matching (True), or the duplicate match was conducted line by line (False).
+
+ Returns:
+ list: A list of paths to all generated files.
+ """
+ # Validate the output_folder path for security
+ if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
+ raise ValueError(f"Invalid or unsafe output folder path: {output_folder}")
+
+ output_paths = list()
+
+ # Use secure path operations to prevent path injection
+ try:
+ output_folder_path = Path(output_folder).resolve()
+ # Validate that the resolved path is within the trusted OUTPUT_FOLDER using robust containment check
+ if not validate_folder_containment(str(output_folder_path), OUTPUT_FOLDER):
+ raise ValueError(
+ f"Output folder path {output_folder} is outside the trusted directory {OUTPUT_FOLDER}"
+ )
+ output_folder_path.mkdir(parents=True, exist_ok=True)
+ except (OSError, PermissionError) as e:
+ raise ValueError(f"Cannot create output directory {output_folder}: {e}")
+
+ if final_df.empty:
+ print("No matches to save.")
+ return []
+
+ # 1. Save the main results DataFrame using secure path operations
+ similarity_file_output_path = secure_path_join(
+ output_folder_path, "page_similarity_results.csv"
+ )
+ final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
+
+ output_paths.append(str(similarity_file_output_path))
+
+ # 2. Save per-file redaction lists
+ # Use 'Page2_File' as the source of duplicate content
+ if combine_pages is True:
+ grouping_col = "Page2_File"
+ if grouping_col not in final_df.columns:
+ print(
+ "Warning: 'Page2_File' column not found. Cannot generate redaction lists."
+ )
+ return output_paths
+
+ for redact_file, group in final_df.groupby(grouping_col):
+ # Sanitize the filename to prevent path injection
+
+ output_file_name_stem = Path(redact_file).stem
+ output_file_name = output_file_name_stem + "_pages_to_redact.csv"
+ # Use secure path operations for the output file
+ output_file_path = secure_path_join(output_folder_path, output_file_name)
+
+ all_pages_to_redact = set()
+ is_subdocument_match = "Page2_Start_Page" in group.columns
+
+ if is_subdocument_match:
+ for _, row in group.iterrows():
+ pages_in_range = range(
+ int(row["Page2_Start_Page"]), int(row["Page2_End_Page"]) + 1
+ )
+ all_pages_to_redact.update(pages_in_range)
+ else:
+ pages = group["Page2_Page"].unique()
+ all_pages_to_redact.update(pages)
+
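+ # The redaction list is written as a single headerless column of page numbers,
+ # one page per row, matching the format read back by
+ # apply_whole_page_redactions_from_list.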
+ if all_pages_to_redact:
+ redaction_df = pd.DataFrame(
+ sorted(list(all_pages_to_redact)), columns=["Page_to_Redact"]
+ )
+ redaction_df.to_csv(output_file_path, header=False, index=False)
+
+ output_paths.append(str(output_file_path))
+ print(f"Redaction list for {redact_file} saved to {output_file_name}")
+
+ return output_paths
+
+
+def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
+ """
+ Helper function to compare two sequences of tokens with punctuation flexibility.
+
+ Returns True if the sequences match according to the rules:
+ 1. An exact match is a match.
+ 2. A reference token also matches a query token if it is the query token
+ followed by a single character from PUNCTUATION_TO_STRIP. This rule does not
+ apply if the reference token consists only of punctuation.
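+
+ Example (illustrative; assumes "." and "!" are in PUNCTUATION_TO_STRIP):
+ _sequences_match(["hello", "world"], ["hello", "world."]) -> True
+ _sequences_match(["hello", "world"], ["hello", "world!!"]) -> False
+ (only a single trailing punctuation character is stripped)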
+ """
+ if len(query_seq) != len(ref_seq):
+ return False
+
+ for query_token, ref_token in zip(query_seq, ref_seq):
+ # Rule 1: Check for a direct, exact match first (most common case)
+ if query_token == ref_token:
+ continue
+
+ # Rule 2: Check for the flexible punctuation match
+ # - The reference token must be longer than 1 character
+ # - Its last character must be in our punctuation set
+ # - The token without its last character must match the query token
+ if (
+ len(ref_token) > 1
+ and ref_token[-1] in PUNCTUATION_TO_STRIP
+ and ref_token[:-1] == query_token
+ ):
+ continue
+
+ # If neither rule applies, the tokens don't match, so the sequence doesn't match.
+ return False
+
+ # If the loop completes, every token has matched.
+ return True
+
+
+def find_consecutive_sequence_matches(
+ df_filtered: pd.DataFrame,
+ search_file_name: str,
+ reference_file_name: str,
+ use_regex: bool = False,
+) -> pd.DataFrame:
+ """
+ Finds all occurrences of a consecutive sequence of tokens from a search file
+ within a larger reference file.
+
+ This function is designed for order-dependent matching, not "bag-of-words" similarity.
+
+ Args:
+ df_filtered: The DataFrame containing all tokens, with 'file' and 'text_clean' columns.
+ search_file_name: The name of the file containing the search query sequence.
+ reference_file_name: The name of the file to search within.
+ use_regex: If True, treats the search query as a regex pattern instead of literal tokens.
+
+ Returns:
+ A DataFrame with columns 'Page1_Index', 'Page2_Index' and 'Similarity_Score'
+ mapping the consecutive match, or an empty DataFrame if no match is found.
+ """
+
+ # Step 1: Isolate the data for each file
+ search_df = df_filtered[df_filtered["file"] == search_file_name]
+ reference_df = df_filtered[df_filtered["file"] == reference_file_name]
+
+ if search_df.empty or reference_df.empty:
+ print("Error: One or both files not found or are empty.")
+ return pd.DataFrame(columns=["Page1_Index", "Page2_Index"])
+
+ if use_regex:
+ # Regex mode: Extract pattern and search in combined text
+ # Get the regex pattern from the search query (should be in 'text' column, not 'text_clean')
+ # We need to get it from the original 'text' column if available, otherwise use 'text_clean'
+ if "text" in search_df.columns:
+ regex_pattern = search_df["text"].iloc[0]
+ else:
+ regex_pattern = search_df["text_clean"].iloc[0]
+
+ # Join reference tokens back into text for regex searching
+ # Use original 'text' column if available to preserve original formatting (important for emails, etc.)
+ # Otherwise fall back to 'text_clean'
+ if "text" in reference_df.columns:
+ reference_tokens = reference_df["text"].tolist()
+ else:
+ reference_tokens = reference_df["text_clean"].tolist()
+ reference_indices = reference_df.index.tolist()
+
+ # Concatenate ALL tokens into a single continuous string with smart spacing
+ # Rules:
+ # - Words are joined with single spaces
+ # - Punctuation (periods, commas, etc.) touches adjacent tokens directly (no spaces)
+ # Example: ["Hi", ".", "How", "are", "you", "?", "Great"] -> "Hi.How are you?Great"
+ # This allows regex patterns to span multiple tokens naturally while preserving word boundaries
+
+ def is_punctuation_only(token):
+ """Check if token contains only punctuation characters"""
+ if not token:
+ return False
+ # Check if all characters are punctuation (using string.punctuation or our set)
+ import string
+
+ return all(c in string.punctuation for c in token)
+
+ def starts_with_punctuation(token):
+ """Check if token starts with punctuation"""
+ if not token:
+ return False
+ import string
+
+ return token[0] in string.punctuation
+
+ def ends_with_punctuation(token):
+ """Check if token ends with punctuation"""
+ if not token:
+ return False
+ import string
+
+ return token[-1] in string.punctuation
+
+ # Build the concatenated string and position mapping
+ reference_text_parts = []
+ char_to_token_map = []
+ current_pos = 0
+
+ for idx, token in enumerate(reference_tokens):
+ # Determine if we need a space before this token
+ needs_space_before = False
+ if idx > 0: # Not the first token
+ prev_token = reference_tokens[idx - 1]
+ # Add space if:
+ # - Current token is not punctuation-only AND
+ # - Previous token is not punctuation-only AND
+ # - Previous token didn't end with punctuation AND
+ # - Current token doesn't start with punctuation
+ if (
+ not is_punctuation_only(token)
+ and not is_punctuation_only(prev_token)
+ and not ends_with_punctuation(prev_token)
+ and not starts_with_punctuation(token)
+ ):
+ needs_space_before = True
+
+ # Add space if needed
+ if needs_space_before:
+ current_pos += 1 # Account for the space
+
+ # Record token position in the concatenated string
+ token_start_in_text = current_pos
+ token_end_in_text = current_pos + len(token)
+ char_to_token_map.append(
+ (token_start_in_text, token_end_in_text, reference_indices[idx])
+ )
+
+ # Add token to the concatenated string
+ if needs_space_before:
+ reference_text_parts.append(" " + token)
+ else:
+ reference_text_parts.append(token)
+
+ # Move position forward by token length (and space if added)
+ current_pos = token_end_in_text
+
+ # Join all parts to create the final concatenated string
+ reference_text = "".join(reference_text_parts)
+
+ # Find all regex matches
+ try:
+ pattern = re.compile(regex_pattern, re.IGNORECASE)
+ matches = list(pattern.finditer(reference_text))
+ except re.error as e:
+ print(f"Error compiling regex pattern: {e}")
+ gr.Warning(f"Invalid regex pattern: {e}")
+ return pd.DataFrame(
+ columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
+ )
+
+ if not matches:
+ print("No regex matches found")
+ gr.Info("No regex matches found")
+ return pd.DataFrame(
+ columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
+ )
+
+ all_found_matches = []
+ query_index = search_df.index[0] # Use the first (and only) query index
+
+ # Optimize overlap detection for large documents
+ # Instead of checking every token for every match (O(m*n)), we can use the fact that
+ # char_to_token_map is sorted by position. For each match, we only need to check
+ # tokens that could possibly overlap.
+
+ # For each regex match found in the concatenated string:
+ # 1. Get the match's start and end character positions
+ # 2. Find all tokens whose character ranges overlap with the match
+ # 3. Include all overlapping tokens in the results
+ # This ensures patterns spanning multiple tokens are captured correctly
+
+ # Optimization: Use a set to track which tokens we've already found
+ # This prevents duplicates if multiple matches overlap the same tokens
+ found_token_indices = set()
+
+ for match in matches:
+ match_start = match.start()
+ match_end = match.end()
+
+ # Find all tokens that overlap with this match
+ # A token overlaps if: token_start < match_end AND token_end > match_start
+ # Optimization: Since char_to_token_map is sorted by start position,
+ # we can stop early once we pass match_end, but we still need to check
+ # tokens that start before match_end (they might extend into the match)
+ matching_token_indices = []
+ for token_start, token_end, token_idx in char_to_token_map:
+ # Early exit optimization: if token starts after match ends, no more overlaps possible
+ # (This works because tokens are processed in order)
+ if token_start >= match_end:
+ break
+
+ # Check if token overlaps with match (not disjoint)
+ if (
+ token_end > match_start
+ ): # token_start < match_end already checked by break above
+ matching_token_indices.append(token_idx)
+
+ # Create matches for all tokens that overlap with the regex match
+ # This ensures patterns spanning multiple tokens are captured
+ for token_idx in matching_token_indices:
+ if token_idx not in found_token_indices:
+ all_found_matches.append((query_index, token_idx, 1))
+ found_token_indices.add(token_idx)
+
+ print(
+ f"Found {len(matches)} regex match(es) spanning {len(set(idx for _, idx, _ in all_found_matches))} token(s)"
+ )
+
+ else:
+ # Original literal token matching logic
+ # Step 2: Convert the token data into lists for easy comparison.
+ # We need both the text tokens and their original global indices.
+ query_tokens = search_df["text_clean"].tolist()
+ query_indices = search_df.index.tolist()
+
+ reference_tokens = reference_df["text_clean"].tolist()
+ reference_indices = reference_df.index.tolist()
+
+ query_len = len(query_tokens)
+ all_found_matches = list()
+
+ print(f"Searching for a sequence of {query_len} tokens...")
+
+ # Step 3: Use a "sliding window" to search for the query sequence in the reference list.
+ for i in range(len(reference_tokens) - query_len + 1):
+ # The "window" is a slice of the reference list that is the same size as the query
+ window = reference_tokens[i : i + query_len]
+
+ # Step 4: If the window matches the query with or without punctuation on end
+ if _sequences_match(query_tokens, window):
+
+ # Get the global indices for this entire matching block
+ matching_reference_indices = reference_indices[i : i + query_len]
+
+ # Create the mapping between query indices and the found reference indices
+ for j in range(query_len):
+ all_found_matches.append(
+ (query_indices[j], matching_reference_indices[j], 1)
+ )
+
+ # If you only want the *first* match, you can uncomment the next line:
+ # break
+
+ if not all_found_matches:
+ print("No matches found")
+ gr.Info("No matches found")
+ return pd.DataFrame(columns=["Page1_Index", "Page2_Index", "Similarity_Score"])
+
+ # Step 5: Create the final DataFrame in the desired format
+ result_df = pd.DataFrame(
+ all_found_matches, columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
+ )
+ return result_df
+
+
+def identify_similar_text_sequences(
+ df_combined: pd.DataFrame,
+ similarity_threshold: float = 1,
+ min_word_count: int = 1,
+ min_consecutive_pages: int = 1,
+ greedy_match: bool = True,
+ combine_pages: bool = False,
+ inter_file_only: bool = False,
+ do_text_clean: bool = True,
+ file1_name: str = "",
+ file2_name: str = "",
+ output_folder: str = OUTPUT_FOLDER,
+ use_regex: bool = False,
+ progress=Progress(track_tqdm=True),
+) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
+ """
+ Identifies similar pages. Uses a highly optimized path for inter_file_only=True.
+ """
+ progress(0.1, desc="Processing and filtering text")
+
+ if do_text_clean:
+ df = clean_and_stem_text_series(
+ df_combined, "text"
+ ) # Will produce the column 'text_clean'
+ else:
+ df = df_combined.copy()
+ df["text_clean"] = df[
+ "text"
+ ].str.lower() # .str.replace(r'[^\w\s]', '', regex=True)
+
+ df["word_count"] = df["text_clean"].str.split().str.len().fillna(0)
+ # df['word_count'] = pd.to_numeric(df['word_count'], errors='coerce').fillna(0).astype('int64')
+
+ # ensure min_word_count is an int (e.g., from Gradio/text input)
+ try:
+ min_word_count = int(min_word_count)
+ except (TypeError, ValueError):
+ min_word_count = 0 # fall back to no minimum word count if the value cannot be parsed
+
+ original_row_count = len(df)
+ df_filtered = df[df["word_count"] >= min_word_count].copy()
+ df_filtered.reset_index(drop=True, inplace=True)
+
+ print(
+ f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words."
+ )
+ if len(df_filtered) < 2:
+ return pd.DataFrame(), [], df_combined
+
+ # Similarity calculated differently if comparing between files only (inter_file_only==True), or within the same file
+ if inter_file_only:
+
+ progress(0.2, desc="Finding direct text matches...")
+
+ # base_similarity_df = _debug_similarity_between_two_files(df_filtered, vectorizer, similarity_threshold, file1_name, file2_name)
+ base_similarity_df = find_consecutive_sequence_matches(
+ df_filtered, file1_name, file2_name, use_regex=use_regex
+ )
+ if base_similarity_df.empty:
+ return pd.DataFrame(), [], df_combined
+
+ else:
+ # Use the original, simpler path for all-to-all comparisons (including intra-file).
+ vectorizer = TfidfVectorizer()
+ print("Standard Path: Calculating all-to-all similarity.")
+ progress(0.2, desc="Vectorising text...")
+ tfidf_matrix = vectorizer.fit_transform(df_filtered["text_clean"])
+
+ progress(0.3, desc="Calculating similarity matrix...")
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
+ coo_matrix = similarity_matrix.tocoo()
+
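+ # Keep only the upper triangle (r < c) so each pair is reported once and
+ # self-similarity on the diagonal is excluded, retaining pairs at or above the threshold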
+ similar_pages = [
+ (r, c, v)
+ for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
+ if r < c and v >= similarity_threshold
+ ]
+
+ if not similar_pages:
+ return pd.DataFrame(), [], df_combined
+
+ base_similarity_df = pd.DataFrame(
+ similar_pages, columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
+ )
+
+ progress(0.7, desc="Aggregating results based on matching strategy")
+
+ if greedy_match or min_consecutive_pages > 1:
+ # Sort the dataframe to ensure consecutive pages are adjacent
+ similarity_df = base_similarity_df
+
+ # A new sequence starts if the difference from the previous row is not (1, 1)
+ # is_consecutive will be True if a row continues the sequence, False if it's a new one.
+ is_consecutive = (similarity_df["Page1_Index"].diff() == 1) & (
+ similarity_df["Page2_Index"].diff() == 1
+ )
+
+ # Use cumsum() on the inverted boolean series to create a unique ID for each block.
+ # Every time a 'False' appears (a new block starts), the sum increases.
+ block_id = is_consecutive.eq(False).cumsum()
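+ # Worked example (hypothetical indices, considering Page1_Index only for brevity):
+ # if is_consecutive is [False, True, True, False, True], then
+ # is_consecutive.eq(False) is [True, False, False, True, False], whose cumsum
+ # gives block_id [1, 1, 1, 2, 2], i.e. two separate consecutive runs.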
+
+ # Group by this block ID
+ grouped = similarity_df.groupby(block_id)
+
+ # Aggregate each group to get the start, end, and length of the match
+ agg_results = grouped.agg(
+ Page1_Start_Index=("Page1_Index", "first"),
+ Page2_Start_Index=("Page2_Index", "first"),
+ Page1_End_Index=("Page1_Index", "last"),
+ Page2_End_Index=("Page2_Index", "last"),
+ Match_Length=("Page1_Index", "size"),
+ Avg_Similarity=("Similarity_Score", "mean"),
+ ).reset_index(drop=True)
+
+ # If greedy_match=True, we keep all matches. If min_consecutive_pages > 1, we filter.
+ if greedy_match and min_consecutive_pages <= 1:
+ subdocument_df = agg_results
+ else:
+ # This handles the case for min_consecutive_pages > 1
+ subdocument_df = agg_results[
+ agg_results["Match_Length"] >= min_consecutive_pages
+ ].copy()
+
+ if subdocument_df.empty:
+ gr.Info("No matches found")
+ return pd.DataFrame(), [], df_combined
+
+ final_df = map_metadata_subdocument(subdocument_df, df_filtered)
+ else:
+ print("Finding single page matches, not greedy (min_consecutive_pages=1)")
+ # This branch handles the non-sequential, single-page matching case
+ final_df = map_metadata_single_page(base_similarity_df, df_filtered)
+ # subdocument_df = final_df # To align variable names for saving
+
+ if final_df.empty:
+ gr.Info("No matches found")
+ return pd.DataFrame(), [], df_combined
+
+ progress(0.9, desc="Saving output files")
+
+ output_paths = save_results_and_redaction_lists(
+ final_df, output_folder, combine_pages
+ )
+
+ gr.Info(f"Found {final_df.shape[0]} match(es)")
+ print(f"Found {final_df.shape[0]} match(es)")
+
+ return final_df, output_paths, df_combined
+
+
+def handle_selection_and_preview(
+ evt: gr.SelectData, results_df: pd.DataFrame, full_duplicate_data_by_file: dict
+):
+ """
+ This single function handles a user selecting a row. It:
+ 1. Determines the selected row index.
+ 2. Calls the show_page_previews function to get the text data.
+ 3. Returns all the necessary outputs for the UI.
+ """
+ # If the user deselects, the event might be None.
+ if not evt:
+ return None, None, None # Clear state and both preview panes
+
+ # 1. Get the selected index
+ selected_index = evt.index[0]
+
+ # 2. Get the preview data
+ page1_data, page2_data = show_page_previews(
+ full_duplicate_data_by_file, results_df, evt
+ )
+
+ # 3. Return all three outputs in the correct order
+ return selected_index, page1_data, page2_data
+
+
+def exclude_match(
+ results_df: pd.DataFrame,
+ selected_index: int,
+ output_folder=OUTPUT_FOLDER,
+ current_document_name: str | None = None,
+):
+ """
+ Removes a selected row from the results DataFrame, regenerates output files,
+ clears the text preview panes, and returns the updated list of duplicate
+ page numbers so duplicate_pages_list_state stays in sync for Apply.
+ """
+ if selected_index is None:
+ gr.Warning("No match selected. Please click on a row in the table first.")
+ pages_list = extract_duplicate_page_list(results_df, current_document_name)
+ return results_df, gr.update(), None, None, pages_list
+
+ if results_df.empty:
+ gr.Warning("No duplicate page results found, nothing to exclude.")
+ return results_df, gr.update(), None, None, []
+
+ # Drop the selected row
+ updated_df = results_df.drop(selected_index).reset_index(drop=True)
+
+ # Recalculate all output files using the helper function
+ new_output_paths = save_results_and_redaction_lists(updated_df, output_folder)
+
+ # Keep duplicate_pages_list_state in sync with the updated results
+ updated_pages_list = extract_duplicate_page_list(
+ updated_df, current_document_stem=current_document_name
+ )
+
+ gr.Info(f"Match at row {selected_index} excluded. Output files have been updated.")
+
+ return updated_df, new_output_paths, None, None, updated_pages_list
+
+
+def extract_duplicate_page_list(
+ results_df: pd.DataFrame,
+ current_document_stem: str | None = None,
+) -> list[int]:
+ """
+ Extract the list of Page2 (duplicate) page numbers to redact from the
+ duplicate results DataFrame. Used to populate state for
+ apply_whole_page_redactions_from_list when running in the upload flow.
+
+ Args:
+ results_df: DataFrame from identify_similar_text_sequences (has Page2_*
+ columns).
+ current_document_stem: If provided, only include pages for this file
+ (Page2_File == stem). If None, include all Page2 pages when a single
+ file is present, else all.
+
+ Returns:
+ Sorted list of 1-based page numbers to redact (duplicate side).
+ """
+ if results_df is None or results_df.empty:
+ return []
+ df = results_df
+ if current_document_stem and "Page2_File" in df.columns:
+ stem = Path(current_document_stem).stem
+ df = df[df["Page2_File"].astype(str) == str(stem)]
+ if df.empty:
+ return []
+ pages = set()
+ if "Page2_Start_Page" in df.columns and "Page2_End_Page" in df.columns:
+ for _, row in df.iterrows():
+ try:
+ start_p, start_line = _parse_page_line_id(row["Page2_Start_Page"])
+ end_p, end_line = _parse_page_line_id(row["Page2_End_Page"])
+ # When value < ID_MULTIPLIER, _parse_page_line_id returns (0, value); treat as plain page number
+ if start_p == 0 and start_line > 0:
+ start_p = start_line
+ if end_p == 0 and end_line > 0:
+ end_p = end_line
+ for p in range(start_p, end_p + 1):
+ pages.add(p)
+ except (ValueError, TypeError, KeyError):
+ pass
+ elif "Page2_Page" in df.columns:
+ for _, row in df.iterrows():
+ try:
+ pages.add(int(row["Page2_Page"]))
+ except (ValueError, TypeError, KeyError):
+ pass
+ return sorted(pages)
+
+
+def run_duplicate_analysis(
+ files: list[str] | pd.DataFrame,
+ threshold: float,
+ min_words: int,
+ min_consecutive: int,
+ greedy_match: bool,
+ all_page_line_level_ocr_results_df_base: pd.DataFrame,
+ ocr_df_paths_list: list[str],
+ combine_pages: bool = True,
+ current_document_name: str | None = None,
+ output_folder: str = OUTPUT_FOLDER,
+ preview_length: int = 500,
+ progress=gr.Progress(track_tqdm=True),
+):
+ """
+ Main wrapper function to orchestrate the duplicate page analysis process.
+ It handles file loading, text combination, similarity identification,
+ and result saving.
+
+ Args:
+ files (list[str] | pd.DataFrame): A list of file paths (PDFs, etc.) to be analyzed for duplicate content, or a dataframe containing the OCR results.
+ threshold (float): The similarity threshold (0.0 to 1.0) above which text segments are considered duplicates.
+ min_words (int): The minimum number of words a text segment must contain to be included in the analysis.
+ min_consecutive (int): The minimum number of consecutive pages that must match for a sequence to be considered a duplicate.
+ greedy_match (bool): If True, uses a greedy matching strategy for identifying consecutive sequences.
+ combine_pages (bool, optional): If True, text from multiple pages is combined into larger segments for analysis. Defaults to True.
+ all_page_line_level_ocr_results_df_base (pd.DataFrame): The base dataframe containing the OCR results.
+ ocr_df_paths_list (list[str]): A list of file paths to the OCR results.
+ current_document_name (str | None, optional): When files is a DataFrame, used as the stem for saved filenames (e.g. from doc_file_name_with_extension_textbox). If None or empty, falls back to "uploaded_ocr".
+ output_folder (str, optional): The directory where the similarity results and redaction lists will be saved. Defaults to OUTPUT_FOLDER.
+ preview_length (int, optional): The maximum number of characters to display in the text preview panes. Defaults to 500.
+ progress (gr.Progress, optional): A Gradio progress tracker object to display progress in the UI.
+ """
+
+ if isinstance(files, pd.DataFrame) and files.empty:
+ raise Warning("No data found in the uploaded DataFrame.")
+ elif isinstance(files, pd.DataFrame) and not files.empty:
+ pass
+ elif not files:
+ raise Warning("Please upload files to analyse.")
+
+ if isinstance(files, str):
+ files = [files]
+
+ if not ocr_df_paths_list:
+ ocr_df_paths_list = []
+
+ # Count "files" correctly: list length for file paths, or unique file count / 1 for DataFrame
+ if isinstance(files, list):
+ num_files = len(files)
+ elif isinstance(files, pd.DataFrame):
+ num_files = files["file"].nunique() if "file" in files.columns else 1
+ else:
+ num_files = 0
+ if num_files > MAX_SIMULTANEOUS_FILES:
+ out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+ print(out_message)
+ raise Exception(out_message)
+
+ start_time = time.time()
+
+ task_textbox = "deduplicate"
+
+ progress(0, desc="Combining input files...")
+ if isinstance(files, list):
+ df_combined, first_ocr_df, first_ocr_df_path = combine_ocr_output_text(
+ files, combine_pages=combine_pages, output_folder=output_folder
+ )
+ elif isinstance(files, pd.DataFrame):
+ # Build data_to_process in same format as combine_ocr_output_text:
+ # List[Tuple[str, pd.DataFrame]] with each DataFrame having 'page' and 'text'.
+ if files.empty:
+ raise Warning("No data found in the uploaded DataFrame.")
+ if "page" not in files.columns or "text" not in files.columns:
+ raise Warning(
+ "DataFrame must have 'page' and 'text' columns (same as OCR data from file paths)."
+ )
+ if "file" in files.columns:
+ data_to_process = [
+ (str(file_id), group_df)
+ for file_id, group_df in files.groupby("file", sort=False)
+ ]
+ else:
+ # Use current document name (e.g. from doc_file_name_with_extension_textbox) for save filenames
+ name = (current_document_name or "").strip()
+ stem = Path(name).stem if name else "uploaded_ocr"
+ if not stem or not validate_path_safety(stem + ".csv"):
+ stem = "uploaded_ocr"
+ data_to_process = [(stem, files.copy())]
+ first_ocr_df = data_to_process[0][1]
+ first_ocr_df_path = "" # No path when input is in-memory DataFrame
+ # When single-file DataFrame, use document stem for combined CSV filename so it matches the doc
+ single_stem = (
+ data_to_process[0][0]
+ if len(data_to_process) == 1 and data_to_process[0][0] != "uploaded_ocr"
+ else None
+ )
+ combined_filename = (
+ f"{single_stem}_combined_ocr.csv"
+ if single_stem
+ else "combined_ocr_from_files.csv"
+ )
+ df_combined, _, full_out_ocr_df = combine_ocr_dataframes(
+ input_data=data_to_process,
+ combine_pages=combine_pages,
+ output_folder=output_folder,
+ output_filename=combined_filename,
+ remake_index=True,
+ )
+ else:
+ raise TypeError(
+ "files must be a list of file paths or a pandas DataFrame with 'page' and 'text' columns."
+ )
+
+ # Replace current OCR app components if currently empty
+ if all_page_line_level_ocr_results_df_base.empty:
+ all_page_line_level_ocr_results_df_base = first_ocr_df
+ if first_ocr_df_path and first_ocr_df_path not in ocr_df_paths_list:
+ ocr_df_paths_list.append(first_ocr_df_path)
+
+ if df_combined.empty:
+ raise Warning("No data found in the uploaded files.")
+
+ # Call the main analysis function with the new parameter
+ results_df, output_paths, full_df = identify_similar_text_sequences(
+ df_combined=df_combined,
+ similarity_threshold=threshold,
+ min_word_count=min_words,
+ min_consecutive_pages=int(min_consecutive),
+ greedy_match=greedy_match,
+ combine_pages=combine_pages,
+ output_folder=output_folder,
+ progress=progress,
+ )
+
+ full_df["text"] = full_df["text"].astype(str)
+
+ # Clip text to the first preview_length characters for preview display
+ full_df["text"] = full_df["text"].str[:preview_length]
+
+ # Preprocess full_data (without preview text) for fast access (run once)
+ full_data_by_file = {
+ file: df.sort_values("page").set_index("page")
+ for file, df in full_df.drop(["text_clean"], axis=1).groupby("file")
+ }
+
+ if results_df.empty:
+ gr.Info("No duplicate pages found, no results returned.")
+
+ # List of duplicate (Page2) page numbers for apply_whole_page_redactions_from_list
+ duplicate_pages_list = extract_duplicate_page_list(
+ results_df,
+ current_document_stem=current_document_name,
+ )
+
+ end_time = time.time()
+ processing_time = round(end_time - start_time, 1)
+
+ return (
+ results_df,
+ output_paths,
+ full_data_by_file,
+ processing_time,
+ task_textbox,
+ all_page_line_level_ocr_results_df_base,
+ ocr_df_paths_list,
+ duplicate_pages_list,
+ )
+
+
+def show_page_previews(
+ full_data_by_file: dict,
+ results_df: pd.DataFrame,
+ evt: gr.SelectData,
+ preview_length: int = 500,
+):
+ """
+ Optimized version using pre-partitioned and indexed full_data.
+ Triggered when a user selects a row in the results DataFrame.
+ """
+ if not full_data_by_file or results_df is None or not evt:
+ return None, None
+
+ selected_row = results_df.iloc[evt.index[0], :]
+
+ is_subdocument_match = "Page1_Start_Page" in selected_row
+
+ if is_subdocument_match:
+ file1, start1, end1 = (
+ selected_row["Page1_File"],
+ selected_row["Page1_Start_Page"],
+ selected_row["Page1_End_Page"],
+ )
+ file2, start2, end2 = (
+ selected_row["Page2_File"],
+ selected_row["Page2_Start_Page"],
+ selected_row["Page2_End_Page"],
+ )
+
+ page1_data = full_data_by_file[file1].loc[start1:end1, ["text"]].reset_index()
+ page2_data = full_data_by_file[file2].loc[start2:end2, ["text"]].reset_index()
+
+ else:
+ file1, page1 = selected_row["Page1_File"], selected_row["Page1_Page"]
+ file2, page2 = selected_row["Page2_File"], selected_row["Page2_Page"]
+
+ page1_data = full_data_by_file[file1].loc[[page1], ["text"]].reset_index()
+ page2_data = full_data_by_file[file2].loc[[page2], ["text"]].reset_index()
+
+ page1_data["text"] = page1_data["text"].astype(str)
+ page2_data["text"] = page2_data["text"].astype(str)
+
+ page1_data["text"] = page1_data["text"].str[:preview_length]
+ page2_data["text"] = page2_data["text"].str[:preview_length]
+
+ return page1_data[["page", "text"]], page2_data[["page", "text"]]
+
+
+def get_page_image_info(page_num: int, page_sizes: List[Dict]) -> Optional[Dict]:
+ """
+ Finds and returns the size and path information for a specific page.
+ Compares page numbers as integers so that string/numpy types match.
+ """
+ try:
+ page_num_int = int(page_num)
+ except (TypeError, ValueError):
+ return None
+ for size in page_sizes:
+ try:
+ if int(size.get("page")) == page_num_int:
+ return size
+ except (TypeError, ValueError, KeyError):
+ continue
+ return None
+
+
+def add_new_annotations_to_existing_page_annotations(
+ all_annotations: List[Dict], image_path: str, new_annotation_boxes: List[Dict]
+) -> Tuple[List[Dict], Dict]:
+ """
+ Adds a list of new annotation boxes to the annotations for a specific page.
+
+ If the page already has annotations, it extends the list of boxes. If not,
+ it creates a new entry for the page.
+
+ Args:
+ all_annotations (List[Dict]): The current list of all annotation groups.
+ image_path (str): The identifier for the image/page.
+ new_annotation_boxes (List[Dict]): A list of new annotation boxes to add.
+
+ Returns:
+ Tuple[List[Dict], Dict]: A tuple containing:
+ - The updated list of all annotation groups.
+ - The annotation group representing the newly added boxes.
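+
+ Example structure (illustrative values):
+ all_annotations = [
+ {"image": "outputs/page_1.png", "boxes": [{"label": "Whole page", ...}]},
+ ]
+ Adding boxes for an image already present extends that group's "boxes" list;
+ otherwise a new {"image": ..., "boxes": [...]} group is appended.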
+ """
+ # Find the annotation group for the current page/image
+ current_page_group = next(
+ (
+ annot_group
+ for annot_group in all_annotations
+ if annot_group["image"] == image_path
+ ),
+ None,
+ )
+
+ if current_page_group:
+ # Page already has annotations, so extend the list with the new boxes
+ current_page_group["boxes"].extend(new_annotation_boxes)
+ else:
+ # This is the first set of annotations for this page, create a new group
+ new_group = {"image": image_path, "boxes": new_annotation_boxes}
+ all_annotations.append(new_group)
+
+ # This object represents all annotations that were just added for this page
+ newly_added_annotation_group = {"image": image_path, "boxes": new_annotation_boxes}
+
+ return all_annotations, newly_added_annotation_group
+
+
+def apply_whole_page_redactions_from_list(
+ duplicate_page_numbers_df_or_list: pd.DataFrame | list[str],
+ doc_file_name_with_extension_textbox: str,
+ review_file_state: pd.DataFrame,
+ duplicate_output_paths: list[str],
+ pymupdf_doc: pymupdf.Document,
+ page_sizes: list[dict],
+ all_existing_annotations: list[dict],
+ combine_pages: bool = True,
+ new_annotations_with_bounding_boxes: Optional[List[dict]] = None,
+ review_file_path: str = "",
+):
+ """
+ This function applies redactions to whole pages based on a provided list of duplicate page numbers. It supports two modes of operation: combining pages and not combining pages. When combining pages is enabled, it attempts to identify duplicate pages across different files and applies redactions accordingly. If combining pages is disabled, it relies on new annotations with bounding boxes to determine which pages to redact. The function utilises a PyMuPDF document object to manipulate the PDF file, and it also considers the sizes of pages to ensure accurate redaction application.
+
+ Args:
+ duplicate_page_numbers_df_or_list: A DataFrame or list containing page numbers identified as duplicates (supports both legacy DataFrame and new Dropdown list format).
+ doc_file_name_with_extension_textbox (str): The name of the document file with its extension.
+ review_file_state (pd.DataFrame): The current state of the review file.
+ duplicate_output_paths (list[str]): A list of paths to files containing duplicate page information.
+ pymupdf_doc (object): A PyMuPDF document object representing the PDF file.
+ page_sizes (list[dict]): A list of dictionaries containing page size information.
+ all_existing_annotations (list[dict]): A list of all existing annotations in the document.
+ combine_pages (bool, optional): A flag indicating whether to combine pages for redaction. Defaults to True.
+ new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to None, which is treated as an empty list.
+ review_file_path (str, optional): Path to the review PDF file. If pymupdf_doc is not available (e.g. after redaction the handle was released), the document is loaded from this path.
+ """
+
+ if all_existing_annotations is None:
+ all_existing_annotations = list()
+
+ if new_annotations_with_bounding_boxes is None:
+ new_annotations_with_bounding_boxes = list()
+
+ # Gradio File component can pass a single path (str) or list of paths; normalize to list
+ if isinstance(duplicate_output_paths, str):
+ duplicate_output_paths = (
+ [duplicate_output_paths] if duplicate_output_paths else []
+ )
+ elif duplicate_output_paths is None:
+ duplicate_output_paths = []
+
+ all_annotations = all_existing_annotations.copy()
+
+ list_whole_pages_to_redact = list()
+
+ if combine_pages is True:
+ # Get list of pages to redact from either dataframe, list, or file
+ # Handle both DataFrame (legacy) and list (new Dropdown format)
+ if isinstance(duplicate_page_numbers_df_or_list, pd.DataFrame):
+ if not duplicate_page_numbers_df_or_list.empty:
+ df = duplicate_page_numbers_df_or_list
+ # Only redact Page2 (duplicate) pages, not Page1 (original)
+ page_cols = [
+ c
+ for c in (
+ "Page2_Start_Page",
+ "Page2_End_Page",
+ )
+ if c in df.columns
+ ]
+ if page_cols:
+ list_whole_pages_to_redact = []
+ if (
+ "Page2_Start_Page" in df.columns
+ and "Page2_End_Page" in df.columns
+ ):
+ for _, row in df.iterrows():
+ try:
+ start_p = int(row["Page2_Start_Page"])
+ end_p = int(row["Page2_End_Page"])
+ list_whole_pages_to_redact.extend(
+ range(start_p, end_p + 1)
+ )
+ except (ValueError, TypeError):
+ pass
+ else:
+ for col in page_cols:
+ list_whole_pages_to_redact.extend(
+ df[col].dropna().astype(int).tolist()
+ )
+ else:
+ list_whole_pages_to_redact = df.iloc[:, 0].tolist()
+ elif (
+ isinstance(duplicate_page_numbers_df_or_list, list)
+ and len(duplicate_page_numbers_df_or_list) > 0
+ ):
+ # Dropdown component returns a list directly
+ try:
+ # Try to convert to integers for page numbers
+ list_whole_pages_to_redact = [
+ int(item) for item in duplicate_page_numbers_df_or_list if item
+ ]
+ except (ValueError, TypeError):
+ # Fall back to string list if conversion fails
+ list_whole_pages_to_redact = [
+ str(item) for item in duplicate_page_numbers_df_or_list if item
+ ]
+ elif duplicate_output_paths:
+ expected_duplicate_pages_to_redact_name = (
+ f"{doc_file_name_with_extension_textbox}"
+ )
+ # Substitute out '_for_review' from the expected filename to successfully modify existing redactions files
+ expected_duplicate_pages_to_redact_name = (
+ expected_duplicate_pages_to_redact_name.replace(
+ "_redactions_for_review", ""
+ ).replace(".pdf", "")
+ )
+
+ whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
+ chosen_output_file = None
+
+ # Prefer *_pages_to_redact.csv (only Page2/duplicate pages); avoid using
+ # page_similarity_results.csv as first match, which would otherwise add
+ # both Page1 and Page2 and redact the wrong pages.
+ PAGES_TO_REDACT_SUFFIX = "_pages_to_redact"
+ for output_file in duplicate_output_paths:
+ if isinstance(output_file, str):
+ file_name_from_path = os.path.basename(output_file)
+ else:
+ file_name_from_path = getattr(
+ output_file, "name", str(output_file).split(os.sep)[-1]
+ )
+ file_name_from_path = file_name_from_path.replace(
+ "_redactions_for_review", ""
+ )
+
+ if expected_duplicate_pages_to_redact_name in file_name_from_path:
+ if PAGES_TO_REDACT_SUFFIX in file_name_from_path:
+ whole_pages_list = pd.read_csv(output_file, header=None)
+ chosen_output_file = output_file
+ break
+ if chosen_output_file is None:
+ chosen_output_file = output_file
+ if whole_pages_list.empty and chosen_output_file is not None:
+ whole_pages_list = pd.read_csv(chosen_output_file, header=None)
+
+ if not whole_pages_list.empty:
+ output_file = chosen_output_file
+ # Support both formats:
+ # 1) page_similarity_results.csv: has header row. Only Page2_* columns
+ # are the duplicate pages to redact; Page1_* is the "original" and
+ # must not be redacted (same semantics as _pages_to_redact.csv).
+ # 2) *_pages_to_redact.csv: no header, first column is page numbers
+ first_col = whole_pages_list.iloc[:, 0]
+ if (
+ len(whole_pages_list.columns) >= 6
+ and first_col.iloc[0] == "Page1_File"
+ ):
+ # Likely page_similarity_results.csv with header in first row
+ df_with_header = pd.read_csv(output_file, header=0)
+ # Only use Page2 columns (duplicate side); do not redact Page1 (original)
+ page_cols = [
+ c
+ for c in (
+ "Page2_Start_Page",
+ "Page2_End_Page",
+ )
+ if c in df_with_header.columns
+ ]
+ if page_cols:
+ list_whole_pages_to_redact = []
+ if (
+ "Page2_Start_Page" in df_with_header.columns
+ and "Page2_End_Page" in df_with_header.columns
+ ):
+ for _, row in df_with_header.iterrows():
+ try:
+ start_p = int(row["Page2_Start_Page"])
+ end_p = int(row["Page2_End_Page"])
+ list_whole_pages_to_redact.extend(
+ range(start_p, end_p + 1)
+ )
+ except (ValueError, TypeError):
+ pass
+ else:
+ for col in page_cols:
+ list_whole_pages_to_redact.extend(
+ df_with_header[col].dropna().astype(int).tolist()
+ )
+ else:
+ list_whole_pages_to_redact = first_col.tolist()
+ else:
+ list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
+ else:
+ message = "No relevant list of whole pages to redact found."
+ print(message)
+ gr.Info(message)
+ return review_file_state, all_annotations
+
+ list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
+
+ # When called from "Apply match" with combine_pages=True, the page list may come from
+ # in_fully_redacted_list_state (upload), which can be empty. Fall back to deriving
+ # pages from new_annotations_with_bounding_boxes so suggested duplicate pages are applied.
+ if not list_whole_pages_to_redact and new_annotations_with_bounding_boxes:
+ from tools.secure_regex_utils import safe_extract_page_number_from_path
+
+ list_whole_pages_to_redact = list()
+ for annotation in new_annotations_with_bounding_boxes:
+ page_num = safe_extract_page_number_from_path(annotation.get("image"))
+ if page_num is not None:
+ list_whole_pages_to_redact.append(page_num + 1)
+ else:
+ img = annotation.get("image", "")
+ print(f"Warning: Could not extract page number from {img!r}")
+ list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
+
+ else:
+ if not new_annotations_with_bounding_boxes:
+ message = "No new annotations to add"
+ print(message)
+ gr.Info(message)
+ return review_file_state, all_annotations
+
+ list_whole_pages_to_redact = list()
+ for annotation in new_annotations_with_bounding_boxes:
+ from tools.secure_regex_utils import safe_extract_page_number_from_path
+
+ page_num = safe_extract_page_number_from_path(annotation["image"])
+ if page_num is not None:
+ page = page_num + 1
+ list_whole_pages_to_redact.append(page)
+ else:
+ print(
+ f"Warning: Could not extract page number from {annotation['image']}"
+ )
+
+ list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
+
+ new_annotations = list()
+ # Process each page for redaction (page may be int or string label/filename)
+ for page in list_whole_pages_to_redact:
+ try:
+ page_num = _normalize_page_to_int(page)
+ if page_num is None:
+ print(f"Warning: Could not parse page number from {page!r}, skipping.")
+ continue
+ page_index = page_num - 1
+ if not (0 <= page_index < len(pymupdf_doc)):
+ print(f"Page {page_num} is out of bounds, skipping.")
+ continue
+
+ page_info = get_page_image_info(page_num, page_sizes)
+ if not page_info:
+ print(f"Page {page_num} not found in page_sizes, skipping.")
+ continue
+
+ image_path = page_info["image_path"]
+ page_annotation_group = next(
+ (g for g in all_annotations if g["image"] == image_path), None
+ )
+ if page_annotation_group and any(
+ box["label"] == "Whole page" for box in page_annotation_group["boxes"]
+ ):
+ print(
+ f"Whole page redaction for page {page_num} already exists, skipping."
+ )
+ continue
+
+ boxes_to_add = list()
+
+ pymupdf_page = pymupdf_doc[page_index]
+
+ if combine_pages is True:
+ whole_page_box = redact_whole_pymupdf_page(
+ rect_height=page_info["cropbox_height"],
+ rect_width=page_info["cropbox_width"],
+ page=pymupdf_page,
+ border=0.005,
+ redact_pdf=False,
+ )
+ boxes_to_add.append(whole_page_box)
+ else:
+ # Find the specific annotation group that matches the current page's image path
+ relevant_box_group = next(
+ (
+ group
+ for group in new_annotations_with_bounding_boxes
+ if group.get("image") == image_path
+ ),
+ None, # Default to None if no match is found
+ )
+
+ # Check if we found a matching group of boxes for this page
+ if relevant_box_group:
+ boxes_to_add.extend(relevant_box_group["boxes"])
+ else:
+ # This case would be unexpected, but it's good to handle.
+ # It means a page was in list_whole_pages_to_redact but had no
+ # corresponding boxes generated in new_annotations_with_bounding_boxes.
+ print(
+ f"Warning: No new annotation boxes found for page {page_num} ({image_path})."
+ )
+
+ # === Use the modified helper function to add a LIST of boxes ===
+ all_annotations, new_annotations_for_page = (
+ add_new_annotations_to_existing_page_annotations(
+ all_annotations=all_annotations,
+ image_path=image_path,
+ new_annotation_boxes=boxes_to_add, # Pass the list here
+ )
+ )
+
+ new_annotations_for_page = fill_missing_box_ids_each_box(
+ new_annotations_for_page
+ )
+ new_annotations.append(new_annotations_for_page)
+
+ except Exception as e:
+ print(f"Error processing page {page}: {str(e)}")
+ continue
+
+ whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)
+
+ if whole_page_review_file.empty:
+ if list_whole_pages_to_redact:
+ # We had pages to redact but none were applied (e.g. all skipped by get_page_image_info)
+ message = (
+ "No new whole page redactions were added. The duplicate page list had "
+ f"{len(list_whole_pages_to_redact)} page(s), but none matched the current document's "
+ "page layout. Ensure the PDF under review is the same document (or the duplicate side) "
+ "that was analysed, and that the Review tab has loaded the document so page_sizes is set."
+ )
+ else:
+ message = "No new whole page redactions were added."
+ print(message)
+ gr.Info(message)
+
+ if pymupdf_doc is not None:
+ try:
+ pymupdf_doc.close()
+ except Exception:
+ pass
+ return review_file_state, all_annotations
+
+ expected_cols = [
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "text",
+ "id",
+ ]
+ for col in expected_cols:
+ if col not in review_file_state.columns:
+ review_file_state[col] = pd.NA
+ if col not in whole_page_review_file.columns:
+ whole_page_review_file[col] = pd.NA
+
+ # Avoid concat with empty/all-NA to prevent FutureWarning (pandas 2.1+)
+ if review_file_state.empty:
+ review_file_out = whole_page_review_file.copy()
+ else:
+ # Work on copies and give all-NA columns explicit dtype for unambiguous concat
+ state = review_file_state.copy()
+ new_part = whole_page_review_file.copy()
+ for col in expected_cols:
+ if col in state.columns and state[col].isna().all():
+ state[col] = state[col].astype(object)
+ if col in new_part.columns and new_part[col].isna().all():
+ new_part[col] = new_part[col].astype(object)
+ review_file_out = pd.concat([state, new_part], ignore_index=True)
+ review_file_out = review_file_out.sort_values(
+ by=["page", "ymin", "xmin"]
+ ).reset_index(drop=True)
+ review_file_out = review_file_out.drop_duplicates(
+ subset=["page", "label", "text", "id"], keep="first"
+ )
+
+ out_message = (
+ f"Successfully created {whole_page_review_file.shape[0]} whole page redactions."
+ )
+ print(out_message)
+ gr.Info(out_message)
+
+ if pymupdf_doc is not None:
+ try:
+ pymupdf_doc.close()
+ except Exception:
+ pass
+ return review_file_out, all_annotations
+
+
+def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
+ """Parses a combined page/line ID (page * ID_MULTIPLIER + line) back into its parts."""
+ combined_id = int(combined_id)
+ if combined_id < ID_MULTIPLIER:
+ # The value encodes only a line number with no page component; report page 0
+ return 0, combined_id
+
+ page = combined_id // ID_MULTIPLIER
+ line = combined_id % ID_MULTIPLIER
+ return page, line
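+
+# Illustrative example (assuming, hypothetically, ID_MULTIPLIER == 10000):
+# a combined ID of 30007 decodes to (page=3, line=7), while a value of 12,
+# being smaller than ID_MULTIPLIER, decodes to (page=0, line=12).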
+
+
+def create_annotation_objects_from_duplicates(
+ duplicates_df: pd.DataFrame,
+ ocr_results_df: pd.DataFrame,
+ page_sizes: List[Dict],
+ combine_pages: bool = False,
+) -> List[Dict]:
+ """
+ Creates structured annotation objects from duplicate line ranges, mapping
+ page numbers to image paths.
+
+ Args:
+ duplicates_df (pd.DataFrame): DataFrame with duplicate ranges.
+ ocr_results_df (pd.DataFrame): DataFrame with OCR results.
+ page_sizes (List[Dict]): A list of dictionaries mapping page numbers to image paths and other metadata. Expected format: [{"page": 1, "image_path": "path/to/img.png", ...}]
+ combine_pages (bool): A boolean that determines whether in previous functions, all text from a page was combined (True). This function will only run if this is False.
+
+ Returns:
+ List[Dict]: A list of dictionaries, where each dict represents a page and its list of annotation boxes, in the format: [{"image": "path/to/img.png", "boxes": [...]}, ...]
+ """
+
+ final_output = list()
+
+ if duplicates_df.empty:
+ gr.Info("No duplicates found")
+ return final_output
+ if ocr_results_df.empty:
+ raise Warning(
+ "No OCR results found for file under review. Please upload relevant OCR_output file and original PDF document on the review tab."
+ )
+
+ if combine_pages is False:
+ page_to_image_map = {item["page"]: item["image_path"] for item in page_sizes}
+
+ # Prepare OCR Data: line_number_by_page must match the duplicate detection
+ # pipeline.
+ if "line" in ocr_results_df.columns:
+ ocr_results_df = ocr_results_df.copy()
+ ocr_results_df["line_number_by_page"] = (
+ pd.to_numeric(ocr_results_df["line"], errors="coerce")
+ .fillna(0)
+ .astype(int)
+ )
+ else:
+ ocr_results_df = ocr_results_df.sort_values(
+ by=["page", "top", "left"]
+ ).reset_index(drop=True)
+ ocr_results_df["line_number_by_page"] = (
+ ocr_results_df.groupby("page").cumcount() + 1
+ )
+
+ annotations_by_page = defaultdict(list)
+
+ # Detect format: subdocument (Page2_Start_Page / Page2_End_Page) vs single-page (Page2_Page)
+ is_subdocument_format = (
+ "Page2_Start_Page" in duplicates_df.columns
+ and "Page2_End_Page" in duplicates_df.columns
+ )
+
+ for _, row in duplicates_df.iterrows():
+ if is_subdocument_format:
+ start_page, start_line = _parse_page_line_id(row["Page2_Start_Page"])
+ end_page, end_line = _parse_page_line_id(row["Page2_End_Page"])
+
+ if start_page == end_page:
+ condition = (ocr_results_df["page"] == start_page) & (
+ ocr_results_df["line_number_by_page"].between(
+ start_line, end_line
+ )
+ )
+ else:
+ cond_start = (ocr_results_df["page"] == start_page) & (
+ ocr_results_df["line_number_by_page"] >= start_line
+ )
+ cond_middle = ocr_results_df["page"].between(
+ start_page + 1, end_page - 1
+ )
+ cond_end = (ocr_results_df["page"] == end_page) & (
+ ocr_results_df["line_number_by_page"] <= end_line
+ )
+ condition = cond_start | cond_middle | cond_end
+ else:
+ # Single-page format (min_consecutive_pages=1, not greedy): Page2_Page only
+ if "Page2_Page" not in row.index:
+ print(
+ "Warning: duplicates_df has neither Page2_Start_Page/Page2_End_Page nor Page2_Page; skipping row."
+ )
+ continue
+ page_num = int(row["Page2_Page"])
+ condition = ocr_results_df["page"] == page_num
+
+ lines_to_annotate = ocr_results_df[condition]
+
+ # Build and group annotation boxes by page number (this logic is unchanged)
+ for _, line_row in lines_to_annotate.iterrows():
+ box = {
+ "label": "Duplicate text",
+ "color": (0, 0, 0),
+ "xmin": line_row["left"],
+ "ymin": line_row["top"],
+ "xmax": line_row["left"] + line_row["width"],
+ "ymax": line_row["top"] + line_row["height"],
+ "text": line_row["text"],
+ "id": "", # to be filled in after
+ }
+ page_number = line_row["page"]
+
+ annotations_by_page[page_number].append(box)
+
+ # --- Format the final output list using the page-to-image map ---
+
+ # Sort by page number for a predictable order
+ for page_num, boxes in sorted(annotations_by_page.items()):
+ # Look up the image path using the page number
+ image_path = page_to_image_map.get(page_num)
+
+ if image_path:
+ page_boxes = {"image": image_path, "boxes": boxes}
+
+ # Fill in missing IDs for the new data entries
+ page_boxes = fill_missing_box_ids_each_box(page_boxes)
+
+ # Add the annotation group using 'image' as the key
+ final_output.append(page_boxes)
+ else:
+ # Handle cases where a page might not have a corresponding image path
+ print(
+ f"Warning: Page {page_num} found in OCR data but has no corresponding "
+ f"entry in the 'page_sizes' object. This page's annotations will be skipped."
+ )
+
+ return final_output
+
+
+def run_search_with_regex_option(
+ search_text, word_df, similarity_threshold, use_regex_flag
+):
+ """Wrapper function to call run_full_search_and_analysis with regex option"""
+ return run_full_search_and_analysis(
+ search_query_text=search_text,
+ word_level_df_orig=word_df,
+ similarity_threshold=similarity_threshold,
+ combine_pages=False,
+ min_word_count=1,
+ min_consecutive_pages=1,
+ greedy_match=True,
+ remake_index=False,
+ use_regex=use_regex_flag,
+ )
diff --git a/tools/find_duplicate_tabular.py b/tools/find_duplicate_tabular.py
new file mode 100644
index 0000000000000000000000000000000000000000..570e7fb7717f440fca83f1d64583999cf617b1fc
--- /dev/null
+++ b/tools/find_duplicate_tabular.py
@@ -0,0 +1,742 @@
+import os
+import time
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import gradio as gr
+import pandas as pd
+from gradio import Progress
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+from tools.config import (
+ DO_INITIAL_TABULAR_DATA_CLEAN,
+ MAX_SIMULTANEOUS_FILES,
+ MAX_TABLE_ROWS,
+ REMOVE_DUPLICATE_ROWS,
+)
+from tools.data_anonymise import initial_clean
+from tools.helper_functions import OUTPUT_FOLDER, read_file
+from tools.load_spacy_model_custom_recognisers import nlp
+from tools.secure_path_utils import secure_join
+
+
+def clean_and_stem_text_series(
+ df: pd.DataFrame,
+ column: str,
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
+):
+ """
+ Clean and lemmatise a text column in a tabular DataFrame, writing the cleaned, lower-cased result to a new 'text_clean' column.
+ """
+
+ # Function to apply lemmatisation and remove stopwords
+ def _apply_lemmatization(text):
+ doc = nlp(text)
+ # Keep only alphabetic tokens and remove stopwords
+ lemmatized_words = [
+ token.lemma_ for token in doc if token.is_alpha and not token.is_stop
+ ]
+ return " ".join(lemmatized_words)
+
+ # Always create text_clean column first
+ if do_initial_clean_dup:
+ df["text_clean"] = initial_clean(df[column])
+ else:
+ df["text_clean"] = df[column]
+
+ df["text_clean"] = df["text_clean"].apply(_apply_lemmatization)
+ df["text_clean"] = df[
+ "text_clean"
+ ].str.lower() # .str.replace(r'[^\w\s]', '', regex=True)
+
+ return df
+
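+# Illustrative usage (a minimal sketch, not executed on import; exact lemmas depend on the loaded spaCy model):
+#   demo = pd.DataFrame({"text": ["The cats were running", "A cat runs"]})
+#   demo = clean_and_stem_text_series(demo, "text", do_initial_clean_dup=False)
+#   demo["text_clean"]  # stopwords removed and lemmatised, e.g. roughly ["cat run", "cat run"]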
+
+def convert_tabular_data_to_analysis_format(
+ df: pd.DataFrame, file_name: str, text_columns: List[str] = None
+) -> List[Tuple[str, pd.DataFrame]]:
+ """
+ Convert tabular data (CSV/XLSX) to the format needed for duplicate analysis.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame
+ file_name (str): Name of the file
+ text_columns (List[str], optional): Columns to analyze for duplicates.
+ If None, uses all string columns.
+
+ Returns:
+ List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
+ """
+ # Auto-detect text columns (string-type columns) when none are given, as described in the docstring
+ if text_columns is None:
+ text_columns = df.select_dtypes(include=["object", "string"]).columns.tolist()
+
+ text_columns = [col for col in text_columns if col in df.columns]
+
+ if not text_columns:
+ print(f"No text columns found in {file_name}")
+ return list()
+
+ # Create a copy to avoid modifying original
+ df_copy = df.copy()
+
+ # Create a combined text column from all text columns
+ df_copy["combined_text"] = (
+ df_copy[text_columns].fillna("").astype(str).agg(" ".join, axis=1)
+ )
+
+ # Add row identifier
+ df_copy["row_id"] = df_copy.index
+
+ # Create the format expected by the duplicate detection system
+ # Using 'row_number' as row number and 'text' as the combined text
+ processed_df = pd.DataFrame(
+ {
+ "row_number": df_copy["row_id"],
+ "text": df_copy["combined_text"],
+ "file": file_name,
+ }
+ )
+
+ # Add original row data for reference
+ for col in text_columns:
+ processed_df[f"original_{col}"] = df_copy[col]
+
+ return [(file_name, processed_df)]
+
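+# Illustrative usage (a minimal sketch; the DataFrame and column names are hypothetical):
+#   df = pd.DataFrame({"name": ["Ann", "Bob"], "notes": ["Paid", "Refund issued"]})
+#   pairs = convert_tabular_data_to_analysis_format(df, "demo.csv", ["name", "notes"])
+#   # pairs == [("demo.csv", processed_df)] where processed_df has columns row_number (0, 1),
+#   # text ("Ann Paid", "Bob Refund issued"), file ("demo.csv"), plus original_name and original_notes.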
+
+def find_duplicate_cells_in_tabular_data(
+ input_files: List[str],
+ similarity_threshold: float = 0.95,
+ min_word_count: int = 3,
+ text_columns: List[str] = [],
+ output_folder: str = OUTPUT_FOLDER,
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
+ in_excel_tabular_sheets: List[str] = [],
+ progress: Progress = Progress(track_tqdm=True),
+) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
+ """
+ Find duplicate cells/text in tabular data files (CSV, XLSX, Parquet).
+
+ Args:
+ input_files (List[str]): List of file paths to analyze
+ similarity_threshold (float): Minimum similarity score to consider duplicates
+ min_word_count (int): Minimum word count for text to be considered
+ text_columns (List[str], optional): Specific columns to analyze
+ output_folder (str, optional): Output folder for results
+ do_initial_clean_dup (bool, optional): Whether to do an initial clean of the text
+ remove_duplicate_rows (bool, optional): Whether to drop duplicate rows from the deduplicated output files
+ in_excel_tabular_sheets (List[str], optional): Excel sheet names to search within Excel files
+ progress (Progress): Progress tracking object
+
+ Returns:
+ Tuple containing:
+ - results_df: DataFrame with duplicate matches
+ - output_paths: List of output file paths
+ - full_data_by_file: Dictionary of processed data by file
+ """
+
+ if not input_files:
+ raise gr.Error("Please upload files to analyze.")
+
+ progress(0.1, desc="Loading and processing files...")
+
+ all_data_to_process = list()
+ full_data_by_file = dict()
+ file_paths = list()
+
+ # Process each file
+ for file_path in input_files:
+ try:
+ if file_path.endswith(".xlsx") or file_path.endswith(".xls"):
+ temp_df = pd.DataFrame()
+
+ # Try finding each sheet in the given list until a match is found
+ for sheet_name in in_excel_tabular_sheets:
+ temp_df = read_file(file_path, excel_sheet_name=sheet_name)
+
+ # If the sheet was successfully loaded
+ if not temp_df.empty:
+
+ if temp_df.shape[0] > MAX_TABLE_ROWS:
+ out_message = f"Number of rows in {file_path} for sheet {sheet_name} is greater than {MAX_TABLE_ROWS}. Please submit a smaller file."
+ print(out_message)
+ raise Exception(out_message)
+
+ file_name = os.path.basename(file_path) + "_" + sheet_name
+ file_paths.append(file_path)
+
+ # Convert to analysis format
+ processed_data = convert_tabular_data_to_analysis_format(
+ temp_df, file_name, text_columns
+ )
+
+ if processed_data:
+ all_data_to_process.extend(processed_data)
+ full_data_by_file[file_name] = processed_data[0][1]
+
+ temp_df = pd.DataFrame()
+ else:
+ temp_df = read_file(file_path)
+
+ if temp_df.shape[0] > MAX_TABLE_ROWS:
+ out_message = f"Number of rows in {file_path} is greater than {MAX_TABLE_ROWS}. Please submit a smaller file."
+ print(out_message)
+ raise Exception(out_message)
+
+ file_name = os.path.basename(file_path)
+ file_paths.append(file_path)
+
+ # Convert to analysis format
+ processed_data = convert_tabular_data_to_analysis_format(
+ temp_df, file_name, text_columns
+ )
+
+ if processed_data:
+ all_data_to_process.extend(processed_data)
+ full_data_by_file[file_name] = processed_data[0][1]
+
+ except Exception as e:
+ print(f"Error processing {file_path}: {e}")
+ continue
+
+ if not all_data_to_process:
+ raise gr.Error("No valid data found in uploaded files.")
+
+ progress(0.2, desc="Combining data...")
+
+ # Combine all data
+ combined_df = pd.concat(
+ [data[1] for data in all_data_to_process], ignore_index=True
+ )
+
+ combined_df = combined_df.drop_duplicates(subset=["row_number", "file"])
+
+ progress(0.3, desc="Cleaning and preparing text...")
+
+ # Clean and prepare text
+ combined_df = clean_and_stem_text_series(
+ combined_df, "text", do_initial_clean_dup=do_initial_clean_dup
+ )
+
+ # Filter by minimum word count
+ combined_df["word_count"] = (
+ combined_df["text_clean"].str.split().str.len().fillna(0)
+ )
+ combined_df = combined_df[combined_df["word_count"] >= min_word_count].copy()
+
+ if len(combined_df) < 2:
+ return pd.DataFrame(), [], full_data_by_file
+
+ progress(0.4, desc="Calculating similarities...")
+
+ # Calculate similarities
+ vectorizer = TfidfVectorizer()
+ tfidf_matrix = vectorizer.fit_transform(combined_df["text_clean"])
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
+
+ # Find similar pairs
+ coo_matrix = similarity_matrix.tocoo()
+ similar_pairs = [
+ (r, c, v)
+ for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
+ if r < c and v >= similarity_threshold
+ ]
+
+ if not similar_pairs:
+ gr.Info("No duplicate cells found.")
+ return pd.DataFrame(), [], full_data_by_file
+
+ progress(0.7, desc="Processing results...")
+
+ # Create results DataFrame
+ results_data = []
+ for row1, row2, similarity in similar_pairs:
+ row1_data = combined_df.iloc[row1]
+ row2_data = combined_df.iloc[row2]
+
+ results_data.append(
+ {
+ "File1": row1_data["file"],
+ "Row1": int(row1_data["row_number"]),
+ "File2": row2_data["file"],
+ "Row2": int(row2_data["row_number"]),
+ "Similarity_Score": round(similarity, 3),
+ "Text1": (
+ row1_data["text"][:200] + "..."
+ if len(row1_data["text"]) > 200
+ else row1_data["text"]
+ ),
+ "Text2": (
+ row2_data["text"][:200] + "..."
+ if len(row2_data["text"]) > 200
+ else row2_data["text"]
+ ),
+ "Original_Index1": row1,
+ "Original_Index2": row2,
+ }
+ )
+
+ results_df = pd.DataFrame(results_data)
+ results_df = results_df.sort_values(["File1", "Row1", "File2", "Row2"])
+
+ progress(0.9, desc="Saving results...")
+
+ # Save results
+ output_paths = save_tabular_duplicate_results(
+ results_df,
+ output_folder,
+ file_paths,
+ remove_duplicate_rows=remove_duplicate_rows,
+ in_excel_tabular_sheets=in_excel_tabular_sheets,
+ )
+
+ gr.Info(f"Found {len(results_df)} duplicate cell matches")
+
+ return results_df, output_paths, full_data_by_file
+
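+# Sketch of the similarity core used above, on toy strings (illustrative; scores are approximate):
+#   texts = ["john smith 42 main street", "john smith 42 main street", "completely different entry"]
+#   tfidf = TfidfVectorizer().fit_transform(texts)
+#   coo = cosine_similarity(tfidf, dense_output=False).tocoo()
+#   [(r, c, v) for r, c, v in zip(coo.row, coo.col, coo.data) if r < c and v >= 0.95]
+#   # -> [(0, 1, ~1.0)]: rows 0 and 1 would be reported as a duplicate pair.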
+
+def save_tabular_duplicate_results(
+ results_df: pd.DataFrame,
+ output_folder: str,
+ file_paths: List[str],
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
+ in_excel_tabular_sheets: List[str] = [],
+) -> List[str]:
+ """
+ Save tabular duplicate detection results to files.
+
+ Args:
+ results_df (pd.DataFrame): Results DataFrame
+ output_folder (str): Output folder path
+ file_paths (List[str]): List of file paths
+ remove_duplicate_rows (bool): Whether to remove duplicate rows
+ in_excel_tabular_sheets (List[str]): Excel sheet names that were analysed
+ Returns:
+ List[str]: List of output file paths
+ """
+ output_paths = list()
+ output_folder_path = Path(output_folder)
+ output_folder_path.mkdir(exist_ok=True)
+
+ if results_df.empty:
+ print("No duplicate matches to save.")
+ return list()
+
+ # Save main results
+ results_file = output_folder_path / "tabular_duplicate_results.csv"
+ results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
+ output_paths.append(str(results_file))
+
+ # Group results by original file to handle Excel files properly
+ excel_files_processed = dict() # Track which Excel files have been processed
+
+ # Save per-file duplicate lists
+ for file_name, group in results_df.groupby("File2"):
+ # Check for matches with original file names
+ for original_file in file_paths:
+ original_file_name = os.path.basename(original_file)
+
+ if original_file_name in file_name:
+ original_file_extension = os.path.splitext(original_file)[-1]
+ if original_file_extension in [".xlsx", ".xls"]:
+
+ # Split the string using secure regex to handle both .xlsx_ and .xls_ delimiters
+ from tools.secure_regex_utils import safe_split_filename
+
+ parts = safe_split_filename(
+ os.path.basename(file_name), [".xlsx_", ".xls_"]
+ )
+ # The sheet name is the last part after splitting
+ file_sheet_name = parts[-1]
+
+ file_path = original_file
+
+ # Initialize Excel file tracking if not already done
+ if file_path not in excel_files_processed:
+ excel_files_processed[file_path] = {
+ "sheets_data": dict(),
+ "all_sheets": list(),
+ "processed_sheets": set(),
+ }
+
+ # Read the original Excel file to get all sheet names
+ if not excel_files_processed[file_path]["all_sheets"]:
+ try:
+ excel_file = pd.ExcelFile(file_path)
+ excel_files_processed[file_path][
+ "all_sheets"
+ ] = excel_file.sheet_names
+ except Exception as e:
+ print(f"Error reading Excel file {file_path}: {e}")
+ continue
+
+ # Read the current sheet
+ df = read_file(file_path, excel_sheet_name=file_sheet_name)
+
+ # Create duplicate rows file for this sheet
+ file_stem = Path(file_name).stem
+ duplicate_rows_file = (
+ output_folder_path
+ / f"{file_stem}_{file_sheet_name}_duplicate_rows.csv"
+ )
+
+ # Get unique row numbers to remove
+ rows_to_remove = sorted(group["Row2"].unique())
+ duplicate_df = pd.DataFrame({"Row_to_Remove": rows_to_remove})
+ duplicate_df.to_csv(duplicate_rows_file, index=False)
+ output_paths.append(str(duplicate_rows_file))
+
+ # Process the sheet data
+ df_cleaned = df.copy()
+ df_cleaned["duplicated"] = False
+ df_cleaned.loc[rows_to_remove, "duplicated"] = True
+ if remove_duplicate_rows:
+ df_cleaned = df_cleaned.drop(index=rows_to_remove)
+
+ # Store the processed sheet data
+ excel_files_processed[file_path]["sheets_data"][
+ file_sheet_name
+ ] = df_cleaned
+ excel_files_processed[file_path]["processed_sheets"].add(
+ file_sheet_name
+ )
+
+ else:
+ file_sheet_name = ""
+ file_path = original_file
+ print("file_path after match:", file_path)
+ file_base_name = os.path.basename(file_path)
+ df = read_file(file_path)
+
+ file_stem = Path(file_name).stem
+ duplicate_rows_file = (
+ output_folder_path / f"{file_stem}_duplicate_rows.csv"
+ )
+
+ # Get unique row numbers to remove
+ rows_to_remove = sorted(group["Row2"].unique())
+ duplicate_df = pd.DataFrame({"Row_to_Remove": rows_to_remove})
+ duplicate_df.to_csv(duplicate_rows_file, index=False)
+ output_paths.append(str(duplicate_rows_file))
+
+ df_cleaned = df.copy()
+ df_cleaned["duplicated"] = False
+ df_cleaned.loc[rows_to_remove, "duplicated"] = True
+ if remove_duplicate_rows:
+ df_cleaned = df_cleaned.drop(index=rows_to_remove)
+
+ file_ext = os.path.splitext(file_name)[-1]
+
+ if file_ext in [".parquet"]:
+ output_path = secure_join(
+ output_folder, f"{file_base_name}_deduplicated.parquet"
+ )
+ df_cleaned.to_parquet(output_path, index=False)
+ else:
+ output_path = secure_join(
+ output_folder, f"{file_base_name}_deduplicated.csv"
+ )
+ df_cleaned.to_csv(
+ output_path, index=False, encoding="utf-8-sig"
+ )
+
+ output_paths.append(str(output_path))
+ break
+
+ # Process Excel files to create complete deduplicated files
+ for file_path, file_data in excel_files_processed.items():
+ try:
+ # Create output filename
+ file_base_name = os.path.splitext(os.path.basename(file_path))[0]
+ file_ext = os.path.splitext(file_path)[-1]
+ output_path = secure_join(
+ output_folder, f"{file_base_name}_deduplicated{file_ext}"
+ )
+
+ # Create Excel writer
+ with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
+ # Write all sheets
+ for sheet_name in file_data["all_sheets"]:
+ if sheet_name in file_data["processed_sheets"]:
+ # Use the processed (deduplicated) version
+ file_data["sheets_data"][sheet_name].to_excel(
+ writer, sheet_name=sheet_name, index=False
+ )
+ else:
+ # Use the original sheet (no duplicates found)
+ original_df = read_file(file_path, excel_sheet_name=sheet_name)
+ original_df.to_excel(writer, sheet_name=sheet_name, index=False)
+
+ output_paths.append(str(output_path))
+ print(f"Created deduplicated Excel file: {output_path}")
+
+ except Exception as e:
+ print(f"Error creating deduplicated Excel file for {file_path}: {e}")
+ continue
+
+ return output_paths
+
+
+def remove_duplicate_rows_from_tabular_data(
+ file_path: str,
+ duplicate_rows: List[int],
+ output_folder: str = OUTPUT_FOLDER,
+ in_excel_tabular_sheets: str = "",
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
+) -> str:
+ """
+ Remove duplicate rows from a tabular data file.
+
+ Args:
+ file_path (str): Path to the input file
+ duplicate_rows (List[int]): List of row indices to remove
+ output_folder (str): Output folder for cleaned file
+ in_excel_tabular_sheets (str): Name of the Excel sheet to save the results to
+ remove_duplicate_rows (bool): Whether to remove duplicate rows
+ Returns:
+ str: Path to the cleaned file
+ """
+ try:
+ # Load the file
+ df = read_file(
+ file_path,
+ excel_sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else "",
+ )
+
+ # Remove duplicate rows (0-indexed)
+ df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
+
+ # Save cleaned file
+ file_name = os.path.basename(file_path)
+ file_stem = os.path.splitext(file_name)[0]
+ file_ext = os.path.splitext(file_name)[-1]
+
+ output_path = secure_join(output_folder, f"{file_stem}_deduplicated{file_ext}")
+
+ if file_ext in [".xlsx", ".xls"]:
+ df_cleaned.to_excel(
+ output_path,
+ index=False,
+ sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else "Sheet1",
+ )
+ elif file_ext in [".parquet"]:
+ df_cleaned.to_parquet(output_path, index=False)
+ else:
+ df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
+
+ return output_path
+
+ except Exception as e:
+ print(f"Error removing duplicates from {file_path}: {e}")
+ raise
+
+
+def run_tabular_duplicate_analysis(
+ files: List[str],
+ threshold: float,
+ min_words: int,
+ text_columns: List[str] = [],
+ output_folder: str = OUTPUT_FOLDER,
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
+ in_excel_tabular_sheets: List[str] = [],
+ progress: Progress = Progress(track_tqdm=True),
+) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
+ """
+ Main function to run tabular duplicate analysis.
+
+ Args:
+ files (List[str]): List of file paths
+ threshold (float): Similarity threshold
+ min_words (int): Minimum word count
+ text_columns (List[str], optional): Specific columns to analyze
+ output_folder (str, optional): Output folder for results
+ progress (Progress): Progress tracking
+
+ Returns:
+ Tuple containing results DataFrame, output paths, and full data by file
+ """
+ return find_duplicate_cells_in_tabular_data(
+ input_files=files,
+ similarity_threshold=threshold,
+ min_word_count=min_words,
+ text_columns=text_columns if text_columns else [],
+ output_folder=output_folder,
+ do_initial_clean_dup=do_initial_clean_dup,
+ in_excel_tabular_sheets=(
+ in_excel_tabular_sheets if in_excel_tabular_sheets else []
+ ),
+ remove_duplicate_rows=remove_duplicate_rows,
+ )
+
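+# Illustrative call (a sketch; the file paths and column name are hypothetical and must exist locally):
+#   results_df, output_paths, full_data = run_tabular_duplicate_analysis(
+#       files=["output/complaints.csv", "output/complaints_2024.csv"],
+#       threshold=0.95,
+#       min_words=3,
+#       text_columns=["complaint_text"],
+#   )
+#   # results_df pairs similar rows within/across the files; output_paths lists the saved result CSVs.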
+
+# Function to update column choices when files are uploaded
+def update_tabular_column_choices(files, in_excel_tabular_sheets: List[str] = []):
+ if not files:
+ return gr.update(choices=[])
+
+ all_columns = set()
+ for file in files:
+ try:
+ file_extension = os.path.splitext(file.name)[-1]
+ if file_extension in [".xlsx", ".xls"]:
+ for sheet_name in in_excel_tabular_sheets:
+ df = read_file(file.name, excel_sheet_name=sheet_name)
+ text_cols = df.select_dtypes(
+ include=["object", "string"]
+ ).columns.tolist()
+ all_columns.update(text_cols)
+ else:
+ df = read_file(file.name)
+ text_cols = df.select_dtypes(
+ include=["object", "string"]
+ ).columns.tolist()
+ all_columns.update(text_cols)
+ except Exception as e:
+ print(f"Error reading {file.name}: {e}")
+ continue
+
+ return gr.Dropdown(choices=sorted(list(all_columns)))
+
+
+# Function to handle tabular duplicate detection
+def run_tabular_duplicate_detection(
+ files,
+ threshold,
+ min_words,
+ text_columns,
+ output_folder: str = OUTPUT_FOLDER,
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
+ in_excel_tabular_sheets: List[str] = [],
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
+):
+ if not files:
+ print("No files uploaded")
+ return pd.DataFrame(), [], gr.Dropdown(choices=[]), 0, "deduplicate"
+
+ start_time = time.time()
+
+ task_textbox = "deduplicate"
+
+ # If output folder doesn't end with a forward slash, add one
+ if not output_folder.endswith("/"):
+ output_folder = output_folder + "/"
+
+ file_paths = list()
+ if isinstance(files, str):
+ # If 'files' is a single string, treat it as a list with one element
+ file_paths.append(files)
+ elif isinstance(files, list):
+ # If 'files' is a list, iterate through its elements
+ for f_item in files:
+ if isinstance(f_item, str):
+ # If an element is a string, it's a direct file path
+ file_paths.append(f_item)
+ elif hasattr(f_item, "name"):
+ # If an element has a '.name' attribute (e.g., a Gradio File object), use its name
+ file_paths.append(f_item.name)
+ else:
+ # Log a warning for unexpected element types within the list
+ print(
+ f"Warning: Skipping an element in 'files' list that is neither a string nor has a '.name' attribute: {type(f_item)}"
+ )
+ elif hasattr(files, "name"):
+ # Handle the case where a single file object (e.g., gr.File) is passed directly, not in a list
+ file_paths.append(files.name)
+ else:
+ # Raise an error for any other unexpected type of the 'files' argument itself
+ raise TypeError(
+ f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object."
+ )
+
+ if len(file_paths) > MAX_SIMULTANEOUS_FILES:
+ out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
+ print(out_message)
+ raise Exception(out_message)
+
+ results_df, output_paths, full_data = run_tabular_duplicate_analysis(
+ files=file_paths,
+ threshold=threshold,
+ min_words=min_words,
+ text_columns=text_columns if text_columns else [],
+ output_folder=output_folder,
+ do_initial_clean_dup=do_initial_clean_dup,
+ in_excel_tabular_sheets=(
+ in_excel_tabular_sheets if in_excel_tabular_sheets else []
+ ),
+ remove_duplicate_rows=remove_duplicate_rows,
+ )
+
+ # Update file choices for cleaning
+ file_choices = list(set(file_paths))
+
+ end_time = time.time()
+ processing_time = round(end_time - start_time, 2)
+
+ return (
+ results_df,
+ output_paths,
+ gr.Dropdown(choices=file_choices),
+ processing_time,
+ task_textbox,
+ )
+
+
+# Function to handle row selection for preview
+def handle_tabular_row_selection(results_df, evt: gr.SelectData):
+
+ if not evt:
+ return None, "", ""
+
+ if not isinstance(results_df, pd.DataFrame):
+ return None, "", ""
+ elif results_df.empty:
+ return None, "", ""
+
+ selected_index = evt.index[0]
+ if selected_index >= len(results_df):
+ return None, "", ""
+
+ row = results_df.iloc[selected_index]
+ return selected_index, row["Text1"], row["Text2"]
+
+
+# Function to clean duplicates from selected file
+def clean_tabular_duplicates(
+ file_name,
+ results_df,
+ output_folder,
+ in_excel_tabular_sheets: str = "",
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
+):
+ if not file_name or results_df.empty:
+ return None
+
+ # Get duplicate rows for this file
+ file_duplicates = results_df[results_df["File2"] == file_name]["Row2"].tolist()
+
+ if not file_duplicates:
+ return None
+
+ try:
+ # Find the original file path
+ # This is a simplified approach - in practice you might want to store file paths
+ cleaned_file = remove_duplicate_rows_from_tabular_data(
+ file_path=file_name,
+ duplicate_rows=file_duplicates,
+ output_folder=output_folder,
+ in_excel_tabular_sheets=in_excel_tabular_sheets,
+ remove_duplicate_rows=remove_duplicate_rows,
+ )
+ return cleaned_file
+ except Exception as e:
+ print(f"Error cleaning duplicates: {e}")
+ return None
diff --git a/tools/helper_functions.py b/tools/helper_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd314fdc69b4beab07b81398d36bc776c1cab354
--- /dev/null
+++ b/tools/helper_functions.py
@@ -0,0 +1,1932 @@
+import logging
+import os
+import platform
+import random
+import re
+import string
+import sys
+import tempfile
+import unicodedata
+from contextlib import asynccontextmanager
+from datetime import datetime
+from math import ceil
+from pathlib import Path
+from typing import List, Set
+
+import boto3
+import gradio as gr
+import numpy as np
+import pandas as pd
+from botocore.exceptions import (
+ BotoCoreError,
+ ClientError,
+ NoCredentialsError,
+ PartialCredentialsError,
+)
+from fastapi import FastAPI
+
+from tools.aws_functions import download_file_from_s3, upload_file_to_s3
+from tools.config import (
+ AWS_LLM_PII_OPTION,
+ AWS_PII_OPTION,
+ AWS_USER_POOL_ID,
+ BEDROCK_LLM_INPUT_COST,
+ BEDROCK_LLM_INPUT_TOKENS_PER_PAGE,
+ BEDROCK_LLM_OUTPUT_COST,
+ BEDROCK_LLM_OUTPUT_TOKENS_PER_PAGE,
+ BEDROCK_VLM_INPUT_COST,
+ BEDROCK_VLM_OUTPUT_COST,
+ BEDROCK_VLM_PIXELS_PER_INPUT_TOKEN,
+ BEDROCK_VLM_TEXT_EXTRACT_OPTION,
+ COST_CODES_PATH,
+ CUSTOM_HEADER,
+ CUSTOM_HEADER_VALUE,
+ DEFAULT_COST_CODE,
+ DEFAULT_LANGUAGE,
+ DEFAULT_LOCAL_OCR_MODEL,
+ DOCUMENT_REDACTION_BUCKET,
+ INFERENCE_SERVER_PII_OPTION,
+ INPUT_FOLDER,
+ LANGUAGE_CHOICES,
+ LANGUAGE_MAP,
+ LOCAL_OCR_MODEL_OPTIONS,
+ LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+ LOCAL_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ NO_REDACTION_PII_OPTION,
+ OUTPUT_COST_CODES_PATH,
+ OUTPUT_FOLDER,
+ RUN_AWS_FUNCTIONS,
+ S3_COST_CODES_PATH,
+ S3_OUTPUTS_FOLDER,
+ SAVE_OUTPUTS_TO_S3,
+ SELECTABLE_TEXT_EXTRACT_OPTION,
+ SESSION_OUTPUT_FOLDER,
+ SHOW_FEEDBACK_BUTTONS,
+ TEXTRACT_JOBS_LOCAL_LOC,
+ TEXTRACT_JOBS_S3_LOC,
+ TEXTRACT_TEXT_EXTRACT_OPTION,
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
+ VLM_MAX_IMAGE_SIZE,
+ aws_comprehend_language_choices,
+ convert_string_to_boolean,
+ ensure_folder_within_app_directory,
+ textract_language_choices,
+)
+from tools.secure_path_utils import sanitize_filename, secure_path_join
+
+
+def reset_state_vars():
+ return (
+ [],
+ pd.DataFrame(),
+ pd.DataFrame(),
+ 0,
+ "",
+ None,
+ [],
+ [],
+ pd.DataFrame(),
+ pd.DataFrame(),
+ [],
+ [],
+ "",
+ False,
+ 0,
+ [],
+ [],
+ 0, # latest_file_completed_num: reset to 0 at start of document redaction
+ 0, # LLM total input tokens
+ 0, # LLM total output tokens
+ 0, # VLM total input tokens
+ 0, # VLM total output tokens
+ )
+
+
+def reset_ocr_results_state():
+ return pd.DataFrame(), pd.DataFrame(), []
+
+
+def reset_review_vars():
+ return pd.DataFrame(), pd.DataFrame()
+
+
+def reset_data_vars():
+ return 0, [], 0
+
+
+def reset_aws_call_vars():
+ return 0, 0, 0, 0, 0, 0, "", ""
+
+
+### functions related to summarisation ###
+
+
+def clean_column_name(
+ column_name: str, max_length: int = 20, front_characters: bool = True
+):
+ # Convert to string
+ column_name = str(column_name)
+ # Replace non-alphanumeric characters (except underscores) with underscores
+ column_name = re.sub(r"\W+", "_", column_name)
+ # Remove leading/trailing underscores
+ column_name = column_name.strip("_")
+ # Ensure the result is not empty; fall back to "column" if necessary
+ column_name = column_name if column_name else "column"
+ # Truncate to max_length
+ if front_characters is True:
+ output_text = column_name[:max_length]
+ else:
+ output_text = column_name[-max_length:]
+ return output_text
+
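+# Examples (illustrative):
+#   clean_column_name("Customer's full name (2024)!")  # -> "Customer_s_full_name" (cleaned, then first 20 chars)
+#   clean_column_name("very_long_column_name_here", front_characters=False)  # keeps the last 20 characters instead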
+
+def create_batch_file_path_details(
+ reference_data_file_name: str,
+ latest_batch_completed: int = None,
+ batch_size_number: int = None,
+ in_column: str = None,
+) -> str:
+ """
+ Creates a standardised batch file path detail string from a reference data filename.
+
+ Args:
+ reference_data_file_name (str): Name of the reference data file
+ latest_batch_completed (int, optional): Latest batch completed. Defaults to None.
+ batch_size_number (int, optional): Batch size number. Defaults to None.
+ in_column (str, optional): In column. Defaults to None.
+ Returns:
+ str: Formatted batch file path detail string
+ """
+
+ # Extract the base file name (the text before _all_/_final_/_batch_/_col_) using regex
+ match = re.search(r"(.*?)(?:_all_|_final_|_batch_|_col_)", reference_data_file_name)
+ file_name = match.group(1) if match else reference_data_file_name
+
+ # Clean the extracted names
+ file_name_cleaned = clean_column_name(file_name, max_length=20)
+
+ return f"{file_name_cleaned}_"
+
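+# Example (illustrative): "case_notes_batch_2_of_10.csv" contains "_batch_", so the text before it
+# is extracted and cleaned, giving the prefix used for output file naming:
+#   create_batch_file_path_details("case_notes_batch_2_of_10.csv")  # -> "case_notes_"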
+
+def ensure_model_in_map(model_choice: str, model_name_map_dict: dict = None) -> dict:
+ """
+ Ensures that a model_choice is registered in model_name_map.
+ If the model_choice is not found, it assumes it's an inference-server model
+ and adds it to the map with source "inference-server".
+
+ Args:
+ model_choice (str): The model name to check/register
+ model_name_map_dict (dict, optional): The model_name_map dictionary to update.
+ If None, uses the global model_name_map from config.
+
+ Returns:
+ dict: The model_name_map dictionary (updated if needed)
+ """
+ # Use provided dict or global one
+ if model_name_map_dict is None:
+ from tools.config import model_name_map
+
+ model_name_map_dict = model_name_map
+
+ # If model_choice is not in the map, assume it's an inference-server model
+ if model_choice not in model_name_map_dict:
+ model_name_map_dict[model_choice] = {
+ "short_name": model_choice,
+ "source": "inference-server",
+ }
+ print(f"Registered custom model '{model_choice}' as inference-server model")
+
+ return model_name_map_dict
+
+
+def get_file_name_no_ext(file_path: str):
+ # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
+ basename = os.path.basename(file_path)
+
+ # Then, split the basename and its extension and return only the basename without the extension
+ filename_without_extension, _ = os.path.splitext(basename)
+
+ # print(filename_without_extension)
+
+ return filename_without_extension
+
+
+def _file_name_from_pdf_path(full_file_name):
+ """Derive a safe file_name prefix from a PDF path (for summary output naming)."""
+ if not full_file_name or not str(full_file_name).strip():
+ return "document"
+ basename = os.path.basename(full_file_name)
+ name_without_ext, _ = os.path.splitext(basename)
+ filename_prefix = (name_without_ext or "document")[:20]
+ invalid_chars = '<>:"/\\|?*'
+ for char in invalid_chars:
+ filename_prefix = filename_prefix.replace(char, "_")
+ return filename_prefix if filename_prefix else "document"
+
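+# Examples (illustrative):
+#   _file_name_from_pdf_path("/tmp/Annual Report: 2024.pdf")  # -> "Annual Report_ 2024"
+#   _file_name_from_pdf_path("")                              # -> "document"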
+
+###
+
+
+def load_in_default_allow_list(allow_list_file_path):
+ if isinstance(allow_list_file_path, str):
+ allow_list_file_path = [allow_list_file_path]
+ return allow_list_file_path
+
+
+def load_in_default_cost_codes(cost_codes_path: str, default_cost_code: str = ""):
+ """
+ Load in the cost codes list from file.
+ """
+ cost_codes_df = pd.read_csv(cost_codes_path)
+ dropdown_choices = cost_codes_df.iloc[:, 0].astype(str).tolist()
+
+ # Avoid inserting duplicate or empty cost code values
+ if default_cost_code and default_cost_code not in dropdown_choices:
+ dropdown_choices.insert(0, default_cost_code)
+
+ # Always have a blank option at the top
+ if "" not in dropdown_choices:
+ dropdown_choices.insert(0, "")
+
+ # Use passed default if in choices, else fall back to DEFAULT_COST_CODE so dropdown shows app default on load
+ if default_cost_code and default_cost_code in dropdown_choices:
+ value = default_cost_code
+ elif DEFAULT_COST_CODE and DEFAULT_COST_CODE in dropdown_choices:
+ value = DEFAULT_COST_CODE
+ else:
+ value = ""
+
+ out_dropdown = gr.Dropdown(
+ value=value,
+ label="Choose cost code for analysis",
+ choices=dropdown_choices,
+ allow_custom_value=False,
+ )
+
+ return cost_codes_df, cost_codes_df, out_dropdown
+
+
+def enforce_cost_codes(
+ enforce_cost_code_bool: bool,
+ cost_code_choice: str,
+ cost_code_df: pd.DataFrame,
+ verify_cost_codes: bool = True,
+):
+ """
+ If cost code enforcement is enabled, check that a cost code has been chosen and raise an error if not. Optionally verify the chosen cost code against the values in the cost code dataframe to ensure that it exists.
+ """
+
+ if enforce_cost_code_bool:
+ if not cost_code_choice:
+ raise Exception("Please choose a cost code before continuing")
+
+ if verify_cost_codes:
+ if cost_code_df.empty:
+ raise Exception("No cost codes present in dataframe for verification")
+ else:
+ valid_cost_codes_list = list(cost_code_df.iloc[:, 0].unique())
+
+ if cost_code_choice not in valid_cost_codes_list:
+ raise Exception(
+ "Selected cost code not found in list. Please contact your system administrator if you cannot find the correct cost code from the given list of suggestions."
+ )
+ return
+
+
+def update_cost_code_dataframe_from_dropdown_select(
+ cost_dropdown_selection: str, cost_code_df: pd.DataFrame
+):
+ cost_code_df = cost_code_df.loc[
+ cost_code_df.iloc[:, 0] == cost_dropdown_selection, :
+ ]
+ return cost_code_df
+
+
+SESSION_DEFAULT_COST_CODES_FILENAME = "session_default_cost_codes.csv"
+
+# Reasonable email pattern for validating session_hash (saves/upload require email format)
+_EMAIL_PATTERN = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")
+
+
+def _is_valid_session_hash_email(session_hash: str) -> bool:
+ """Return True if session_hash is in valid email format (required for saving defaults)."""
+ if not session_hash or not isinstance(session_hash, str):
+ return False
+ return bool(_EMAIL_PATTERN.match(session_hash.strip()))
+
+
+def _dedupe_session_default_cost_codes(df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Remove duplicate session_hashes, keeping only the latest row by saved_at.
+ If saved_at is missing or empty, those rows are treated as oldest.
+ """
+ if df.empty or "session_hash" not in df.columns:
+ return df
+ if "saved_at" not in df.columns:
+ return df.drop_duplicates(subset=["session_hash"], keep="last")
+ # Sort so latest saved_at is last; empty string sorts last in ascending order
+ df = df.sort_values("saved_at", ascending=True, na_position="last")
+ return df.drop_duplicates(subset=["session_hash"], keep="last")
+
+
+def _get_session_default_cost_codes_s3_key_prefix():
+ """
+ Return the S3 key prefix (folder) for the session default cost codes file,
+ same bucket and folder as S3_COST_CODES_PATH. Empty string if no S3 path.
+ """
+ if not S3_COST_CODES_PATH or not str(S3_COST_CODES_PATH).strip():
+ return ""
+ # Use forward slash for S3; path can be "config/COST_CENTRES.csv" or "file.csv"
+ parts = S3_COST_CODES_PATH.replace("\\", "/").rstrip("/").split("/")
+ if len(parts) <= 1:
+ return ""
+ return "/".join(parts[:-1]) + "/"
+
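+# Example (illustrative): with S3_COST_CODES_PATH = "config/COST_CENTRES.csv" the prefix is "config/",
+# so the session defaults file would live at "config/session_default_cost_codes.csv" in the bucket;
+# with a bare "COST_CENTRES.csv" the prefix is "" and the file sits at the bucket root.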
+
+def get_session_default_cost_codes_csv_path(folder: str | None = None):
+ """
+ Return the path to the CSV file that stores session_hash -> default_cost_code.
+ If folder is provided (e.g. input folder path), resolve it under the configured
+ INPUT_FOLDER via secure_path_join (CodeQL / path-injection). If that fails,
+ fall back to the same folder as cost codes (COST_CODES_PATH or OUTPUT_COST_CODES_PATH).
+ """
+ if folder is not None and str(folder).strip():
+ base = str(folder).strip()
+ try:
+ safe_base = secure_path_join(INPUT_FOLDER, base)
+ return os.path.join(str(safe_base), SESSION_DEFAULT_COST_CODES_FILENAME)
+ except (ValueError, PermissionError, OSError):
+ # Cannot constrain user folder under INPUT_FOLDER; use defaults below.
+ pass
+ if COST_CODES_PATH:
+ folder = os.path.dirname(COST_CODES_PATH)
+ else:
+ folder = os.path.dirname(OUTPUT_COST_CODES_PATH)
+ if not folder:
+ folder = "."
+ return os.path.join(folder, SESSION_DEFAULT_COST_CODES_FILENAME)
+
+
+def save_default_cost_code_for_session(
+ session_hash: str,
+ cost_code_choice: str,
+ cost_code_df: pd.DataFrame,
+ output_folder: str | None = None,
+) -> str:
+ """
+ Validate cost_code_choice against the first column of cost_code_df; if valid,
+ save or update session_hash -> default_cost_code in the session defaults CSV.
+ If output_folder is provided (e.g. input folder path), save there; else use
+ cost codes folder. Returns a message string for the user.
+ """
+ if not session_hash or not str(session_hash).strip():
+ return "No session identifier available."
+ if not _is_valid_session_hash_email(session_hash):
+ return (
+ "Default cost code can only be saved when the session identifier is an "
+ "email address (e.g. when signed in with Cognito)."
+ )
+ if not cost_code_choice or not str(cost_code_choice).strip():
+ return "Please choose a cost code first."
+ if cost_code_df is None or cost_code_df.empty:
+ return "No cost codes loaded for validation."
+ valid_codes = list(cost_code_df.iloc[:, 0].astype(str).unique())
+ if cost_code_choice not in valid_codes:
+ return "Selected cost code not in the list. Choose a cost code from the table."
+ csv_path = get_session_default_cost_codes_csv_path(output_folder)
+ ensure_folder_exists(os.path.dirname(csv_path))
+ saved_at = datetime.now().isoformat()
+ row = {
+ "session_hash": session_hash,
+ "default_cost_code": cost_code_choice,
+ "saved_at": saved_at,
+ }
+ if os.path.exists(csv_path):
+ existing = pd.read_csv(csv_path)
+ if (
+ "session_hash" in existing.columns
+ and "default_cost_code" in existing.columns
+ ):
+ if "saved_at" not in existing.columns:
+ existing["saved_at"] = ""
+ # Replace any existing row for this session (one row per session_hash)
+ existing = existing[
+ existing["session_hash"].astype(str) != str(session_hash)
+ ]
+ updated = pd.concat([existing, pd.DataFrame([row])], ignore_index=True)
+ else:
+ updated = pd.DataFrame([row])
+ else:
+ updated = pd.DataFrame([row])
+ # Remove duplicate session_hashes, keeping only the latest by saved_at
+ updated = _dedupe_session_default_cost_codes(updated)
+ updated.to_csv(csv_path, index=False)
+
+ # If S3 cost codes path is set, save to the same bucket and folder in S3
+ if (
+ S3_COST_CODES_PATH
+ and str(S3_COST_CODES_PATH).strip()
+ and RUN_AWS_FUNCTIONS
+ and DOCUMENT_REDACTION_BUCKET
+ ):
+ # Ensure the file exists before upload (e.g. in case of path or write edge cases)
+ if not os.path.exists(csv_path):
+ ensure_folder_exists(os.path.dirname(csv_path))
+ updated.to_csv(csv_path, index=False)
+ s3_prefix = _get_session_default_cost_codes_s3_key_prefix()
+ s3_full_key = s3_prefix + SESSION_DEFAULT_COST_CODES_FILENAME
+ upload_result = upload_file_to_s3(
+ local_file_paths=[csv_path],
+ s3_key=s3_prefix,
+ s3_bucket=DOCUMENT_REDACTION_BUCKET,
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+ )
+ if upload_result and "successfully" in upload_result.lower():
+ print(
+ f"Session default cost codes CSV uploaded to "
+ f"s3://{DOCUMENT_REDACTION_BUCKET}/{s3_full_key}"
+ )
+ else:
+ print(
+ f"Session default cost codes S3 upload issue "
+ f"(target s3://{DOCUMENT_REDACTION_BUCKET}/{s3_full_key}): {upload_result!r}"
+ )
+
+ return "Default cost code saved"
+
+
+def _read_session_default_from_csv_path(csv_path: str, session_hash: str) -> str:
+ """Read CSV at csv_path and return default_cost_code for session_hash, or "".
+ If saved_at exists, uses the latest row by saved_at for that session_hash.
+ """
+ if not os.path.exists(csv_path):
+ return ""
+ try:
+ df = pd.read_csv(csv_path)
+ if "session_hash" not in df.columns or "default_cost_code" not in df.columns:
+ return ""
+ match = df[df["session_hash"].astype(str) == str(session_hash)]
+ if match.empty:
+ return ""
+ if "saved_at" in match.columns:
+ match = match.sort_values("saved_at", ascending=True, na_position="last")
+ return str(match.iloc[-1]["default_cost_code"]).strip()
+ except Exception:
+ return ""
+
+
+def load_session_default_cost_code(
+ session_hash: str, input_folder: str | None = None
+) -> str:
+ """
+ Load the default cost code for the given session_hash from the session defaults CSV.
+ If S3_COST_CODES_PATH is set, tries the same bucket and folder in S3 first, then local.
+ If input_folder is provided, tries that path first for local file, then fallback.
+ Returns the default cost code string if found, else "".
+ """
+ if not session_hash or not str(session_hash).strip():
+ return ""
+
+ # Only download from S3 when session_hash is an email (same rule as for saves)
+ if _is_valid_session_hash_email(session_hash) and (
+ S3_COST_CODES_PATH
+ and str(S3_COST_CODES_PATH).strip()
+ and RUN_AWS_FUNCTIONS
+ and DOCUMENT_REDACTION_BUCKET
+ ):
+ s3_prefix = _get_session_default_cost_codes_s3_key_prefix()
+ tmp_path = None
+ try:
+ with tempfile.NamedTemporaryFile(
+ mode="w", suffix=".csv", delete=False
+ ) as tmp:
+ tmp_path = tmp.name
+ download_file_from_s3(
+ bucket_name=DOCUMENT_REDACTION_BUCKET,
+ key=s3_prefix + SESSION_DEFAULT_COST_CODES_FILENAME,
+ local_file_path_and_name=tmp_path,
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+ )
+ if tmp_path and os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
+ result = _read_session_default_from_csv_path(tmp_path, session_hash)
+ if result:
+ try:
+ os.unlink(tmp_path)
+ except OSError:
+ pass
+ return result
+ except Exception:
+ pass
+ if tmp_path and os.path.exists(tmp_path):
+ try:
+ os.unlink(tmp_path)
+ except OSError:
+ pass
+
+ # Local: try input_folder first if provided, then default cost codes folder
+ if input_folder is not None and str(input_folder).strip():
+ csv_path = get_session_default_cost_codes_csv_path(input_folder)
+ result = _read_session_default_from_csv_path(csv_path, session_hash)
+ if result:
+ return result
+ csv_path = get_session_default_cost_codes_csv_path()
+ return _read_session_default_from_csv_path(csv_path, session_hash)
+
+
+def apply_session_default_cost_code(
+ session_hash: str,
+ cost_code_dataframe: pd.DataFrame,
+ input_folder: str | None = None,
+ current_default_cost_code: str | None = None,
+ current_dropdown_value: str | None = None,
+) -> tuple[str, str]:
+ """
+ Look up the saved default cost code for session_hash. If found and valid in
+ cost_code_dataframe, return (default_cost_code, default_cost_code) for
+ default_cost_code_textbox and cost_code_choice_drop. When no saved default
+ applies, return (current_default_cost_code, current_dropdown_value) so the
+ dropdown keeps showing e.g. DEFAULT_COST_CODE instead of being cleared.
+ """
+ default_code = load_session_default_cost_code(session_hash, input_folder)
+ if not default_code:
+ # Preserve current values so we don't overwrite with "" on app load
+ cur = current_default_cost_code if current_default_cost_code is not None else ""
+ drop = current_dropdown_value if current_dropdown_value is not None else ""
+ return cur, drop
+ if cost_code_dataframe is None or cost_code_dataframe.empty:
+ return default_code, default_code
+ choices = list(cost_code_dataframe.iloc[:, 0].astype(str).unique())
+ if default_code not in choices:
+ cur = current_default_cost_code if current_default_cost_code is not None else ""
+ drop = current_dropdown_value if current_dropdown_value is not None else ""
+ return cur, drop
+ return default_code, default_code
+
+
+def ensure_folder_exists(output_folder: str):
+ """Checks if the specified folder exists, creates it if not.
+
+ Resolves ``output_folder`` under the app directory via
+ ``ensure_folder_within_app_directory`` so user-influenced paths cannot escape
+ the workspace (CodeQL py/path-injection).
+ """
+ if output_folder is None or not str(output_folder).strip():
+ return
+ try:
+ safe_folder = ensure_folder_within_app_directory(str(output_folder).strip())
+ except (ValueError, PermissionError, OSError) as e:
+ logging.getLogger(__name__).warning(
+ "ensure_folder_exists: refused or could not normalize path %r: %s",
+ output_folder,
+ e,
+ )
+ return
+ if not safe_folder or not str(safe_folder).strip():
+ return
+
+ if not os.path.exists(safe_folder):
+ os.makedirs(safe_folder, exist_ok=True)
+
+
+def update_dataframe(df_or_list):
+ """
+ Update function for both DataFrame and list inputs.
+ For Dropdown components (list), return the list as-is.
+ For DataFrame components, return a copy.
+ """
+ if isinstance(df_or_list, list):
+ return df_or_list
+ elif isinstance(df_or_list, pd.DataFrame):
+ return df_or_list.copy()
+ else:
+ return df_or_list
+
+
+def get_file_name_without_type(file_path):
+ # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
+ basename = os.path.basename(file_path)
+
+ # Then, split the basename and its extension and return only the basename without the extension
+ filename_without_extension, _ = os.path.splitext(basename)
+
+ # print(filename_without_extension)
+
+ return filename_without_extension
+
+
+def detect_file_type(filename: str):
+ """Detect the file type based on its extension."""
+ if not isinstance(filename, str):
+ filename = str(filename)
+
+ if (
+ (filename.endswith(".csv"))
+ | (filename.endswith(".csv.gz"))
+ | (filename.endswith(".zip"))
+ ):
+ return "csv"
+ elif filename.endswith(".xlsx"):
+ return "xlsx"
+ elif filename.endswith(".xls"):
+ return "xls"
+ elif filename.endswith(".parquet"):
+ return "parquet"
+ elif filename.endswith(".pdf"):
+ return "pdf"
+ elif filename.endswith(".jpg"):
+ return "jpg"
+ elif filename.endswith(".jpeg"):
+ return "jpeg"
+ elif filename.endswith(".png"):
+ return "png"
+ elif filename.endswith(".xfdf"):
+ return "xfdf"
+ elif filename.endswith(".docx"):
+ return "docx"
+ else:
+ raise ValueError("Unsupported file type.")
+
+
+def read_file(filename: str, excel_sheet_name: str = ""):
+ """Read the file based on its detected type."""
+ file_type = detect_file_type(filename)
+
+ if file_type == "csv":
+ return pd.read_csv(filename, low_memory=False)
+ elif file_type == "xlsx":
+ if excel_sheet_name:
+ try:
+ return pd.read_excel(filename, sheet_name=excel_sheet_name)
+ except Exception as e:
+ print(
+ f"Error reading {filename} with sheet name {excel_sheet_name}: {e}"
+ )
+ return pd.DataFrame()
+ else:
+ return pd.read_excel(filename)
+ elif file_type == "parquet":
+ return pd.read_parquet(filename)
+
+
+def ensure_output_folder_exists(output_folder: str):
+ """Checks if the specified folder exists, creates it if not."""
+
+ if not os.path.exists(output_folder):
+ # Create the folder if it doesn't exist
+ os.makedirs(output_folder)
+ print(f"Created the {output_folder} folder.")
+ else:
+ print(f"The {output_folder} folder already exists.")
+
+
+def custom_regex_load(in_file: List[str], file_type: str = "allow_list"):
+ """
+ When file is loaded, update the column dropdown choices and write to relevant data states.
+ Returns a list for Dropdown components (instead of DataFrame).
+ """
+ custom_regex_list = list()
+
+ if in_file:
+ file_list = [string.name for string in in_file]
+
+ regex_file_names = [string for string in file_list if "csv" in string.lower()]
+ if regex_file_names:
+ regex_file_name = regex_file_names[0]
+ custom_regex_df = pd.read_csv(
+ regex_file_name, low_memory=False, header=None
+ )
+
+ # Select just first column and convert to list for Dropdown component
+ if not custom_regex_df.empty:
+ custom_regex_list = (
+ custom_regex_df.iloc[:, 0].dropna().astype(str).tolist()
+ )
+
+ # substitute underscores in file type
+ file_type_output = file_type.replace("_", " ")
+
+ output_text = file_type_output + " file loaded."
+ print(output_text)
+ else:
+ output_text = "No file provided."
+ # print(output_text)
+ return output_text, custom_regex_list
+
+ return output_text, custom_regex_list
+
+
+def put_columns_in_df(in_file: List[str]):
+ new_choices = []
+ concat_choices = []
+ all_sheet_names = []
+ number_of_excel_files = 0
+
+ for file in in_file:
+ file_name = file.name
+ file_type = detect_file_type(file_name)
+ # print("File type is:", file_type)
+
+ if (file_type == "xlsx") | (file_type == "xls"):
+ number_of_excel_files += 1
+ new_choices = []
+ anon_xlsx = pd.ExcelFile(file_name)
+ new_sheet_names = anon_xlsx.sheet_names
+ # Iterate through the sheet names
+ for sheet_name in new_sheet_names:
+ # Read each sheet into a DataFrame
+ df = pd.read_excel(file_name, sheet_name=sheet_name)
+
+ # Process the DataFrame (e.g., print its contents)
+ new_choices.extend(list(df.columns))
+
+ all_sheet_names.extend(new_sheet_names)
+
+ elif (file_type == "csv") | (file_type == "parquet"):
+ df = read_file(file_name)
+ new_choices = list(df.columns)
+
+ else:
+ new_choices = []
+
+ concat_choices.extend(new_choices)
+
+ # Drop duplicate columns
+ concat_choices = list(set(concat_choices))
+
+ if number_of_excel_files > 0:
+ return gr.Dropdown(
+ choices=concat_choices, value=concat_choices, visible=True
+ ), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names, visible=True)
+ else:
+ return gr.Dropdown(
+ choices=concat_choices, value=concat_choices, visible=True
+ ), gr.Dropdown(visible=False)
+
+
+def get_textract_file_suffix(handwrite_signature_checkbox: List[str] = list()) -> str:
+ """
+ Generate a suffix for textract JSON files based on the selected feature types.
+
+ Args:
+ handwrite_signature_checkbox: List of selected Textract feature types.
+ Options: "Extract signatures", "Extract forms", "Extract layout", "Extract tables"
+ "Extract handwriting" is the default and doesn't add a suffix.
+
+ Returns:
+ A suffix string like "_sig", "_form", "_sig_form", etc., or empty string if only handwriting is selected.
+ """
+ if not handwrite_signature_checkbox:
+ return ""
+
+ # Map feature types to short suffixes
+ feature_map = {
+ "Extract signatures": "sig",
+ "Extract forms": "form",
+ "Extract layout": "layout",
+ "Extract tables": "table",
+ }
+
+ # Collect suffixes for selected features (excluding handwriting which is default)
+ suffixes = []
+ for feature in handwrite_signature_checkbox:
+ if feature in feature_map:
+ suffixes.append(feature_map[feature])
+
+ # Sort alphabetically for consistent naming
+ suffixes.sort()
+
+ # Return suffix with underscore prefix if any features selected
+ if suffixes:
+ return "_" + "_".join(suffixes)
+ return ""
+
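+# Examples (illustrative):
+#   get_textract_file_suffix(["Extract handwriting"])                  # -> ""
+#   get_textract_file_suffix(["Extract signatures", "Extract forms"])  # -> "_form_sig"
+#   # so the Textract JSON would be named e.g. "<doc>_form_sig_textract.json"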
+
+def check_for_existing_textract_file(
+ doc_file_name_no_extension_textbox: str,
+ output_folder: str = OUTPUT_FOLDER,
+ handwrite_signature_checkbox: List[str] = list(),
+):
+ # Generate suffix based on checkbox options
+ suffix = get_textract_file_suffix(handwrite_signature_checkbox)
+ try:
+ safe_stem = sanitize_filename(doc_file_name_no_extension_textbox)
+ filename = safe_stem + suffix + "_textract.json"
+ textract_output_path = secure_path_join(output_folder, filename)
+ except (ValueError, PermissionError, OSError):
+ return False
+
+ if textract_output_path.exists():
+ # print("Existing Textract analysis output file found.")
+ return True
+
+ else:
+ return False
+
+
+def check_for_relevant_ocr_output_with_words(
+ doc_file_name_no_extension_textbox: str,
+ text_extraction_method: str,
+ output_folder: str = OUTPUT_FOLDER,
+):
+ if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
+ file_ending = "_ocr_results_with_words_local_text.json"
+ elif text_extraction_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION:
+ file_ending = "_ocr_results_with_words_local_ocr.json"
+ elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ file_ending = "_ocr_results_with_words_textract.json"
+ else:
+ # print("No valid text extraction method found. Returning False")
+ return False
+
+ try:
+ safe_stem = sanitize_filename(doc_file_name_no_extension_textbox)
+ doc_file_with_ending = safe_stem + file_ending
+ local_ocr_output_path = secure_path_join(output_folder, doc_file_with_ending)
+ except (ValueError, PermissionError, OSError):
+ return False
+
+ if local_ocr_output_path.exists():
+ print("Existing OCR with words analysis output file found.")
+ return True
+ else:
+ return False
+
+
+def add_folder_to_path(folder_path: str):
+ """
+ Check if a folder exists on the system. If so, resolve it to an absolute path and add it to the PATH environment variable if it is not already present. This is only relevant for locally-built executables of this app: PyInstaller creates an _internal folder containing Tesseract and Poppler, which must be on the system PATH for the app to run.
+ """
+
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
+ print(folder_path, "folder exists.")
+
+ # Resolve relative path to absolute path
+ absolute_path = os.path.abspath(folder_path)
+
+ current_path = os.environ["PATH"]
+ if absolute_path not in current_path.split(os.pathsep):
+ full_path_extension = absolute_path + os.pathsep + current_path
+ os.environ["PATH"] = full_path_extension
+ # print(f"Updated PATH with: ", full_path_extension)
+ else:
+ print(f"Directory {folder_path} already exists in PATH.")
+ else:
+ print(f"Folder not found at {folder_path} - not added to PATH")
+
+
+# Upon running a process, the feedback buttons are revealed
+def reveal_feedback_buttons():
+ is_visible = bool(SHOW_FEEDBACK_BUTTONS)
+ return (
+ gr.Radio(
+ visible=is_visible,
+ label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 80% of personally identifiable information in a given (typed) document.",
+ ),
+ gr.Textbox(visible=is_visible),
+ gr.Button(visible=is_visible),
+ gr.Markdown(visible=is_visible),
+ )
+
+
+def wipe_logs(feedback_logs_loc: str, usage_logs_loc: str):
+ try:
+ os.remove(feedback_logs_loc)
+ except Exception as e:
+ print("Could not remove feedback logs file", e)
+ try:
+ os.remove(usage_logs_loc)
+ except Exception as e:
+ print("Could not remove usage logs file", e)
+
+
+def merge_csv_files(file_list: List[str], output_folder: str = OUTPUT_FOLDER):
+
+ # Initialise an empty list to hold DataFrames
+ dataframes = []
+ output_files = []
+
+ # Loop through each file in the file list
+ for file in file_list:
+ # Read the CSV file into a DataFrame
+ df = pd.read_csv(file.name)
+ dataframes.append(df)
+
+ # Concatenate all DataFrames into a single DataFrame
+ merged_df = pd.concat(dataframes, ignore_index=True)
+
+ for col in ["xmin", "xmax", "ymin", "ymax"]:
+ merged_df[col] = np.floor(merged_df[col])
+
+ merged_df = merged_df.drop_duplicates(
+ subset=["page", "label", "color", "xmin", "ymin", "xmax", "ymax"]
+ )
+
+ merged_df = merged_df.sort_values(["page", "ymin", "xmin", "label"])
+
+ file_out_name = os.path.basename(file_list[0].name)
+
+ merged_csv_path = output_folder + file_out_name + "_merged.csv"
+
+ # Save the merged DataFrame to a CSV file
+ merged_df.to_csv(merged_csv_path, index=False, encoding="utf-8-sig")
+ output_files.append(merged_csv_path)
+
+ return output_files
+
+
+async def get_connection_params(
+ request: gr.Request,
+ output_folder_textbox: str = OUTPUT_FOLDER,
+ input_folder_textbox: str = INPUT_FOLDER,
+ session_output_folder: bool = SESSION_OUTPUT_FOLDER,
+ s3_outputs_folder_textbox: str = S3_OUTPUTS_FOLDER,
+ textract_document_upload_input_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
+ textract_document_upload_output_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
+ s3_textract_document_logs_subfolder: str = TEXTRACT_JOBS_S3_LOC,
+ local_textract_document_logs_subfolder: str = TEXTRACT_JOBS_LOCAL_LOC,
+):
+ # Convert session_output_folder to boolean if it's a string (from Gradio Textbox)
+ if isinstance(session_output_folder, str):
+ session_output_folder = convert_string_to_boolean(session_output_folder)
+
+ if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
+ if CUSTOM_HEADER in request.headers:
+ supplied_custom_header_value = request.headers[CUSTOM_HEADER]
+ if supplied_custom_header_value == CUSTOM_HEADER_VALUE:
+ print("Custom header supplied and matches CUSTOM_HEADER_VALUE")
+ else:
+ print("Custom header value does not match expected value.")
+ raise ValueError("Custom header value does not match expected value.")
+ else:
+ print("Custom header value not found.")
+ raise ValueError("Custom header value not found.")
+
+ # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
+
+ if request.username:
+ out_session_hash = request.username
+ # print("Request username found:", out_session_hash)
+
+ elif "x-cognito-id" in request.headers:
+ out_session_hash = request.headers["x-cognito-id"]
+ print("Cognito ID found:", out_session_hash)
+
+ elif "x-amzn-oidc-identity" in request.headers:
+ out_session_hash = request.headers["x-amzn-oidc-identity"]
+
+ if AWS_USER_POOL_ID:
+ try:
+ # Fetch email address using Cognito client
+ cognito_client = boto3.client("cognito-idp")
+
+ response = cognito_client.admin_get_user(
+ UserPoolId=AWS_USER_POOL_ID, # Replace with your User Pool ID
+ Username=out_session_hash,
+ )
+ email = next(
+ attr["Value"]
+ for attr in response["UserAttributes"]
+ if attr["Name"] == "email"
+ )
+ print("Cognito email address found, will be used as session hash")
+
+ out_session_hash = email
+ except (
+ ClientError,
+ NoCredentialsError,
+ PartialCredentialsError,
+ BotoCoreError,
+ ) as e:
+ print(f"Error fetching Cognito user details: {e}")
+ print("Falling back to using AWS ID as session hash")
+ # out_session_hash already set to the AWS ID from header, so no need to change it
+ except Exception as e:
+ print(f"Unexpected error when fetching Cognito user details: {e}")
+ print("Falling back to using AWS ID as session hash")
+ # out_session_hash already set to the AWS ID from header, so no need to change it
+
+ print("AWS ID found, will be used as username for session:", out_session_hash)
+
+ else:
+ out_session_hash = request.session_hash
+
+ if session_output_folder:
+ output_folder = output_folder_textbox + out_session_hash + "/"
+ input_folder = input_folder_textbox + out_session_hash + "/"
+
+ # If configured, create a session-specific S3 outputs folder using the same pattern
+ if SAVE_OUTPUTS_TO_S3 and s3_outputs_folder_textbox:
+ s3_outputs_folder = (
+ s3_outputs_folder_textbox.rstrip("/") + "/" + out_session_hash + "/"
+ )
+ else:
+ s3_outputs_folder = s3_outputs_folder_textbox
+
+ textract_document_upload_input_folder = (
+ textract_document_upload_input_folder + "/" + out_session_hash
+ )
+ textract_document_upload_output_folder = (
+ textract_document_upload_output_folder + "/" + out_session_hash
+ )
+
+ s3_textract_document_logs_subfolder = (
+ s3_textract_document_logs_subfolder + "/" + out_session_hash
+ )
+ local_textract_document_logs_subfolder = (
+ local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
+ )
+
+ else:
+ output_folder = output_folder_textbox
+ input_folder = input_folder_textbox
+ # Keep S3 outputs folder as configured (no per-session subfolder)
+ s3_outputs_folder = s3_outputs_folder_textbox
+
+ # Append today's date (YYYYMMDD/) to the final S3 outputs folder when enabled
+ if SAVE_OUTPUTS_TO_S3 and s3_outputs_folder:
+ today_suffix = datetime.now().strftime("%Y%m%d") + "/"
+ s3_outputs_folder = s3_outputs_folder.rstrip("/") + "/" + today_suffix
+
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder, exist_ok=True)
+ if not os.path.exists(input_folder):
+ os.makedirs(input_folder, exist_ok=True)
+
+ return (
+ out_session_hash,
+ output_folder,
+ out_session_hash,
+ input_folder,
+ textract_document_upload_input_folder,
+ textract_document_upload_output_folder,
+ s3_textract_document_logs_subfolder,
+ local_textract_document_logs_subfolder,
+ s3_outputs_folder,
+ )
+
+
+def clean_unicode_text(text: str, preserve_international_scripts: bool = False) -> str:
+ """
+ Normalise Unicode (NFKC), replace common smart punctuation with ASCII.
+
+ By default, non-ASCII characters are stripped (legacy behaviour for some pipelines).
+ Set ``preserve_international_scripts=True`` for OCR and extracted document text so
+ Cyrillic, Arabic, CJK, etc. are retained in outputs (CSV, JSON, redaction targets).
+ """
+ if text is None:
+ return ""
+ if not isinstance(text, str):
+ text = str(text)
+
+ # Step 1: Normalise unicode characters to decompose any special forms
+ normalized_text = unicodedata.normalize("NFKC", text)
+
+ # Step 2: Replace smart quotes and special punctuation with standard ASCII equivalents
+ replacements = {
+ "‘": "'",
+ "’": "'",
+ "“": '"',
+ "”": '"',
+ "–": "-",
+ "—": "-",
+ "…": "...",
+ "•": "*",
+ }
+
+ for old_char, new_char in replacements.items():
+ normalized_text = normalized_text.replace(old_char, new_char)
+
+ if preserve_international_scripts:
+ return normalized_text
+
+ from tools.secure_regex_utils import safe_remove_non_ascii
+
+ return safe_remove_non_ascii(normalized_text)
+
+
+# --- Helper Function for ID Generation ---
+# This function encapsulates your ID logic in a performant, batch-oriented way.
+def _generate_unique_ids(
+ num_ids_to_generate: int, existing_ids_set: Set[str]
+) -> List[str]:
+ """
+ Generates a specified number of unique, 12-character alphanumeric IDs.
+
+ This is a batch-oriented, performant version of the original
+ `fill_missing_ids_in_list` logic, designed to work efficiently
+ with DataFrames.
+
+ Args:
+ num_ids_to_generate (int): The number of unique IDs to create.
+ existing_ids_set (Set[str]): A set of IDs that are already in use and
+ should be avoided.
+
+ Returns:
+ List[str]: A list of newly generated unique IDs.
+ """
+ id_length = 12
+ character_set = string.ascii_letters + string.digits
+
+ newly_generated_ids = set()
+
+ # The while loop ensures we generate exactly the number of IDs required,
+ # automatically handling the astronomically rare case of a collision.
+ while len(newly_generated_ids) < num_ids_to_generate:
+ candidate_id = "".join(random.choices(character_set, k=id_length))
+
+ # Check against both pre-existing IDs and IDs generated in this batch
+ if (
+ candidate_id not in existing_ids_set
+ and candidate_id not in newly_generated_ids
+ ):
+ newly_generated_ids.add(candidate_id)
+
+ return list(newly_generated_ids)
+
+
+def load_all_output_files(folder_path: str = OUTPUT_FOLDER) -> List[str]:
+ """Get the file paths of all files in the given folder and its subfolders."""
+
+ safe_folder_path_resolved = Path(folder_path).resolve()
+
+ return gr.FileExplorer(
+ root_dir=safe_folder_path_resolved,
+ )
+
+
+def update_file_explorer_object():
+ return gr.FileExplorer()
+
+
+def _is_file_path(path: str) -> bool:
+ """True if path looks like a file (has a file-type suffix), not a folder."""
+ if not path or not path.strip():
+ return False
+ name = os.path.basename(path.rstrip("/\\"))
+ if not name or "." not in name:
+ return False
+ ext = name.rsplit(".", 1)[-1]
+ return bool(ext and len(ext) <= 10 and ext.isalnum())
+
+
+def all_outputs_file_download_fn(file_explorer_object: list[str]):
+ """Return only paths that are files (have a suffix like .csv, .txt), not folder paths."""
+ if not file_explorer_object:
+ return file_explorer_object
+ return [p for p in file_explorer_object if _is_file_path(p)]
+
+
+def calculate_aws_costs(
+ number_of_pages: str,
+ text_extract_method_radio: str,
+ handwrite_signature_checkbox: List[str],
+ pii_identification_method: str,
+ textract_output_found_checkbox: bool,
+ only_extract_text_radio: bool,
+ convert_to_gbp: bool = True,
+ usd_gbp_conversion_rate: float = 0.76,
+ textract_page_cost: float = 1.5 / 1000,
+ textract_signature_cost: float = 2.0 / 1000,
+ textract_forms_cost: float = 50.0 / 1000,
+ textract_layout_cost: float = 4.0 / 1000,
+ textract_tables_cost: float = 15.0 / 1000,
+ comprehend_unit_cost: float = 0.0001,
+ comprehend_size_unit_average: float = 250,
+ average_characters_per_page: float = 2000,
+ bedrock_vlm_output_token_ratio: float = 0.08,
+ bedrock_vlm_face_output_token_ratio: float = 0.03,
+ TEXTRACT_TEXT_EXTRACT_OPTION: str = TEXTRACT_TEXT_EXTRACT_OPTION,
+ BEDROCK_VLM_TEXT_EXTRACT_OPTION: str = BEDROCK_VLM_TEXT_EXTRACT_OPTION,
+ NO_REDACTION_PII_OPTION: str = NO_REDACTION_PII_OPTION,
+ AWS_PII_OPTION: str = AWS_PII_OPTION,
+ AWS_LLM_PII_OPTION: str = AWS_LLM_PII_OPTION,
+ VLM_MAX_IMAGE_SIZE: int = VLM_MAX_IMAGE_SIZE,
+ BEDROCK_VLM_INPUT_COST: float = BEDROCK_VLM_INPUT_COST,
+ BEDROCK_VLM_OUTPUT_COST: float = BEDROCK_VLM_OUTPUT_COST,
+ BEDROCK_VLM_PIXELS_PER_INPUT_TOKEN: int = BEDROCK_VLM_PIXELS_PER_INPUT_TOKEN,
+ BEDROCK_LLM_INPUT_COST: float = BEDROCK_LLM_INPUT_COST,
+ BEDROCK_LLM_OUTPUT_COST: float = BEDROCK_LLM_OUTPUT_COST,
+ BEDROCK_LLM_INPUT_TOKENS_PER_PAGE: int = BEDROCK_LLM_INPUT_TOKENS_PER_PAGE,
+ BEDROCK_LLM_OUTPUT_TOKENS_PER_PAGE: int = BEDROCK_LLM_OUTPUT_TOKENS_PER_PAGE,
+):
+ """
+ Calculate the approximate cost of submitting a document to AWS Textract and/or AWS Comprehend, assuming that Textract outputs do not already exist in the output folder.
+
+ - number_of_pages: The number of pages in the uploaded document(s).
+ - text_extract_method_radio: The method of text extraction.
+ - handwrite_signature_checkbox: Whether signatures are being extracted or not.
+ - pii_identification_method_drop: The method of personally-identifiable information removal.
+ - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
+ - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
+ - convert_to_gbp (bool, optional): Should suggested costs be converted from USD to GBP.
+ - usd_gbp_conversion_rate (float, optional): Conversion rate used for USD to GBP. Last changed 14th April 2025.
+ - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
+ - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
+ - textract_forms_cost (float, optional): AWS Textract cost per page for "Extract forms" ($50/1000 pages).
+ - textract_layout_cost (float, optional): AWS Textract cost per page for "Extract layout" ($4/1000 pages).
+ - textract_tables_cost (float, optional): AWS Textract cost per page for "Extract tables" ($15/1000 pages).
+ - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
+ - comprehend_size_unit_average (float, optional): Average size of a 'unit' of text passed to AWS Comprehend by the app through the batching process
+ - average_characters_per_page (float, optional): Average number of characters on an A4 page.
+ - bedrock_vlm_output_token_ratio (float, optional): Ratio of output to input tokens for Bedrock VLM OCR (~0.08 in practice).
+ - bedrock_vlm_face_output_token_ratio (float, optional): Ratio of output to input tokens for the face-identification second run (~0.03 in practice).
+ - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
+ - BEDROCK_VLM_TEXT_EXTRACT_OPTION (str, optional): String label for AWS Bedrock VLM OCR text extraction.
+ - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
+ - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
+ - AWS_LLM_PII_OPTION (str, optional): String label for PII identification via LLM (AWS Bedrock).
+ - VLM_MAX_IMAGE_SIZE, BEDROCK_VLM_*: used for Bedrock VLM OCR cost estimate.
+ - BEDROCK_LLM_*: used for Bedrock LLM (e.g. PII detection) cost estimate when that method is selected (2000 input / 250 output tokens per page).
+ """
+ text_extraction_cost = 0
+ pii_identification_cost = 0
+ calculated_aws_cost = 0
+ number_of_pages = int(number_of_pages)
+
+ if textract_output_found_checkbox is not True:
+ if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
+ text_extraction_cost = number_of_pages * textract_page_cost
+
+ if "Extract signatures" in handwrite_signature_checkbox:
+ text_extraction_cost += textract_signature_cost * number_of_pages
+ if "Extract forms" in handwrite_signature_checkbox:
+ text_extraction_cost += textract_forms_cost * number_of_pages
+ if "Extract layout" in handwrite_signature_checkbox:
+ text_extraction_cost += textract_layout_cost * number_of_pages
+ if "Extract tables" in handwrite_signature_checkbox:
+ text_extraction_cost += textract_tables_cost * number_of_pages
+
+ elif text_extract_method_radio == BEDROCK_VLM_TEXT_EXTRACT_OPTION:
+ # Estimate input tokens per page from max image size; output tokens ~8% of input
+ input_tokens_per_page = ceil(
+ VLM_MAX_IMAGE_SIZE / BEDROCK_VLM_PIXELS_PER_INPUT_TOKEN
+ )
+ output_tokens_per_page = (
+ input_tokens_per_page * bedrock_vlm_output_token_ratio
+ )
+ total_input_tokens = number_of_pages * input_tokens_per_page
+ total_output_tokens = number_of_pages * output_tokens_per_page
+ text_extraction_cost = total_input_tokens * (
+ BEDROCK_VLM_INPUT_COST / 1_000_000
+ ) + total_output_tokens * (BEDROCK_VLM_OUTPUT_COST / 1_000_000)
+ # Face detection does a second run per page: same input tokens, output ~3% of input
+ if "Face detection" in handwrite_signature_checkbox:
+ face_input_tokens = total_input_tokens
+ face_output_tokens = int(
+ total_input_tokens * bedrock_vlm_face_output_token_ratio
+ )
+ text_extraction_cost += face_input_tokens * (
+ BEDROCK_VLM_INPUT_COST / 1_000_000
+ ) + face_output_tokens * (BEDROCK_VLM_OUTPUT_COST / 1_000_000)
+
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
+ if pii_identification_method == AWS_PII_OPTION:
+ comprehend_page_cost = (
+ ceil(average_characters_per_page / comprehend_size_unit_average)
+ * comprehend_unit_cost
+ )
+ pii_identification_cost = comprehend_page_cost * number_of_pages
+
+ elif pii_identification_method == AWS_LLM_PII_OPTION:
+ # Bedrock LLM (e.g. PII detection): 2000 input tokens, 250 output tokens per page
+ llm_input_tokens = number_of_pages * BEDROCK_LLM_INPUT_TOKENS_PER_PAGE
+ llm_output_tokens = number_of_pages * BEDROCK_LLM_OUTPUT_TOKENS_PER_PAGE
+ pii_identification_cost = llm_input_tokens * (
+ BEDROCK_LLM_INPUT_COST / 1_000_000
+ ) + llm_output_tokens * (BEDROCK_LLM_OUTPUT_COST / 1_000_000)
+
+ calculated_aws_cost = (
+ calculated_aws_cost + text_extraction_cost + pii_identification_cost
+ )
+
+ if convert_to_gbp is True:
+ calculated_aws_cost *= usd_gbp_conversion_rate
+
+ return calculated_aws_cost
+
+
+def calculate_time_taken(
+ number_of_pages: str,
+ text_extract_method_radio: str,
+ pii_identification_method: str,
+ textract_output_found_checkbox: bool,
+ only_extract_text_radio: bool,
+ local_ocr_output_found_checkbox: bool,
+ handwrite_signature_checkbox: List[str],
+ convert_page_time: float = 0.3,
+ textract_page_time: float = 0.4,
+ comprehend_page_time: float = 0.4,
+ local_text_extraction_page_time: float = 0.2,
+ local_pii_redaction_page_time: float = 0.4,
+ local_ocr_extraction_page_time: float = 1.5,
+ TEXTRACT_TEXT_EXTRACT_OPTION: str = TEXTRACT_TEXT_EXTRACT_OPTION,
+ BEDROCK_VLM_TEXT_EXTRACT_OPTION: str = BEDROCK_VLM_TEXT_EXTRACT_OPTION,
+ SELECTABLE_TEXT_EXTRACT_OPTION: str = SELECTABLE_TEXT_EXTRACT_OPTION,
+ local_ocr_option: str = LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+ NO_REDACTION_PII_OPTION: str = NO_REDACTION_PII_OPTION,
+ AWS_PII_OPTION: str = AWS_PII_OPTION,
+ AWS_LLM_PII_OPTION: str = AWS_LLM_PII_OPTION,
+):
+ """
+ Calculate the approximate time to redact a document.
+
+ - number_of_pages: The number of pages in the uploaded document(s).
+ - text_extract_method_radio: The method of text extraction.
+ - pii_identification_method_drop: The method of personally-identifiable information removal.
+ - textract_output_found_checkbox (bool, optional): Boolean indicating if AWS Textract text extraction outputs have been found.
+ - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
+ - local_ocr_output_found_checkbox (bool, optional): Boolean indicating if local OCR text extraction outputs have been found.
+ - handwrite_signature_checkbox: List of selected options (e.g. "Face detection"); when Face detection is selected with Bedrock VLM, extraction time is doubled.
+ - textract_page_time (float, optional): Approximate time to query AWS Textract (also used for Bedrock VLM OCR).
+ - comprehend_page_time (float, optional): Approximate time to query text on a page with AWS Comprehend.
+ - local_text_redaction_page_time (float, optional): Approximate time to extract text on a page with the local text redaction option.
+ - local_pii_redaction_page_time (float, optional): Approximate time to redact text on a page with the local text redaction option.
+ - local_ocr_extraction_page_time (float, optional): Approximate time to extract text from a page with the local OCR redaction option.
+ - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
+ - SELECTABLE_TEXT_EXTRACT_OPTION (str, optional): String label for text_extract_method_radio for text extraction.
+ - local_ocr_option (str, optional): String label for text_extract_method_radio for local OCR.
+ - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
+ - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
+ - BEDROCK_VLM_TEXT_EXTRACT_OPTION, AWS_LLM_PII_OPTION (str, optional): Labels for Bedrock VLM OCR and LLM PII; times match Textract and Comprehend respectively.
+ """
+ calculated_time_taken = 0
+ page_conversion_time_taken = 0
+ page_extraction_time_taken = 0
+ page_redaction_time_taken = 0
+
+ number_of_pages = int(number_of_pages)
+
+ # Page preparation/conversion to image time
+ if (text_extract_method_radio != SELECTABLE_TEXT_EXTRACT_OPTION) and (
+ textract_output_found_checkbox is not True
+ ):
+ page_conversion_time_taken = number_of_pages * convert_page_time
+
+ # Page text extraction time
+ if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
+ if textract_output_found_checkbox is not True:
+ page_extraction_time_taken = number_of_pages * textract_page_time
+ elif text_extract_method_radio == BEDROCK_VLM_TEXT_EXTRACT_OPTION:
+ if textract_output_found_checkbox is not True:
+ page_extraction_time_taken = number_of_pages * textract_page_time
+ if "Face detection" in (handwrite_signature_checkbox or []):
+ page_extraction_time_taken *= 2
+ elif text_extract_method_radio == local_ocr_option:
+ if local_ocr_output_found_checkbox is not True:
+ page_extraction_time_taken = (
+ number_of_pages * local_ocr_extraction_page_time
+ )
+ elif text_extract_method_radio == SELECTABLE_TEXT_EXTRACT_OPTION:
+ page_conversion_time_taken = number_of_pages * local_text_extraction_page_time
+
+ # Page redaction time (Bedrock LLM PII uses same time as AWS Comprehend)
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
+ if pii_identification_method in (AWS_PII_OPTION, AWS_LLM_PII_OPTION):
+ page_redaction_time_taken = number_of_pages * comprehend_page_time
+ else:
+ page_redaction_time_taken = number_of_pages * local_pii_redaction_page_time
+
+ calculated_time_taken = (
+ page_conversion_time_taken
+ + page_extraction_time_taken
+ + page_redaction_time_taken
+ ) / 60
+
+ return calculated_time_taken
+
+
+def reset_base_dataframe(df: pd.DataFrame):
+ return df
+
+
+def reset_ocr_base_dataframe(df: pd.DataFrame):
+ if df.empty:
+ print("OCR base dataframe is empty, returning empty dataframe")
+ return pd.DataFrame(columns=["page", "line", "text"])
+ else:
+ return df.loc[:, ["page", "line", "text"]]
+
+
+def reset_ocr_with_words_base_dataframe(
+ df: pd.DataFrame, page_entity_dropdown_redaction_value: str
+):
+ """
+ Resets and prepares the OCR dataframe with word-level details for a specific page.
+
+ Args:
+ df (pd.DataFrame): The original dataframe that may contain word-level OCR results,
+ with at least a 'page' column.
+ page_entity_dropdown_redaction_value (str): The currently selected page value
+ to filter the dataframe on, for redaction view.
+
+ Returns:
+ Tuple[pd.DataFrame, pd.DataFrame]: A tuple of (filtered dataframe for selected page, full dataframe after processing).
+ """
+
+ if "page" not in df.columns:
+ print(
+ "OCR with words dataframe does not contain page column, returning empty dataframe"
+ )
+ df_out = pd.DataFrame(
+ columns=[
+ "page",
+ "line",
+ "word_text",
+ "word_x0",
+ "word_y0",
+ "word_x1",
+ "word_y1",
+ "index",
+ ]
+ )
+ df_filtered_out = pd.DataFrame(
+ columns=[
+ "page",
+ "line",
+ "word_text",
+ "index",
+ ]
+ )
+ return df_filtered_out, df_out
+
+ df.reset_index(drop=True, inplace=True)
+ df["index"] = df.index
+
+ output_df_base = df.copy()
+
+ if output_df_base.empty:
+ print("OCR with words dataframe is empty, returning empty dataframe")
+
+ df["page"] = df["page"].astype(str)
+
+ output_df_filtered = df.loc[
+ df["page"] == str(page_entity_dropdown_redaction_value),
+ [
+ "page",
+ "line",
+ "word_text",
+ "index",
+ ],
+ ]
+
+ if output_df_filtered.empty:
+ print("No OCR results found for page, returning empty dataframe")
+
+ return output_df_filtered, output_df_base
+
+
+def update_language_dropdown(
+ chosen_language_full_name_drop,
+ textract_language_choices=textract_language_choices,
+ aws_comprehend_language_choices=aws_comprehend_language_choices,
+ LANGUAGE_MAP=LANGUAGE_MAP,
+):
+
+ try:
+ full_language_name = chosen_language_full_name_drop.lower()
+ matched_language = LANGUAGE_MAP[full_language_name]
+
+ chosen_language_drop = gr.Dropdown(
+ value=matched_language,
+ choices=LANGUAGE_CHOICES,
+ label="Chosen language short code",
+ multiselect=False,
+ visible=True,
+ )
+
+ if (
+ matched_language not in aws_comprehend_language_choices
+ and matched_language not in textract_language_choices
+ ):
+ gr.Info(
+ f"Note that {full_language_name} is not supported by AWS Comprehend or AWS Textract"
+ )
+ elif matched_language not in aws_comprehend_language_choices:
+ gr.Info(
+ f"Note that {full_language_name} is not supported by AWS Comprehend"
+ )
+ elif matched_language not in textract_language_choices:
+ gr.Info(f"Note that {full_language_name} is not supported by AWS Textract")
+ except Exception as e:
+ print(e)
+ gr.Info("Could not find language in list")
+ chosen_language_drop = gr.Dropdown(
+ value=DEFAULT_LANGUAGE,
+ choices=LANGUAGE_CHOICES,
+ label="Chosen language short code",
+ multiselect=False,
+ )
+
+ return chosen_language_drop
+
+
+def get_system_font_path():
+ """
+ Returns the path to a standard font that exists on most operating systems.
+ Used to replace PaddleOCR's default fonts (simfang.ttf, PingFang-SC-Regular.ttf).
+
+ Returns:
+ str: Path to a system font, or None if no suitable font found
+ """
+ system = platform.system()
+
+ # Windows font paths
+ if system == "Windows":
+ windows_fonts = [
+ os.path.join(
+ os.environ.get("WINDIR", "C:\\Windows"), "Fonts", "simsun.ttc"
+ ), # SimSun
+ os.path.join(
+ os.environ.get("WINDIR", "C:\\Windows"), "Fonts", "msyh.ttc"
+ ), # Microsoft YaHei
+ os.path.join(
+ os.environ.get("WINDIR", "C:\\Windows"), "Fonts", "arial.ttf"
+ ), # Arial (fallback)
+ ]
+ for font_path in windows_fonts:
+ if os.path.exists(font_path):
+ return font_path
+
+ # macOS font paths
+ elif system == "Darwin":
+ mac_fonts = [
+ "/System/Library/Fonts/STSong.ttc",
+ "/System/Library/Fonts/STHeiti Light.ttc",
+ "/System/Library/Fonts/Helvetica.ttc",
+ ]
+ for font_path in mac_fonts:
+ if os.path.exists(font_path):
+ return font_path
+
+ # Linux font paths
+ elif system == "Linux":
+ linux_fonts = [
+ "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
+ "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf",
+ "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+ ]
+ for font_path in linux_fonts:
+ if os.path.exists(font_path):
+ return font_path
+
+ return None
+
+
+def get_ocr_visualisation_font_path():
+ """
+ Path to a TrueType/OpenType font for drawing recognised OCR text on debug images.
+
+ OpenCV's Hershey fonts (cv2.putText) are effectively Latin-only; other scripts show as '?'.
+ PIL + this font supports Cyrillic, CJK, Arabic in typical OS installs (Segoe UI, Noto, etc.).
+ """
+ system = platform.system()
+
+ if system == "Windows":
+ fonts_dir = os.path.join(os.environ.get("WINDIR", r"C:\Windows"), "Fonts")
+ for name in (
+ "segoeui.ttf",
+ "calibri.ttf",
+ "msyh.ttc",
+ "simsun.ttc",
+ "arial.ttf",
+ "times.ttf",
+ ):
+ p = os.path.join(fonts_dir, name)
+ if os.path.isfile(p):
+ return p
+
+ elif system == "Darwin":
+ for p in (
+ "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
+ "/Library/Fonts/Arial Unicode.ttf",
+ "/System/Library/Fonts/PingFang.ttc",
+ "/System/Library/Fonts/Hiragino Sans GB.ttc",
+ "/System/Library/Fonts/Helvetica.ttc",
+ ):
+ if os.path.isfile(p):
+ return p
+
+ elif system == "Linux":
+ for p in (
+ "/usr/share/fonts/truetype/noto/NotoSans-Regular.ttf",
+ "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
+ "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
+ "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
+ "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+ ):
+ if os.path.isfile(p):
+ return p
+
+ return get_system_font_path()
+
+
+# Custom logging filter to remove logs from healthiness/readiness endpoints so they don't fill up application log flow
+class EndpointFilter(logging.Filter):
+ def __init__(self, path: str, *args, **kwargs):
+ self._path = path
+ super().__init__(*args, **kwargs)
+
+ def filter(self, record: logging.LogRecord) -> bool:
+ return record.getMessage().find(self._path) == -1
+
+
+# 2. Define the lifespan context manager
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+ # --- STARTUP LOGIC ---
+ # Filter out /health logging to declutter ECS logs
+ uvicorn_access_logger = logging.getLogger("uvicorn.access")
+ uvicorn_access_logger.addFilter(EndpointFilter(path="/health"))
+
+ # Yield control back to the application
+ yield
+
+ pass
+
+
+def check_duplicate_pages_checkbox(redact_duplicate_pages_checkbox_value: bool):
+ if not redact_duplicate_pages_checkbox_value:
+ # Silently raise an error to avoid showing a popup
+ return
+ if redact_duplicate_pages_checkbox_value:
+ print("Identifying duplicates")
+ sys.tracebacklimit = 0 # Suppress traceback
+ gr.Info("Redact duplicate pages checkbox is enabled, Identifying duplicates")
+ raise gr.Error(
+ message="Redact duplicate pages checkbox is enabled, identifying duplicates.",
+ title="Finding duplicates...",
+ visible=False,
+ print_exception=False,
+ )
+
+
+# Tab switch functions
+def change_tab_to_tabular_or_document_redactions(is_data_file):
+ if is_data_file:
+ return gr.Tabs(selected=5)
+ else:
+ return gr.Tabs(selected=1)
+
+
+def change_tab_to_review_redactions():
+ return gr.Tabs(selected=2)
+
+
+### Examples functions
+def show_info_box_on_click(
+ in_doc_files,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ handwrite_signature_checkbox,
+ in_redact_entities,
+ in_redact_comprehend_entities,
+ prepared_pdf_state,
+ doc_full_file_name_textbox,
+ doc_file_name_no_extension_textbox,
+ in_deny_list,
+ in_deny_list_state,
+ in_fully_redacted_list,
+ in_fully_redacted_list_state,
+ total_pdf_page_count,
+):
+ gr.Info(
+ "Example data loaded. Now click on 'Extract text and redact document' on the Redact PDFs/images tab to run the example redaction."
+ )
+
+ # Convert deny_list_state, allow_list_state, and fully_redacted_list_state to lists if they are DataFrames
+ # Handle deny_list_state
+ deny_list_walkthrough = []
+ if isinstance(in_deny_list_state, pd.DataFrame):
+ # Explicitly convert empty DataFrame to empty list
+ if in_deny_list_state.empty:
+ deny_list_walkthrough = []
+ else:
+ deny_list_walkthrough = (
+ in_deny_list_state.iloc[:, 0].dropna().astype(str).tolist()
+ )
+ elif isinstance(in_deny_list_state, list):
+ deny_list_walkthrough = (
+ [str(item) for item in in_deny_list_state if item]
+ if in_deny_list_state
+ else []
+ )
+ else:
+ # Default to empty list for any other type
+ deny_list_walkthrough = []
+
+ # Handle fully_redacted_list_state
+ fully_redacted_list_walkthrough = []
+ if isinstance(in_fully_redacted_list_state, pd.DataFrame):
+ # Explicitly convert empty DataFrame to empty list
+ if in_fully_redacted_list_state.empty:
+ fully_redacted_list_walkthrough = []
+ else:
+ fully_redacted_list_walkthrough = (
+ in_fully_redacted_list_state.iloc[:, 0].dropna().astype(str).tolist()
+ )
+ elif isinstance(in_fully_redacted_list_state, list):
+ fully_redacted_list_walkthrough = (
+ [str(item) for item in in_fully_redacted_list_state if item]
+ if in_fully_redacted_list_state
+ else []
+ )
+ else:
+ # Default to empty list for any other type
+ fully_redacted_list_walkthrough = []
+
+ # Allow list is not in examples, so always set to empty list
+ allow_list_walkthrough = []
+
+ # Use default local OCR method - examples don't set this directly
+ local_ocr_method = DEFAULT_LOCAL_OCR_MODEL
+
+ # Update visibility of main PII entity components based on selected PII method
+ # This ensures visibility is correct even when clicking examples with the same PII method
+ # Determine visibility based on PII method (same logic as handle_main_pii_method_selection)
+ is_no_redaction = pii_identification_method_drop == NO_REDACTION_PII_OPTION
+ show_local_entities = (
+ not is_no_redaction and pii_identification_method_drop == LOCAL_PII_OPTION
+ )
+ show_comprehend_entities = (
+ not is_no_redaction and pii_identification_method_drop == AWS_PII_OPTION
+ )
+ is_llm_method = not is_no_redaction and (
+ pii_identification_method_drop == LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ or pii_identification_method_drop == INFERENCE_SERVER_PII_OPTION
+ or pii_identification_method_drop == AWS_LLM_PII_OPTION
+ )
+
+ # Create updates with both value and visibility for main components
+ main_local_entities_update = gr.update(
+ value=in_redact_entities,
+ visible=show_local_entities,
+ )
+ main_comprehend_entities_update = gr.update(
+ value=in_redact_comprehend_entities,
+ visible=show_comprehend_entities,
+ )
+ main_llm_entities_update = gr.update(
+ visible=is_llm_method,
+ )
+ main_llm_instructions_update = gr.update(
+ visible=True,
+ )
+
+ # Set visibility on walkthrough entity dropdowns so they match PII method after example load
+ walkthrough_local_update = gr.update(
+ value=in_redact_entities, visible=show_local_entities
+ )
+ walkthrough_comprehend_update = gr.update(
+ value=in_redact_comprehend_entities, visible=show_comprehend_entities
+ )
+
+ return (
+ gr.File(value=in_doc_files, visible=True), # walkthrough_file_input
+ walkthrough_local_update, # walkthrough_in_redact_entities
+ walkthrough_comprehend_update, # walkthrough_in_redact_comprehend_entities
+ gr.Radio(
+ value=text_extract_method_radio, visible=True
+ ), # walkthrough_text_extract_method_radio
+ gr.Radio(
+ value=local_ocr_method, visible=True
+ ), # walkthrough_local_ocr_method_radio
+ gr.CheckboxGroup(
+ value=handwrite_signature_checkbox, visible=True
+ ), # walkthrough_handwrite_signature_checkbox
+ gr.Radio(
+ value=pii_identification_method_drop, visible=True
+ ), # walkthrough_pii_identification_method_drop
+ gr.Dropdown(
+ value=allow_list_walkthrough, visible=True
+ ), # walkthrough_allow_list_state
+ gr.Dropdown(
+ value=deny_list_walkthrough, visible=True
+ ), # walkthrough_deny_list_state
+ gr.Dropdown(
+ value=fully_redacted_list_walkthrough, visible=True
+ ), # walkthrough_fully_redacted_list_state
+ main_local_entities_update, # in_redact_entities (main component)
+ main_comprehend_entities_update, # in_redact_comprehend_entities (main component)
+ main_llm_entities_update, # in_redact_llm_entities (main component)
+ main_llm_instructions_update, # custom_llm_instructions_textbox (main component)
+ )
+
+
+def show_info_box_on_click_ocr_examples(
+ in_doc_files,
+ text_extract_method_radio,
+ pii_identification_method_drop,
+ handwrite_signature_checkbox,
+ prepared_pdf_state,
+ doc_full_file_name_textbox,
+ doc_file_name_no_extension_textbox,
+ total_pdf_page_count,
+ page_min,
+ page_max,
+ local_ocr_method_radio,
+ in_redact_entities,
+ in_redact_llm_entities,
+ custom_llm_instructions_textbox,
+):
+ gr.Info(
+ "Example OCR data loaded. Now click on 'Extract text and redact document' on the Redact PDFs/images tab to run the OCR analysis."
+ )
+
+ is_no_redaction = pii_identification_method_drop == NO_REDACTION_PII_OPTION
+ show_local_entities = (
+ not is_no_redaction and pii_identification_method_drop == LOCAL_PII_OPTION
+ )
+ is_llm_method = not is_no_redaction and (
+ pii_identification_method_drop == LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ or pii_identification_method_drop == INFERENCE_SERVER_PII_OPTION
+ or pii_identification_method_drop == AWS_LLM_PII_OPTION
+ )
+
+ main_local_entities_update = gr.update(
+ value=in_redact_entities,
+ visible=show_local_entities,
+ )
+
+ main_llm_entities_update = gr.update(
+ value=in_redact_llm_entities,
+ visible=is_llm_method,
+ )
+ main_llm_instructions_update = gr.update(
+ value=custom_llm_instructions_textbox,
+ visible=True,
+ )
+
+ return (
+ gr.File(value=in_doc_files, visible=True), # walkthrough_file_input
+ main_local_entities_update, # walkthrough_in_redact_entities
+ gr.Radio(
+ value=text_extract_method_radio, visible=True
+ ), # walkthrough_text_extract_method_radio
+ gr.Radio(
+ value=local_ocr_method_radio, visible=True
+ ), # walkthrough_local_ocr_method_radio
+ gr.CheckboxGroup(
+ value=handwrite_signature_checkbox, visible=True
+ ), # walkthrough_handwrite_signature_checkbox
+ gr.Radio(
+ value=pii_identification_method_drop, visible=True
+ ), # walkthrough_pii_identification_method_drop
+ main_llm_entities_update, # walkthrough_in_redact_llm_entities
+ main_llm_instructions_update, # walkthrough_custom_llm_instructions_textbox
+ main_llm_entities_update, # in_redact_llm_entities (main component)
+ main_llm_instructions_update, # custom_llm_instructions_textbox (main component)
+ )
+
+
+def show_duplicate_info_box_on_click(
+ in_duplicate_pages,
+ duplicate_threshold_input,
+ min_word_count_input,
+ combine_page_text_for_duplicates_bool,
+):
+ gr.Info(
+ "Example data loaded. Now click on 'Identify duplicate pages/subdocuments' on the Identify duplicate pages tab to run the example duplicate detection."
+ )
+
+
+def show_tabular_info_box_on_click(
+ in_data_files,
+ in_colnames,
+ pii_identification_method_drop_tabular,
+ anon_strategy,
+ in_tabular_duplicate_files,
+ tabular_text_columns,
+ tabular_min_word_count,
+):
+ gr.Info(
+ "Example data loaded. Now click on 'Redact text/data files' or 'Find duplicate cells/rows' on the Word or Excel/CSV files tab to run the example."
+ )
+
+ return (
+ gr.File(value=in_data_files), # walkthrough_file_input
+ gr.Radio(
+ value=pii_identification_method_drop_tabular
+ ), # walkthrough_pii_identification_method_drop_tabular
+ gr.Radio(value=anon_strategy), # walkthrough_anon_strategy
+ )
+
+
+# Dynamic visibility handlers for main redaction tab (run regardless of SHOW_COSTS)
+# Automatically set local_ocr_method_radio to "bedrock-vlm" when AWS Bedrock VLM is selected
+def auto_set_local_ocr_for_bedrock_vlm(text_extract_method):
+ """Automatically set local OCR method to bedrock-vlm when AWS Bedrock VLM is selected."""
+ if text_extract_method == BEDROCK_VLM_TEXT_EXTRACT_OPTION:
+ # Only set if "bedrock-vlm" is a valid option
+ if "bedrock-vlm" in LOCAL_OCR_MODEL_OPTIONS:
+ return gr.update(value="bedrock-vlm")
+ return gr.update()
diff --git a/tools/llm_entity_detection.py b/tools/llm_entity_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..1054e3faa45cae457fea5c1101f1f2f28ed6c120
--- /dev/null
+++ b/tools/llm_entity_detection.py
@@ -0,0 +1,1167 @@
+"""
+LLM-based entity detection using AWS Bedrock.
+This module provides functions to detect PII entities using LLMs instead of AWS llm.
+"""
+
+import json
+import os
+import re
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
+
+import boto3
+from gradio import Progress
+
+from tools.config import (
+ CHOSEN_LLM_PII_INFERENCE_METHOD,
+ CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE,
+ CLOUD_LLM_PII_MODEL_CHOICE,
+ INFERENCE_SERVER_API_URL,
+ LLM_MAX_NEW_TOKENS,
+ LLM_TEMPERATURE,
+ model_name_map,
+)
+from tools.llm_entity_detection_prompts import (
+ create_entity_detection_prompt,
+ create_entity_detection_system_prompt,
+)
+
+# Max length for column/sheet name in tabular log filenames (to keep filenames short)
+LLM_LOG_TABULAR_NAME_MAX_LEN = 25
+
+# Import LLM functions from local tools.llm_funcs
+try:
+ # Use send_request from llm_funcs.py which handles all model sources, retries, and response parsing
+
+ from tools.llm_funcs import (
+ send_request,
+ )
+except ImportError as e:
+ print(f"Warning: Could not import LLM functions: {e}")
+ print("LLM-based entity detection will not be available.")
+ print("Please ensure llm_funcs.py is in the tools folder.")
+ call_aws_bedrock = None
+ construct_azure_client = None
+ ResponseObject = None
+
+
+def _find_text_in_passage(
+ search_text: str,
+ original_text: str,
+ reported_offset: Optional[int] = None,
+ start_from: int = 0,
+) -> Optional[Tuple[int, int]]:
+ """
+ Find the position of search_text in original_text and return (begin, end) offsets.
+
+ Only considers occurrences at or after start_from. This allows a "first pass" where
+ each entity is matched starting after the previous entity's end, so repeated phrases
+ (e.g. "University of Notre Dame" vs "University" + "of Notre Dame") map to the
+ correct occurrence.
+
+ Args:
+ search_text: The text to search for
+ original_text: The text to search in
+ reported_offset: Optional offset reported by LLM (used to disambiguate multiple matches)
+ start_from: Only consider matches at or after this position (default 0).
+
+ Returns:
+ Tuple of (begin_offset, end_offset) if found, None otherwise
+ """
+ if not search_text:
+ return None
+
+ def first_or_closest(
+ positions: List[int], length: int
+ ) -> Optional[Tuple[int, int]]:
+ candidates = [p for p in positions if p >= start_from]
+ if not candidates:
+ return None
+ if reported_offset is not None:
+ closest_pos = min(candidates, key=lambda p: abs(p - reported_offset))
+ else:
+ closest_pos = min(candidates)
+ return (closest_pos, closest_pos + length)
+
+ # Clean search text - remove trailing ellipsis that LLM might add
+ search_text_clean = search_text.rstrip("...").strip()
+
+ # Find all occurrences of the exact text
+ all_positions = []
+ start = 0
+ while True:
+ pos = original_text.find(search_text, start)
+ if pos == -1:
+ break
+ all_positions.append(pos)
+ start = pos + 1
+
+ if all_positions:
+ result = first_or_closest(all_positions, len(search_text))
+ if result is not None:
+ return result
+
+ # Try with cleaned text (without ellipsis) if original didn't match
+ if search_text_clean != search_text:
+ all_positions_clean = []
+ start = 0
+ while True:
+ pos = original_text.find(search_text_clean, start)
+ if pos == -1:
+ break
+ all_positions_clean.append(pos)
+ start = pos + 1
+
+ if all_positions_clean:
+ result = first_or_closest(all_positions_clean, len(search_text_clean))
+ if result is not None:
+ return result
+
+ # Try case-insensitive match
+ search_text_lower = search_text.lower()
+ original_text_lower = original_text.lower()
+ all_positions_lower = []
+ start = 0
+ while True:
+ pos = original_text_lower.find(search_text_lower, start)
+ if pos == -1:
+ break
+ all_positions_lower.append(pos)
+ start = pos + 1
+
+ if all_positions_lower:
+ result = first_or_closest(all_positions_lower, len(search_text))
+ if result is not None:
+ return result
+
+ # Try case-insensitive match with cleaned text
+ if search_text_clean != search_text:
+ search_text_clean_lower = search_text_clean.lower()
+ all_positions_clean_lower = []
+ start = 0
+ while True:
+ pos = original_text_lower.find(search_text_clean_lower, start)
+ if pos == -1:
+ break
+ all_positions_clean_lower.append(pos)
+ start = pos + 1
+
+ if all_positions_clean_lower:
+ result = first_or_closest(all_positions_clean_lower, len(search_text_clean))
+ if result is not None:
+ return result
+
+ return None
+
+
+def _find_all_text_in_passage(
+ search_text: str, original_text: str
+) -> List[Tuple[int, int]]:
+ """
+ Find all positions of search_text in original_text and return a list of (begin, end) offsets.
+ Uses the same search strategy as _find_text_in_passage (exact, then cleaned, then case-insensitive).
+ LLM offset values are never used; positions come only from search.
+
+ Returns:
+ List of (begin_offset, end_offset) tuples, sorted by begin_offset (ascending).
+ """
+ if not search_text:
+ return []
+
+ search_text_clean = search_text.rstrip("...").strip()
+
+ def find_all_exact(needle: str, haystack: str) -> List[Tuple[int, int]]:
+ result = []
+ start = 0
+ while True:
+ pos = haystack.find(needle, start)
+ if pos == -1:
+ break
+ result.append((pos, pos + len(needle)))
+ start = pos + 1
+ return result
+
+ positions = find_all_exact(search_text, original_text)
+ if positions:
+ return sorted(positions, key=lambda p: p[0])
+
+ if search_text_clean != search_text:
+ positions = find_all_exact(search_text_clean, original_text)
+ if positions:
+ return sorted(positions, key=lambda p: p[0])
+
+ # Case-insensitive
+ needle_lower = search_text.lower()
+ haystack_lower = original_text.lower()
+ positions = find_all_exact(needle_lower, haystack_lower)
+ if positions:
+ # Return (start, start + len(search_text)) so length matches original entity text
+ return sorted(
+ [(p[0], p[0] + len(search_text)) for p in positions], key=lambda p: p[0]
+ )
+
+ if search_text_clean != search_text:
+ needle_clean_lower = search_text_clean.lower()
+ positions = find_all_exact(needle_clean_lower, haystack_lower)
+ if positions:
+ return sorted(
+ [(p[0], p[0] + len(search_text_clean)) for p in positions],
+ key=lambda p: p[0],
+ )
+
+ return []
+
+
+def _entity_get(obj: Dict[str, Any], key: str, default: Any = None) -> Any:
+ """Get value from entity dict with case-insensitive key lookup (e.g. BeginOffset vs beginOffset)."""
+ key_lower = key.lower()
+ for k, v in obj.items():
+ if k.lower() == key_lower:
+ return v
+ return default
+
+
+def parse_llm_entity_response(
+ response_text: str,
+ original_text: str,
+) -> List[Dict[str, Any]]:
+ """
+ Parse LLM response and extract entity information.
+ LLM BeginOffset/EndOffset are used only to define order. Positions are
+ resolved by a first-pass text search: for each entity (in reported order),
+ search for the entity's Text in the passage starting from the end of the
+ preceding entity's resolved span. If not found there, search from the
+ start of the passage. This ensures repeated phrases (e.g. "University of
+ Notre Dame" once, then "University" and "of Notre Dame" separately) map
+ to the correct occurrence and avoid duplicate redaction boxes.
+
+ Args:
+ response_text: The LLM response text (should contain JSON)
+ original_text: The original text that was analyzed (for validation)
+
+ Returns:
+ List of entity dictionaries with keys: Type, BeginOffset, EndOffset, Score, Text
+ """
+ entities_out: List[Dict[str, Any]] = []
+
+ # Remove tags and their content (common in some LLM outputs)
+ # This handles cases where LLMs include thinking/reasoning tags
+ response_text = re.sub(
+ r".*?", "", response_text, flags=re.DOTALL | re.IGNORECASE
+ )
+ response_text = re.sub(
+ r".*?", "", response_text, flags=re.DOTALL | re.IGNORECASE
+ )
+
+ # Prefer extracting from markdown code block (e.g. ```json\n...\n```)
+ # so we get a clean slice and can strip trailing tokens before parsing
+ json_str = None
+ if "```json" in response_text or "```" in response_text:
+ code_block = re.search(
+ r"```(?:json)?\s*\n?(.*?)(?:\n?```|$)", response_text, re.DOTALL
+ )
+ if code_block:
+ candidate = code_block.group(1).strip()
+ # Strip trailing tokens that some models append (e.g. )
+ candidate = re.sub(r"\s*$", "", candidate, flags=re.IGNORECASE)
+ candidate = candidate.rstrip()
+ # Extract only the root JSON object by brace matching so we never include trailing garbage
+ start = candidate.find("{")
+ if start >= 0:
+ depth = 0
+ for i in range(start, len(candidate)):
+ if candidate[i] == "{":
+ depth += 1
+ elif candidate[i] == "}":
+ depth -= 1
+ if depth == 0:
+ json_str = candidate[start : i + 1]
+ break
+ if json_str is None:
+ json_str = candidate[start:] # fallback: from first { to end
+
+ # Fallback: try regex-based extraction (fragile for nested braces)
+ if json_str is None:
+ json_match = re.search(
+ r'\{[^{}]*"entities"[^{}]*\[.*?\].*?\}', response_text, re.DOTALL
+ )
+ if not json_match:
+ json_match = re.search(r'\{.*?"entities".*?\}', response_text, re.DOTALL)
+ if json_match:
+ json_str = json_match.group(0)
+
+ if json_str:
+ try:
+ # Clean up the JSON string (in case we came from regex path)
+ json_str = json_str.strip()
+ # Remove markdown code block markers if present (regex path may include them)
+ json_str = re.sub(r"^```json\s*", "", json_str, flags=re.MULTILINE)
+ json_str = re.sub(r"^```\s*", "", json_str, flags=re.MULTILINE)
+ # Strip trailing tokens again (e.g. after closing })
+ json_str = re.sub(r"\s*$", "", json_str, flags=re.IGNORECASE)
+ json_str = json_str.strip()
+ # Keep only the root object if trailing garbage remains (brace-match from start)
+ start = json_str.find("{")
+ if start >= 0:
+ depth = 0
+ for i in range(start, len(json_str)):
+ if json_str[i] == "{":
+ depth += 1
+ elif json_str[i] == "}":
+ depth -= 1
+ if depth == 0:
+ json_str = json_str[start : i + 1]
+ break
+
+ # Fix common JSON issues:
+ # 1. Remove trailing commas before closing brackets/braces
+ json_str = re.sub(r",\s*}", "}", json_str)
+ json_str = re.sub(r",\s*]", "]", json_str)
+
+ # 2. Fix unquoted string values (e.g., "Type": NAME should be "Type": "NAME")
+ # This handles cases where LLMs output unquoted identifiers as values
+ # Pattern: "key": VALUE where VALUE is an unquoted identifier
+ def fix_unquoted_value(match):
+ key_part = match.group(1) # The key (e.g., "Type")
+ value = match.group(2) # The unquoted value
+ separator = match.group(3) # The separator (comma, closing brace, etc.)
+ # Only fix if it looks like an identifier (alphanumeric/underscore, not a number or boolean)
+ if re.match(
+ r"^[A-Za-z_][A-Za-z0-9_]*$", value
+ ) and value.lower() not in ["true", "false", "null"]:
+ return f'{key_part}: "{value}"{separator}'
+ return match.group(0) # Return original if it doesn't need fixing
+
+ # Fix unquoted string values after colons (common in LLM outputs)
+ # Match: "key": VALUE where VALUE is unquoted identifier followed by comma, }, or ]
+ # This pattern handles: "Type": NAME, or "Type": EMAIL_ADDRESS}
+ json_str = re.sub(
+ r'("[\w]+")\s*:\s*([A-Za-z_][A-Za-z0-9_]*)\s*([,}\]])',
+ fix_unquoted_value,
+ json_str,
+ )
+
+ # Also handle cases where unquoted value is at end of line or followed by newline
+ json_str = re.sub(
+ r'("[\w]+")\s*:\s*([A-Za-z_][A-Za-z0-9_]*)\s*(\n)',
+ r'\1: "\2"\3',
+ json_str,
+ )
+
+ # Final trim: strip trailing whitespace, control chars, backticks, and truncate to root object only
+ # (avoids "Expecting ',' delimiter" when trailing \r, ```, , or other bytes remain)
+ json_str = json_str.rstrip().rstrip("\r\t")
+ json_str = re.sub(r"[ \t\r\n]+$", "", json_str)
+ json_str = re.sub(r"`+$", "", json_str)
+ json_str = re.sub(r"\s*$", "", json_str, flags=re.IGNORECASE)
+ json_str = json_str.rstrip()
+ start = json_str.find("{")
+ if start >= 0:
+ depth = 0
+ for i in range(start, len(json_str)):
+ if json_str[i] == "{":
+ depth += 1
+ elif json_str[i] == "}":
+ depth -= 1
+ if depth == 0:
+ json_str = json_str[start : i + 1]
+ break
+
+ # Try to parse the JSON
+ try:
+ data = json.loads(json_str)
+ except json.JSONDecodeError as e:
+ # If parsing still fails, try a more aggressive fix for unquoted values
+ # This is a fallback that quotes any unquoted identifiers after colons
+ print(
+ f"Initial JSON parse failed: {e}. Attempting more aggressive fixes..."
+ )
+
+ # More aggressive fix: quote any unquoted word after a colon that's not already quoted
+ # Pattern: ": WORD" where WORD is not in quotes and not a number/boolean
+ def quote_unquoted_identifier(match):
+ prefix = match.group(1) # Everything before the colon
+ value = match.group(2) # The unquoted value
+ suffix = match.group(3) # Everything after (comma, brace, etc.)
+ # Only quote if it's a valid identifier and not a boolean/null
+ if re.match(
+ r"^[A-Za-z_][A-Za-z0-9_]*$", value
+ ) and value.lower() not in ["true", "false", "null"]:
+ return f'{prefix}: "{value}"{suffix}'
+ return match.group(0)
+
+ # Try fixing unquoted values more aggressively
+ json_str = re.sub(
+ r"(:\s*)([A-Za-z_][A-Za-z0-9_]*)(\s*[,}\]])",
+ quote_unquoted_identifier,
+ json_str,
+ )
+
+ # Try parsing again
+ try:
+ data = json.loads(json_str)
+ except json.JSONDecodeError as e2:
+ print(f"JSON parsing failed after fixes: {e2}")
+ print(f"Cleaned JSON string (first 1000 chars): {json_str[:1000]}")
+ raise e2
+
+ if "entities" in data and isinstance(data["entities"], list):
+ # Collect raw entity records (Type, Text, Score, reported BeginOffset for order only)
+ raw_entities: List[Dict[str, Any]] = []
+ for entity in data["entities"]:
+ entity_type_val = _entity_get(entity, "Type")
+ if entity_type_val is None:
+ print(f"Warning: Entity missing Type field: {entity}")
+ continue
+ entity_text = _entity_get(entity, "Text", "")
+ reported_begin = _entity_get(entity, "BeginOffset")
+ if reported_begin is not None:
+ try:
+ reported_begin = int(reported_begin)
+ except (ValueError, TypeError):
+ reported_begin = None
+ reported_end = _entity_get(entity, "EndOffset")
+ if reported_end is not None:
+ try:
+ reported_end = int(reported_end)
+ except (ValueError, TypeError):
+ reported_end = None
+ # If no Text, try to derive from reported offsets (for display/grouping only)
+ if (
+ not entity_text
+ and reported_begin is not None
+ and reported_end is not None
+ and 0 <= reported_begin < reported_end <= len(original_text)
+ ):
+ entity_text = original_text[reported_begin:reported_end]
+ if not entity_text:
+ print(
+ f"Warning: Entity of type '{entity_type_val}' has no Text value and invalid offsets"
+ )
+ continue
+ raw_entities.append(
+ {
+ "Type": str(entity_type_val),
+ "Text": entity_text,
+ "Score": float(_entity_get(entity, "Score", 0.8)),
+ "reported_begin": reported_begin,
+ }
+ )
+
+ # Process entities in reported order. First-pass: search for each entity's
+ # Text starting from the preceding entity's EndOffset; if not found, search
+ # from the start of the passage. This disambiguates repeated phrases.
+ ordered = sorted(
+ raw_entities,
+ key=lambda r: (
+ r["reported_begin"] is None,
+ r["reported_begin"] or 0,
+ ),
+ )
+ search_start = 0
+ for rec in ordered:
+ search_text = rec["Text"]
+ result = _find_text_in_passage(
+ search_text,
+ original_text,
+ reported_offset=rec["reported_begin"],
+ start_from=search_start,
+ )
+ if result is None:
+ result = _find_text_in_passage(
+ search_text,
+ original_text,
+ reported_offset=rec["reported_begin"],
+ start_from=0,
+ )
+ if result is None:
+ print(
+ f"Warning: Could not find text '{search_text[:50]}...' in original passage"
+ )
+ continue
+ start, end = result
+ entities_out.append(
+ {
+ "Type": rec["Type"],
+ "BeginOffset": start,
+ "EndOffset": end,
+ "Score": rec["Score"],
+ "Text": original_text[start:end],
+ }
+ )
+ search_start = end
+ except json.JSONDecodeError as e:
+ print(f"Error parsing JSON from LLM response: {e}")
+ print(f"Response text: {response_text[:500]}")
+ except (ValueError, KeyError) as e:
+ print(f"Error processing entity data: {e}")
+ else:
+ print("Warning: Could not find JSON in LLM response")
+ print(f"Response text: {response_text[:500]}")
+
+ return entities_out
+
+
+def _sanitize_for_filename(s: str, max_len: Optional[int] = None) -> str:
+ """Sanitize a string for use in a filename (alphanumeric, spaces to underscores)."""
+ out = (
+ "".join(c for c in (s or "") if c.isalnum() or c in (" ", "-", "_"))
+ .strip()
+ .replace(" ", "_")
+ )
+ if max_len is not None and len(out) > max_len:
+ out = out[:max_len]
+ return out or "unknown"
+
+
+def save_llm_prompt_response(
+ system_prompt: str,
+ user_prompt: str,
+ response_text: str,
+ output_folder: str,
+ batch_number: int,
+ model_choice: str,
+ entities_to_detect: List[str],
+ language: str,
+ temperature: float,
+ max_tokens: int,
+ file_name: Optional[str] = None,
+ page_number: Optional[int] = None,
+ sheet_name: Optional[str] = None,
+ column_name: Optional[str] = None,
+ row_number: Optional[int] = None,
+ input_tokens: Optional[int] = None,
+ output_tokens: Optional[int] = None,
+) -> str:
+ """
+ Save LLM prompt and response to a text file for traceability.
+
+ Writes the exact system prompt and user prompt that were sent to the model
+ (e.g. for local transformers, inference-server, AWS, etc.). Each section is
+ clearly delimited so the log never duplicates or conflates system vs user.
+
+ Args:
+ system_prompt: System prompt sent to LLM (exactly as passed to the model).
+ user_prompt: User prompt sent to LLM (exactly as passed to the model).
+ response_text: Response text from LLM
+ output_folder: Output folder path
+ batch_number: Batch number for this call
+ model_choice: Model used
+ entities_to_detect: List of entities being detected
+ language: Language code
+ temperature: Temperature used
+ max_tokens: Max tokens used
+ file_name: Optional file name (without extension) for the filename / log header
+ page_number: Optional page number (0-based) for the filename; displayed in log as 1-based.
+ sheet_name: Optional Excel sheet name (tabular data); included in log and filename if present.
+ column_name: Optional column name (tabular data); included in log and filename (shortened if long).
+ row_number: Optional row number (1-based for display; tabular data); included in log and filename.
+ input_tokens: Optional input token count from the LLM call
+ output_tokens: Optional output token count from the LLM call
+
+ Returns:
+ Path to the saved file
+ """
+ # Normalise to strings so we never write "None" or non-string types
+ system_prompt_str = (system_prompt if system_prompt is not None else "").strip()
+ user_prompt_str = (user_prompt if user_prompt is not None else "").strip()
+
+ # Create LLM logs subfolder
+ llm_logs_folder = os.path.join(output_folder, "llm_prompts_responses")
+ os.makedirs(llm_logs_folder, exist_ok=True)
+
+ # Tabular: filename = sheet (if relevant) + column (shortened) + row
+ is_tabular = (
+ column_name is not None or sheet_name is not None or row_number is not None
+ )
+ if is_tabular:
+ parts = ["llm"]
+ if sheet_name:
+ parts.append(
+ _sanitize_for_filename(sheet_name, LLM_LOG_TABULAR_NAME_MAX_LEN)
+ )
+ if column_name:
+ parts.append(
+ _sanitize_for_filename(column_name, LLM_LOG_TABULAR_NAME_MAX_LEN)
+ )
+ if row_number is not None:
+ parts.append(f"row{row_number:05d}")
+ parts.append(f"batch_{batch_number:04d}")
+ filename = "_".join(parts) + ".txt"
+ elif file_name and page_number is not None:
+ # Document: file name + page number
+ safe_file_name = _sanitize_for_filename(file_name)
+ filename = (
+ f"llm_{safe_file_name}_page_{page_number:04d}_batch_{batch_number:04d}.txt"
+ )
+ else:
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ filename = f"llm_batch_{batch_number:04d}_{timestamp}.txt"
+ filepath = os.path.join(llm_logs_folder, filename)
+
+ # Write prompt and response to file with explicit section boundaries
+ # so system and user prompts are never duplicated or mixed.
+ with open(filepath, "w", encoding="utf-8") as f:
+ f.write("=" * 80 + "\n")
+ f.write("LLM ENTITY DETECTION - PROMPT AND RESPONSE LOG\n")
+ f.write("=" * 80 + "\n\n")
+
+ f.write(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+ if file_name:
+ f.write(f"File: {file_name}\n")
+ if sheet_name:
+ f.write(f"Sheet: {sheet_name}\n")
+ if column_name is not None:
+ f.write(f"Column: {column_name}\n")
+ if row_number is not None:
+ f.write(f"Row: {row_number}\n")
+ if page_number is not None:
+ f.write(f"Page: {page_number + 1}\n")
+ if input_tokens is not None:
+ f.write(f"Input tokens: {input_tokens}\n")
+ if output_tokens is not None:
+ f.write(f"Output tokens: {output_tokens}\n")
+ f.write(f"Batch Number: {batch_number}\n")
+ f.write(f"Model: {model_choice}\n")
+ f.write(f"Language: {language}\n")
+ f.write(f"Temperature: {temperature}\n")
+ f.write(f"Max Tokens: {max_tokens}\n")
+ f.write(f"Entities to Detect: {', '.join(entities_to_detect)}\n")
+
+ f.write("\n" + "=" * 80 + "\n")
+ f.write("SYSTEM PROMPT (sent as system role)\n")
+ f.write("=" * 80 + "\n")
+ f.write("--- BEGIN SYSTEM PROMPT ---\n")
+ f.write(system_prompt_str)
+ f.write("\n--- END SYSTEM PROMPT ---\n")
+
+ f.write("\n" + "=" * 80 + "\n")
+ f.write("USER PROMPT (sent as user role)\n")
+ f.write("=" * 80 + "\n")
+ if (
+ system_prompt_str
+ and user_prompt_str
+ and system_prompt_str == user_prompt_str
+ ):
+ f.write(
+ "[NOTE: System and user prompt content were identical - check caller.]\n"
+ )
+ f.write("--- BEGIN USER PROMPT ---\n")
+ f.write(user_prompt_str)
+ f.write("\n--- END USER PROMPT ---\n")
+
+ f.write("\n\n" + "=" * 80 + "\n")
+ f.write("LLM RESPONSE\n")
+ f.write("=" * 80 + "\n\n")
+ f.write(response_text)
+ f.write("\n\n" + "=" * 80 + "\n")
+ f.write("END OF LOG\n")
+ f.write("=" * 80 + "\n")
+
+ return filepath
+
+
+def call_llm_for_entity_detection(
+ text: str,
+ entities_to_detect: List[str],
+ language: str,
+ bedrock_runtime: Optional[boto3.Session.client] = None,
+ model_choice: str = CLOUD_LLM_PII_MODEL_CHOICE,
+ temperature: float = LLM_TEMPERATURE,
+ max_tokens: int = LLM_MAX_NEW_TOKENS,
+ max_retries: int = 10,
+ retry_delay: int = 3,
+ output_folder: Optional[str] = None,
+ batch_number: int = 0,
+ custom_instructions: str = "",
+ file_name: Optional[str] = None,
+ page_number: Optional[int] = None,
+ sheet_name: Optional[str] = None,
+ column_name: Optional[str] = None,
+ row_number: Optional[int] = None,
+ inference_method: Optional[str] = None,
+ local_model=None,
+ tokenizer=None,
+ assistant_model=None,
+ client=None,
+ client_config=None,
+ api_url: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+ """
+ Call LLM to detect entities in text using various inference methods.
+
+ Args:
+ text: Text to analyze
+ entities_to_detect: List of entity types to detect
+ language: Language code
+ bedrock_runtime: AWS Bedrock runtime client (required for AWS method)
+ model_choice: Model identifier (varies by inference method)
+ temperature: Temperature for LLM generation (lower = more deterministic)
+ max_tokens: Maximum tokens in response
+ max_retries: Maximum retry attempts
+ retry_delay: Delay between retries (seconds)
+ output_folder: Optional folder to save prompt/response logs
+ batch_number: Batch number for logging
+ custom_instructions: Optional custom instructions to include in the prompt
+ file_name: Optional file name (without extension) for saving logs
+ page_number: Optional page number for saving logs (document flow)
+ sheet_name: Optional Excel sheet name for tabular logs
+ column_name: Optional column name for tabular logs
+ row_number: Optional row number (1-based) for tabular logs
+ inference_method: Inference method to use ("aws-bedrock", "local", "inference-server", "azure-openai", "gemini")
+ If None, uses CHOSEN_LLM_PII_INFERENCE_METHOD from config
+ local_model: Local model instance (required for "local" method)
+ tokenizer: Tokenizer instance (required for "local" method with transformers)
+ assistant_model: Assistant model for speculative decoding (optional)
+ client: API client (required for "azure-openai" or "gemini" methods)
+ client_config: Client config (required for "gemini" method)
+ api_url: API URL for inference-server (required for "inference-server" method)
+
+ Returns:
+ List of entity dictionaries
+ """
+ # Ensure custom_instructions is a string (callers may pass bool or other types).
+ # Treat boolean True and the string "True" as empty (e.g. from an unchecked/empty Gradio box).
+ if not isinstance(custom_instructions, str):
+ custom_instructions = (
+ ""
+ if custom_instructions is True or not custom_instructions
+ else str(custom_instructions)
+ )
+ if (
+ isinstance(custom_instructions, str)
+ and custom_instructions.strip().lower() == "true"
+ ):
+ custom_instructions = ""
+
+ # Determine inference method
+ if inference_method is None:
+ inference_method = CHOSEN_LLM_PII_INFERENCE_METHOD
+
+ # When custom instructions are provided, use the upgraded model if configured
+ custom_instructions_model = (
+ CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE.strip()
+ if isinstance(CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE, str)
+ and CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE.strip()
+ else ""
+ )
+ if (
+ custom_instructions.strip()
+ and model_choice == CLOUD_LLM_PII_MODEL_CHOICE
+ and custom_instructions_model
+ ):
+ model_choice = custom_instructions_model
+
+ # Filter out CUSTOM_VLM_* entities (these are handled separately via VLM)
+ filtered_entities = [
+ entity for entity in entities_to_detect if not entity.startswith("CUSTOM_VLM_")
+ ]
+
+ # No standard entities and no custom instructions
+ if not filtered_entities and (
+ not custom_instructions or not custom_instructions.strip()
+ ):
+ # Nothing selected at all → error
+ if not entities_to_detect:
+ raise ValueError(
+ "No standard entities selected and no custom instructions provided. "
+ "Please select at least one entity type (excluding CUSTOM_VLM_* entities) or provide custom instructions for LLM-based PII detection."
+ )
+ # Only CUSTOM_VLM_* entities selected (handled separately via VLM) → return blank
+ return []
+
+ # Determine model source from model_choice if using model_name_map
+ model_source = None
+ if model_choice and model_name_map and model_choice in model_name_map:
+ model_source = model_name_map[model_choice].get("source", "AWS")
+ # Map model source to inference method
+ if model_source == "Local":
+ inference_method = "local"
+ elif model_source == "inference-server":
+ inference_method = "inference-server"
+ elif model_source == "Azure/OpenAI":
+ inference_method = "azure-openai"
+ elif model_source == "Gemini":
+ inference_method = "gemini"
+ elif model_source == "AWS":
+ inference_method = "aws-bedrock"
+
+ system_prompt = create_entity_detection_system_prompt(
+ filtered_entities, language, custom_instructions
+ )
+ user_prompt = create_entity_detection_prompt(
+ text, filtered_entities, language, custom_instructions
+ )
+
+ # Map inference_method to model_source format expected by send_request
+ model_source_map = {
+ "aws-bedrock": "AWS",
+ "local": "Local",
+ "inference-server": "inference-server",
+ "azure-openai": "Azure/OpenAI",
+ "gemini": "Gemini",
+ }
+
+ model_source = model_source_map.get(inference_method, "AWS")
+
+ # Prepare client and config for Gemini if needed
+ if inference_method == "gemini" and (client is None or client_config is None):
+ from tools.llm_funcs import construct_gemini_generative_model
+
+ try:
+ client, client_config = construct_gemini_generative_model(
+ in_api_key="", # Will use environment variable
+ temperature=temperature,
+ model_choice=model_choice,
+ system_prompt=system_prompt,
+ max_tokens=max_tokens, # Use our specific max_tokens for entity detection
+ )
+ except Exception as e:
+ raise ValueError(
+ f"Failed to construct Gemini client: {e}. "
+ f"Ensure GEMINI_API_KEY is set or pass client and client_config."
+ )
+
+ # Prepare client for Azure/OpenAI if needed
+ if inference_method == "azure-openai" and client is None:
+ from tools.llm_funcs import construct_azure_client
+
+ try:
+ client, _ = construct_azure_client(
+ in_api_key="", # Will use environment variable
+ endpoint="", # Will use environment variable
+ )
+ except Exception as e:
+ raise ValueError(
+ f"Failed to construct Azure/OpenAI client: {e}. "
+ f"Ensure AZURE_OPENAI_API_KEY is set or pass client."
+ )
+
+ # Set up API URL for inference-server if needed
+ if inference_method == "inference-server" and api_url is None:
+ api_url = INFERENCE_SERVER_API_URL
+ if not api_url:
+ raise ValueError(
+ "api_url is required when using inference-server method. "
+ "Set INFERENCE_SERVER_API_URL in config or pass api_url parameter."
+ )
+
+ try:
+        # Call send_request, which handles all routing, retries, and response parsing.
+        # Note: local_model, tokenizer and assistant_model are not forwarded here; send_request
+        # falls back to its preloaded PII model defaults.
+ (
+ response,
+ conversation_history,
+ response_text,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ ) = send_request(
+ prompt=user_prompt,
+ conversation_history=[], # Empty for entity detection (no conversation history needed)
+ client=client,
+ config=client_config,
+ model_choice=model_choice,
+ system_prompt=system_prompt,
+ temperature=temperature,
+ bedrock_runtime=bedrock_runtime,
+ model_source=model_source,
+ # local_model=(
+ # local_model if local_model else []
+ # ), # Pass model directly (signature shows list but uses as single object)
+ # tokenizer=tokenizer,
+ # assistant_model=assistant_model,
+ progress=Progress(
+ track_tqdm=False
+ ), # Disable progress bar for entity detection
+ api_url=api_url,
+ )
+ except Exception as e:
+ print(f"LLM entity detection failed: {e}")
+ raise
+
+ # Extract token usage from response (before save so we can write it to the log file)
+ input_tokens = 0
+ output_tokens = 0
+ try:
+ if isinstance(response, dict) and "usage" in response:
+ # inference-server or llama-cpp format
+ input_tokens = response["usage"].get("prompt_tokens", 0)
+ output_tokens = response["usage"].get("completion_tokens", 0)
+ elif hasattr(response, "usage_metadata"):
+ # Check if it's AWS Bedrock format
+ if isinstance(response.usage_metadata, dict):
+ input_tokens = response.usage_metadata.get("inputTokens", 0)
+ output_tokens = response.usage_metadata.get("outputTokens", 0)
+ # Check if it's Gemini format
+ elif hasattr(response.usage_metadata, "prompt_token_count"):
+ input_tokens = response.usage_metadata.prompt_token_count
+ output_tokens = response.usage_metadata.candidates_token_count
+ except (KeyError, AttributeError) as e:
+ print(f"Warning: Could not extract token usage from response: {e}")
+
+ # Fallback for Local/transformers: response is plain text, so use token counts from send_request
+ if num_transformer_input_tokens and num_transformer_input_tokens > 0:
+ input_tokens = num_transformer_input_tokens
+ if num_transformer_generated_tokens and num_transformer_generated_tokens > 0:
+ output_tokens = num_transformer_generated_tokens
+
+ # Save prompt and response if output_folder is provided.
+ # Use the same system_prompt and user_prompt that were sent to the model
+ # (no combined/rendered version) so the log correctly shows system vs user.
+ if output_folder and response_text:
+ try:
+ saved_file = save_llm_prompt_response(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ response_text=response_text,
+ output_folder=output_folder,
+ batch_number=batch_number,
+ model_choice=model_choice,
+ entities_to_detect=entities_to_detect,
+ language=language,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ file_name=file_name,
+ page_number=page_number,
+ sheet_name=sheet_name,
+ column_name=column_name,
+ row_number=row_number,
+ input_tokens=input_tokens,
+ output_tokens=output_tokens,
+ )
+            # This branch is intentionally unreachable; it only references saved_file so
+            # linters do not flag the variable as unused (logging is disabled by default).
+            if 0 == 1:
+                print(f"Saved LLM prompt/response to: {saved_file}")
+ except Exception as e:
+ print(f"Warning: Could not save LLM prompt/response: {e}")
+
+ # Parse the response
+ entities = parse_llm_entity_response(response_text, text)
+
+ return entities, input_tokens, output_tokens
+
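+# Illustrative usage sketch (not executed): how a caller might invoke
+# call_llm_for_entity_detection for a single batch of text. The Bedrock client and
+# the example text are assumptions; any of the supported inference methods can be used.
+#
+#   bedrock_runtime = boto3.client("bedrock-runtime")
+#   entities, in_tokens, out_tokens = call_llm_for_entity_detection(
+#       text="Contact Jane Doe at jane.doe@example.com",
+#       entities_to_detect=["PERSON_NAME", "EMAIL_ADDRESS"],
+#       language="en",
+#       bedrock_runtime=bedrock_runtime,
+#       inference_method="aws-bedrock",
+#   )
+#   # entities is a list of dicts with Type/BeginOffset/EndOffset/Score (and typically Text) keys.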
+
+def map_back_llm_entity_results(
+ entities: List[Dict[str, Any]],
+ current_batch_mapping: List[Tuple],
+ allow_list: List[str],
+ chosen_redact_llm_entities: List[str],
+ all_text_line_results: List[Tuple],
+) -> List[Tuple]:
+ """
+ Map LLM-detected entities back to line-level results.
+    Follows the same mapping pattern as the other entity-mapping helpers, but operates on LLM responses.
+
+ Args:
+ entities: List of entity dictionaries from LLM
+ current_batch_mapping: Mapping of batch positions to line indices
+ allow_list: List of allowed text values (to skip) - case-insensitive matching
+ chosen_redact_llm_entities: List of entity types to include
+ all_text_line_results: Existing line-level results to append to
+
+ Returns:
+ Updated all_text_line_results
+ """
+ if not entities:
+ return all_text_line_results
+
+ # Normalize allow_list for case-insensitive matching
+ if allow_list:
+ allow_list_normalized = [item.strip().lower() for item in allow_list if item]
+ else:
+ allow_list_normalized = []
+
+ for entity in entities:
+ entity_type = entity.get("Type")
+ # Allow all entity types returned by LLM, including custom types from custom instructions
+ # Log when a custom entity type (not in the original list) is found
+ # if entity_type not in chosen_redact_llm_entities:
+ # print(
+ # f"Info: Found custom entity type '{entity_type}' (not in original detection list). "
+ # f"Including it in results as it was returned by LLM."
+ # )
+
+ entity_start = entity["BeginOffset"]
+ entity_end = entity["EndOffset"]
+
+ # Track if the entity has been added to any line
+ added_to_line = False
+
+ # Find the correct line and offset within that line
+ for (
+ batch_start,
+ line_idx,
+ original_line,
+ chars,
+ line_offset,
+ ) in current_batch_mapping:
+ # Calculate the end position of this line segment in the batch
+ if line_offset is not None:
+ # Line offset is the start position within the line
+ line_text_length = len(original_line.text[line_offset:])
+ else:
+ line_text_length = len(original_line.text)
+
+ batch_end = batch_start + line_text_length
+
+ # Check if the entity overlaps with the current line
+ if batch_start < entity_end and batch_end > entity_start:
+ # Calculate the relative position within the line
+ if line_offset is not None:
+ relative_start = max(0, entity_start - batch_start + line_offset)
+ relative_end = min(
+ entity_end - batch_start + line_offset, len(original_line.text)
+ )
+ else:
+ relative_start = max(0, entity_start - batch_start)
+ relative_end = min(
+ entity_end - batch_start, len(original_line.text)
+ )
+
+ result_text = original_line.text[relative_start:relative_end]
+
+ # Check if result_text is in allow_list (case-insensitive)
+ # If allow_list contains this text, skip adding it as a PII entity
+ # This allows allow_list terms to "overrule" LLM PII detection
+ result_text_normalized = result_text.strip().lower()
+ if result_text_normalized not in allow_list_normalized:
+ # Create entity dict in llm-like format
+ adjusted_entity = {
+ "Type": entity_type,
+ "BeginOffset": relative_start,
+ "EndOffset": relative_end,
+ "Score": entity.get("Score", 0.8),
+ }
+
+ # Import here to avoid circular imports
+ from tools.presidio_analyzer_custom import (
+ recognizer_result_from_dict,
+ )
+
+ recogniser_entity = recognizer_result_from_dict(adjusted_entity)
+
+ # Check if this line already has an entry
+ existing_entry = next(
+ (
+ entry
+ for idx, entry in all_text_line_results
+ if idx == line_idx
+ ),
+ None,
+ )
+ if existing_entry is None:
+ all_text_line_results.append((line_idx, [recogniser_entity]))
+ else:
+ existing_entry.append(recogniser_entity)
+
+ added_to_line = True
+
+ # Optional: Handle cases where the entity does not fit in any line
+ if not added_to_line:
+ print(
+ f"Entity '{entity_type}' at position {entity_start}-{entity_end} does not fit in any line."
+ )
+
+ return all_text_line_results
+
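+# Worked example of the offset arithmetic in map_back_llm_entity_results
+# (illustrative values only): suppose a line starts at batch_start=100 in the combined
+# batch text, the LLM reports an entity with BeginOffset=105 and EndOffset=112, and
+# line_offset is None. Then relative_start = max(0, 105 - 100) = 5 and
+# relative_end = min(112 - 100, len(line_text)) = 12 (assuming the line has at least
+# 12 characters), so the entity maps to characters 5-12 of that line's text.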
+
+def do_llm_entity_detection_call(
+ current_batch: str,
+ current_batch_mapping: List[Tuple],
+ bedrock_runtime: Optional[boto3.Session.client] = None,
+ language: str = "en",
+ allow_list: List[str] = None,
+ chosen_redact_llm_entities: List[str] = None,
+ all_text_line_results: List[Tuple] = None,
+ model_choice: str = CLOUD_LLM_PII_MODEL_CHOICE,
+ temperature: float = LLM_TEMPERATURE,
+ max_tokens: int = LLM_MAX_NEW_TOKENS,
+ output_folder: Optional[str] = None,
+ batch_number: int = 0,
+ custom_instructions: str = "",
+ file_name: Optional[str] = None,
+ page_number: Optional[int] = None,
+ inference_method: Optional[str] = None,
+ local_model=None,
+ tokenizer=None,
+ assistant_model=None,
+ client=None,
+ client_config=None,
+ api_url: Optional[str] = None,
+) -> Tuple[List[Tuple], int, int]:
+ """
+ Call LLM for entity detection on a batch of text.
+ Similar interface to do_aws_llm_call.
+
+ Args:
+ current_batch: Text batch to analyze
+ current_batch_mapping: Mapping of batch positions to line indices
+ bedrock_runtime: AWS Bedrock runtime client (required for AWS method)
+ language: Language code
+ allow_list: List of allowed text values
+ chosen_redact_llm_entities: List of entity types to detect
+ all_text_line_results: Existing line-level results
+ model_choice: Model identifier (varies by inference method)
+ temperature: Temperature for LLM generation
+ max_tokens: Maximum tokens in response
+ output_folder: Optional folder to save prompt/response logs
+ batch_number: Batch number for logging
+ custom_instructions: Optional custom instructions to include in the prompt
+ file_name: Optional file name (without extension) for saving logs
+ page_number: Optional page number for saving logs
+ inference_method: Inference method to use (if None, uses config default)
+ local_model: Local model instance (required for "local" method)
+ tokenizer: Tokenizer instance (required for "local" method with transformers)
+ assistant_model: Assistant model for speculative decoding (optional)
+ client: API client (required for "azure-openai" or "gemini" methods)
+ client_config: Client config (required for "gemini" method)
+ api_url: API URL for inference-server (required for "inference-server" method)
+
+ Returns:
+ Tuple of (updated all_text_line_results, input_tokens, output_tokens)
+ """
+ if not current_batch:
+ return (all_text_line_results or [], 0, 0)
+
+ if allow_list is None:
+ allow_list = []
+ if chosen_redact_llm_entities is None:
+ chosen_redact_llm_entities = []
+ if all_text_line_results is None:
+ all_text_line_results = []
+
+ try:
+ entities, input_tokens, output_tokens = call_llm_for_entity_detection(
+ text=current_batch.strip(),
+ entities_to_detect=chosen_redact_llm_entities,
+ language=language,
+ bedrock_runtime=bedrock_runtime,
+ model_choice=model_choice,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ output_folder=output_folder,
+ batch_number=batch_number,
+ custom_instructions=custom_instructions,
+ file_name=file_name,
+ page_number=page_number,
+ inference_method=inference_method,
+ local_model=local_model,
+ tokenizer=tokenizer,
+ assistant_model=assistant_model,
+ client=client,
+ client_config=client_config,
+ api_url=api_url,
+ )
+
+ all_text_line_results = map_back_llm_entity_results(
+ entities,
+ current_batch_mapping,
+ allow_list,
+ chosen_redact_llm_entities,
+ all_text_line_results,
+ )
+
+ return all_text_line_results, input_tokens, output_tokens
+
+ except Exception as e:
+ print(f"LLM entity detection call failed: {e}")
+ raise

diff --git a/tools/llm_entity_detection_prompts.py b/tools/llm_entity_detection_prompts.py
new file mode 100644
index 0000000000000000000000000000000000000000..30ca32313485545f8b8e05b4183f6a1c3e35c101
--- /dev/null
+++ b/tools/llm_entity_detection_prompts.py
@@ -0,0 +1,102 @@
+"""
+Prompts for LLM-based entity detection.
+These prompts are designed to extract PII entities with character positions.
+"""
+
+
+def create_entity_detection_prompt(
+ text: str,
+ entities_to_detect: list[str],
+ language: str = "en",
+ custom_instructions: str = "",
+) -> str:
+ """
+ Create a prompt for LLM-based entity detection.
+
+ Args:
+ text: The text to analyze
+ entities_to_detect: List of entity types to detect (e.g., ["EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME"])
+ language: Language code (e.g., "en", "es", "fr")
+ custom_instructions: Optional custom instructions to include in the prompt (e.g., "don't redact anything related to Mark Wilson")
+
+ Returns:
+ Formatted prompt string
+ """
+
+ prompt = f"""### User prompt
+Analyse the following text according to the instructions provided in the system prompt.
+### Text:
+{text}
+### Final instruction:
+Return only the JSON object, nothing else:"""
+
+ return prompt
+
+
+def create_entity_detection_system_prompt(
+ entities_to_detect: list[str],
+ language: str = "en",
+ custom_instructions: str = "",
+) -> str:
+ """
+ Create a system prompt for LLM-based entity detection.
+
+ Args:
+ entities_to_detect: List of entity types to detect
+ language: Language code
+ custom_instructions: Optional custom instructions to include in the system prompt
+
+ Returns:
+ System prompt string
+ """
+ # Ensure custom_instructions is a string (callers may pass bool or other types).
+ # Treat boolean True and the string "True" as empty (e.g. from an unchecked/empty Gradio box).
+ if not isinstance(custom_instructions, str):
+ custom_instructions = ""
+ elif custom_instructions.strip().lower() == "true":
+ custom_instructions = ""
+
+ # Filter out CUSTOM_VLM_* entities (these are handled separately via VLM)
+ filtered_entities = [
+ entity for entity in entities_to_detect if not entity.startswith("CUSTOM_VLM_")
+ ]
+
+ custom_instructions_section = ""
+
+ if custom_instructions and custom_instructions.strip():
+ custom_instructions_section = (
+ f"\n## ADDITIONAL USER INSTRUCTIONS:\n{custom_instructions.strip()}\n"
+ )
+ else:
+ custom_instructions_section = "No specific user constraints provided."
+
+ # Handle case where no standard entities are selected
+ if filtered_entities:
+ entity_types_section = f"Standard entity types to detect: {filtered_entities}"
+ else:
+ entity_types_section = "No standard entity types selected - analyse text based on ADDITIONAL USER INSTRUCTIONS provided."
+ if not custom_instructions or not custom_instructions.strip():
+ raise ValueError(
+ "No standard entities selected and no custom instructions provided. "
+ "Please select at least one entity type or provide custom instructions for LLM-based PII detection."
+ )
+
+ system_prompt = f"""# System Prompt
+You are a personal information detection system. Extract entities from the standard entity list, according to the INSTRUCTIONS HIERARCHY rules below, into a JSON object with the following structure: `{{"entities": [{{"Type": "", "BeginOffset": 0, "EndOffset": 0, "Score": 1.0, "Text": ""}}]}}`.
+
+## Standard entity list
+{entity_types_section}
+{custom_instructions_section}
+## INSTRUCTIONS HIERARCHY (Follow in order)
+1. Use the standard entity list as the baseline set of entity types to identify in the text.
+2. ADDITIONAL USER INSTRUCTIONS (if available): these provide additional instructions for the analysis, and override the standard entity list if they contradict it. Users may suggest new entity types to identify - they may refer to them as labels, redactions, entity types, or other similar terms. Be sure to follow each instruction closely.
+3. If a USER INSTRUCTION contradicts a standard entity rule, the user instruction is the final authority. For example, with the USER INSTRUCTION "Do not redact information related to John", exclude all entities where the text mentions or is related to John in the output.
+4. If text could be assigned to multiple entity types, assign it to all relevant entity types as separate JSON entries; this applies to entity types from the standard entity list and from the ADDITIONAL USER INSTRUCTIONS.
+5. OFFSETS: The 0-based character positions of the start and end of the identified text for a given entity.
+6. SCORE: A confidence score between 0.0 and 1.0, where 1.0 is the highest confidence.
+7. TEXT: The exact text substring that was identified.
+8. Return every relevant instance of a valid entity type after taking into account the ADDITIONAL USER INSTRUCTIONS, no matter how many times it appears in the text.
+9. OUTPUT: Return ONLY valid JSON. No additional text or commentary.
+"""
+
+ return system_prompt
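+
+# Illustrative example (assumed values) of the JSON the system prompt asks the model
+# to return, matching the schema embedded above:
+#
+#   {
+#     "entities": [
+#       {"Type": "EMAIL_ADDRESS", "BeginOffset": 24, "EndOffset": 44,
+#        "Score": 0.98, "Text": "jane.doe@example.com"}
+#     ]
+#   }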
diff --git a/tools/llm_funcs.py b/tools/llm_funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7ea0b88670d94def7549bb53d49eba4840cf10b
--- /dev/null
+++ b/tools/llm_funcs.py
@@ -0,0 +1,2119 @@
+import json
+import os
+import re
+import time
+from typing import List, Tuple
+from urllib.parse import urlparse
+
+import boto3
+import requests
+import spaces
+
+from tools.config import (
+ MAX_SPACES_GPU_RUN_TIME,
+ PRINT_TRANSFORMERS_USER_PROMPT,
+ REPORT_LLM_OUTPUTS_TO_GUI,
+ VLM_DEFAULT_DO_SAMPLE,
+)
+
+# Import mock patches if in test mode
+if os.environ.get("USE_MOCK_LLM") == "1" or os.environ.get("TEST_MODE") == "1":
+ try:
+ # Try to import and apply mock patches
+ import sys
+
+ # Add project root to sys.path so we can import test.mock_llm_calls
+ project_root = os.path.dirname(os.path.dirname(__file__))
+ if project_root not in sys.path:
+ sys.path.insert(0, project_root)
+ # try:
+ # from test.mock_llm_calls import apply_mock_patches
+
+ # apply_mock_patches()
+ # except ImportError:
+ # # If mock module not found, continue without mocking
+ # pass
+ except Exception:
+ # If anything fails, continue without mocking
+ pass
+try:
+ from google import genai as ai
+ from google.genai import types
+except ImportError:
+ print(
+ "Warning: Google GenAI not found. Google GenAI functionality will not be available."
+ )
+ pass
+from gradio import Progress
+from huggingface_hub import hf_hub_download
+
+try:
+ from openai import OpenAI
+except ImportError:
+ print("Warning: OpenAI not found. OpenAI functionality will not be available.")
+ pass
+from tqdm import tqdm
+
+model_type = None # global variable setup
+full_text = (
+ "" # Define dummy source text (full text) just to enable highlight function to load
+)
+
+# Global variables for PII detection model and tokenizer
+# These are now used for all LLM model loading (both general and PII-specific)
+_pii_model = None
+_pii_tokenizer = None
+_pii_assistant_model = None
+
+# Import config variables with defaults for missing ones
+# This allows llm_funcs.py to work even if some config variables don't exist
+from tools.config import (
+ ASSISTANT_MODEL,
+ COMPILE_MODE,
+ COMPILE_TRANSFORMERS,
+ HF_TOKEN,
+ INFERENCE_SERVER_DISABLE_THINKING,
+ INT8_WITH_OFFLOAD_TO_CPU,
+ LLM_CONTEXT_LENGTH,
+ LLM_MAX_NEW_TOKENS,
+ LLM_MIN_P,
+ LLM_MODEL_DTYPE,
+ LLM_REPETITION_PENALTY,
+ LLM_RESET,
+ LLM_RETRY_ATTEMPTS,
+ LLM_SEED,
+ LLM_STOP_STRINGS,
+ LLM_STREAM,
+ LLM_TEMPERATURE,
+ LLM_THREADS,
+ LLM_TIMEOUT_WAIT,
+ LLM_TOP_K,
+ LLM_TOP_P,
+ LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START,
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE,
+ LOCAL_TRANSFORMERS_LLM_PII_REPO_ID,
+ MULTIMODAL_PROMPT_FORMAT,
+ QUANTISE_TRANSFORMERS_LLM_MODELS,
+ REASONING_SUFFIX,
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL,
+ SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS,
+ SPECULATIVE_DECODING,
+ USE_LLAMA_SWAP,
+ USE_TRANSFORMERS_VLM_MODEL_AS_LLM,
+ VLM_DISABLE_QWEN3_5_THINKING,
+ VLM_QWEN3_5_NOTHINK_SUFFIX,
+)
+
+
+def _stringify_openai_message_content(content) -> str:
+ """Normalize message.content from OpenAI-compatible APIs (str, null, or list of parts)."""
+ if content is None:
+ return ""
+ if isinstance(content, str):
+ return content
+ if isinstance(content, list):
+ parts = []
+ for p in content:
+ if isinstance(p, dict):
+ t = p.get("text")
+ if t is None and p.get("type") == "text":
+ t = p.get("text", "")
+ if isinstance(t, str):
+ parts.append(t)
+ elif isinstance(p, str):
+ parts.append(p)
+ return "".join(parts)
+ return str(content)
+
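+# Examples (illustrative) of the normalization performed by
+# _stringify_openai_message_content for the content shapes it accepts:
+#
+#   _stringify_openai_message_content(
+#       [{"type": "text", "text": "Hello "}, {"type": "text", "text": "world"}]
+#   )  # -> "Hello world"
+#   _stringify_openai_message_content(None)  # -> ""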
+
+def _extract_choice_message_text(choice: dict) -> str:
+ """Extract assistant text from a chat-completions choice (handles reasoning-only / multimodal)."""
+ if not isinstance(choice, dict):
+ return ""
+ msg = choice.get("message") or {}
+ text = _stringify_openai_message_content(msg.get("content"))
+ if text and str(text).strip():
+ return text
+ for alt_key in ("reasoning_content", "reasoning"):
+ alt = msg.get(alt_key)
+ if isinstance(alt, str) and alt.strip():
+ return alt
+ legacy = choice.get("text")
+ if isinstance(legacy, str) and legacy.strip():
+ return legacy
+ return text or ""
+
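+# Examples (illustrative) of _extract_choice_message_text on chat-completions choices:
+#
+#   _extract_choice_message_text({"message": {"content": "Answer text"}})
+#       # -> "Answer text"
+#   _extract_choice_message_text(
+#       {"message": {"content": None, "reasoning_content": "Only reasoning"}}
+#   )   # -> "Only reasoning"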
+
+def _report_llm_output_to_gui(text: str) -> None:
+ """Report streamed LLM output to Gradio UI via gr.Info when REPORT_LLM_OUTPUTS_TO_GUI is True."""
+ if not REPORT_LLM_OUTPUTS_TO_GUI or not (text and str(text).strip()):
+ return
+ try:
+ import gradio as gr
+
+ gr.Info(text, duration=2)
+ except Exception:
+ # gr.Info may not be available (e.g. in worker process or CLI), ignore
+ pass
+
+
+if isinstance(LLM_THREADS, str):
+ LLM_THREADS = int(LLM_THREADS)
+
+max_tokens = LLM_MAX_NEW_TOKENS
+
+temperature = LLM_TEMPERATURE
+top_k = LLM_TOP_K
+top_p = LLM_TOP_P
+min_p = LLM_MIN_P
+repetition_penalty = LLM_REPETITION_PENALTY
+LLM_MAX_NEW_TOKENS: int = LLM_MAX_NEW_TOKENS
+seed: int = LLM_SEED
+reset: bool = LLM_RESET
+stream: bool = LLM_STREAM
+context_length: int = LLM_CONTEXT_LENGTH
+speculative_decoding = SPECULATIVE_DECODING
+
+if not LLM_THREADS:
+ threads = 1
+else:
+ threads = LLM_THREADS
+
+timeout_wait = LLM_TIMEOUT_WAIT
+number_of_api_retry_attempts = LLM_RETRY_ATTEMPTS
+
+
+class LocalLLMContextConfig:
+ """Holds context length and GPU layer count for local transformers model loading."""
+
+ def __init__(self, n_ctx: int = context_length, n_gpu_layers: int = -1):
+ self.n_ctx = n_ctx
+ self.n_gpu_layers = n_gpu_layers
+
+ def update_gpu(self, new_value: int) -> None:
+ self.n_gpu_layers = new_value
+
+ def update_context(self, new_value: int) -> None:
+ self.n_ctx = new_value
+
+
+# GPU and CPU context configs for load_model (CPU uses 0 GPU layers).
+local_gpu_context = LocalLLMContextConfig(n_ctx=context_length, n_gpu_layers=-1)
+local_cpu_context = LocalLLMContextConfig(n_ctx=context_length, n_gpu_layers=0)
+
+
+class LocalLLMGenerationConfig:
+ def __init__(
+ self,
+ temperature=temperature,
+ top_k=top_k,
+ min_p=min_p,
+ top_p=top_p,
+ repeat_penalty=repetition_penalty,
+ seed=seed,
+ stream=stream,
+ max_tokens=LLM_MAX_NEW_TOKENS,
+ reset=reset,
+ ):
+ self.temperature = temperature
+ self.top_k = top_k
+ self.top_p = top_p
+ self.repeat_penalty = repeat_penalty
+ self.seed = seed
+ self.max_tokens = max_tokens
+ self.stream = stream
+ self.reset = reset
+
+ def update_temp(self, new_value):
+ self.temperature = new_value
+
+
+# ResponseObject class for AWS Bedrock calls
+class ResponseObject:
+ def __init__(self, text, usage_metadata):
+ self.text = text
+ self.usage_metadata = usage_metadata
+
+
+###
+# LOCAL MODEL FUNCTIONS
+###
+
+
+def get_model_path(
+ repo_id=LOCAL_TRANSFORMERS_LLM_PII_REPO_ID,
+ model_filename="",
+ model_dir="",
+ hf_token=HF_TOKEN,
+):
+ # Construct the expected local path
+ local_path = os.path.join(model_dir, model_filename)
+
+ print("local path for model load:", local_path)
+
+ try:
+ if os.path.exists(local_path):
+ print(f"Model already exists at: {local_path}")
+
+ return local_path
+ else:
+ if hf_token:
+ print("Downloading model from Hugging Face Hub with HF token")
+ downloaded_model_path = hf_hub_download(
+ repo_id=repo_id, token=hf_token, filename=model_filename
+ )
+
+ return downloaded_model_path
+ else:
+ print(
+ "No HF token found, downloading model from Hugging Face Hub without token"
+ )
+ downloaded_model_path = hf_hub_download(
+ repo_id=repo_id, filename=model_filename
+ )
+
+ return downloaded_model_path
+
+ except Exception as e:
+ print("Error loading model:", e)
+ raise Warning("Error loading model:", e)
+
+
+def _normalize_huggingface_repo_id(repo_id: str) -> str:
+ """
+ If repo_id is an http(s) URL for huggingface.co, return the org/model path segment.
+ Uses parsed host validation (not substring checks) to satisfy CodeQL py/incomplete-url-substring-sanitization.
+ """
+ s = repo_id.strip()
+ lower = s.lower()
+ if not (lower.startswith("https://") or lower.startswith("http://")):
+ return repo_id
+ parsed = urlparse(s)
+ if parsed.scheme.lower() not in ("http", "https"):
+ return repo_id
+ host = (parsed.hostname or "").lower()
+ if host not in ("huggingface.co", "www.huggingface.co"):
+ return repo_id
+ path = parsed.path.strip("/")
+ if not path:
+ return repo_id
+ return path
+
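+# Examples (illustrative) of _normalize_huggingface_repo_id behaviour:
+#
+#   _normalize_huggingface_repo_id("https://huggingface.co/google/gemma-3-4b-it")
+#       # -> "google/gemma-3-4b-it"
+#   _normalize_huggingface_repo_id("google/gemma-3-4b-it")
+#       # -> "google/gemma-3-4b-it" (non-URL repo ids are returned unchanged)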
+
+def load_model(
+ local_model_type: str = None,
+ gpu_layers: int = -1,
+ max_context_length: int = context_length,
+ gpu_context: LocalLLMContextConfig = local_gpu_context,
+ cpu_context: LocalLLMContextConfig = local_cpu_context,
+ torch_device: str = "cpu",
+ repo_id=LOCAL_TRANSFORMERS_LLM_PII_REPO_ID,
+ model_filename="",
+ model_dir="",
+ compile_mode=COMPILE_MODE,
+ model_dtype=LLM_MODEL_DTYPE,
+ hf_token=HF_TOKEN,
+ speculative_decoding=speculative_decoding,
+ model=None,
+ tokenizer=None,
+ assistant_model=None,
+):
+ """
+ Load a model from Hugging Face Hub via the transformers package.
+
+ Args:
+ local_model_type (str): The type of local model to load.
+ gpu_layers (int): The number of GPU layers to offload to the GPU (-1 for default).
+ max_context_length (int): The maximum context length for the model.
+ gpu_context (LocalLLMContextConfig): Context config for GPU (n_ctx, n_gpu_layers).
+ cpu_context (LocalLLMContextConfig): Context config for CPU.
+ torch_device (str): The device to load the model on ("cuda" or "cpu").
+ repo_id (str): The Hugging Face repository ID where the model is located.
+ model_filename (str): The specific filename of the model to download from the repository.
+ model_dir (str): The local directory where the model will be stored or downloaded.
+ compile_mode (str): The compilation mode to use for the model.
+ model_dtype (str): The data type to use for the model.
+ hf_token (str): The Hugging Face token to use for the model.
+ speculative_decoding (bool): Whether to use speculative decoding.
+ model (transformers model): Optional pre-loaded model (skips loading if provided).
+ tokenizer (transformers tokenizer): Optional pre-loaded tokenizer.
+ assistant_model (transformers model): Optional assistant model for speculative decoding.
+ Returns:
+ tuple: (model, tokenizer, assistant_model).
+ """
+
+ # If model is provided, validate that tokenizer is also provided and compatible
+ if model:
+ if tokenizer is None:
+ print(
+ "Warning: Model provided but tokenizer is None. Attempting to load matching tokenizer..."
+ )
+ # Try to determine model_id from model config
+ try:
+ if hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
+ model_id = model.config._name_or_path
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+ if not tokenizer.pad_token:
+ tokenizer.pad_token = tokenizer.eos_token
+ print(f"Loaded matching tokenizer from {model_id}")
+ else:
+ print(
+ "Warning: Could not determine model source to load matching tokenizer"
+ )
+ except Exception as e:
+ print(f"Warning: Failed to load matching tokenizer: {e}")
+ return model, tokenizer, assistant_model
+
+ # Use LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE if local_model_type is not provided
+ if local_model_type is None:
+ local_model_type = LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+
+ if isinstance(repo_id, str):
+ repo_id = _normalize_huggingface_repo_id(repo_id)
+
+ print("Loading model:", local_model_type)
+
+ # Verify the device and cuda settings
+ # Check if CUDA is enabled
+
+ import torch
+
+ torch.cuda.empty_cache()
+ print("Is CUDA enabled? ", torch.cuda.is_available())
+ print("Is a CUDA device available on this computer?", torch.backends.cudnn.enabled)
+ if torch.cuda.is_available():
+ torch_device = "cuda"
+ print("CUDA version:", torch.version.cuda)
+ # try:
+ # os.system("nvidia-smi")
+ # except Exception as e:
+ # print("Could not print nvidia-smi settings due to:", e)
+ else:
+ torch_device = "cpu"
+ gpu_layers = 0
+
+ print("Running on device:", torch_device)
+ print("GPU layers assigned to cuda:", gpu_layers)
+
+ if not LLM_THREADS:
+ threads = torch.get_num_threads()
+ else:
+ threads = LLM_THREADS
+ print("CPU threads:", threads)
+
+ # GPU mode
+ if torch_device == "cuda":
+ torch.cuda.empty_cache()
+ gpu_context.update_gpu(gpu_layers)
+ gpu_context.update_context(max_context_length)
+
+ from transformers import (
+ AutoModelForCausalLM,
+ AutoTokenizer,
+ BitsAndBytesConfig,
+ )
+
+ print("Loading model from transformers")
+ # Use the official model ID for Gemma 3 4B
+ model_id = repo_id
+ # 1. Set Data Type (dtype)
+ # For H200/Hopper: 'bfloat16'
+ # For RTX 3060/Ampere: 'float16'
+ dtype_str = model_dtype # os.environ.get("LLM_MODEL_DTYPE", "bfloat16").lower()
+ if dtype_str == "bfloat16":
+ torch_dtype = torch.bfloat16
+ elif dtype_str == "float16":
+ torch_dtype = torch.float16
+ elif dtype_str == "auto":
+ torch_dtype = "auto"
+ else:
+ torch_dtype = torch.float32 # A safe fallback
+
+ # 2. Set Compilation Mode
+ # 'max-autotune' is great for both but can be slow initially.
+ # 'reduce-overhead' is a faster alternative for compiling.
+
+ print("--- System Configuration ---")
+ print(f"Using model id: {model_id}")
+ print(f"Using dtype: {torch_dtype}")
+ print(f"Using compile mode: {compile_mode}")
+ print(f"Using quantization: {QUANTISE_TRANSFORMERS_LLM_MODELS}")
+ print("--------------------------\n")
+
+ # --- Load Tokenizer and Model Atomically ---
+ # Ensure both model and tokenizer are loaded from the same source
+ # If either fails, both should fail together to prevent mismatched pairs
+
+ try:
+ # Setup quantization config if enabled
+ quantization_config = None
+ if QUANTISE_TRANSFORMERS_LLM_MODELS:
+ if not torch.cuda.is_available():
+ print(
+ "Warning: Quantisation requires CUDA, but CUDA is not available."
+ )
+ print("Falling back to loading models without quantisation")
+ quantization_config = None
+ else:
+ if INT8_WITH_OFFLOAD_TO_CPU:
+ # This will be very slow. Requires at least 4GB of VRAM and 32GB of RAM
+ print(
+ "Using bitsandbytes for quantisation to 8 bits, with offloading to CPU"
+ )
+ max_memory = {0: "4GB", "cpu": "32GB"}
+ quantization_config = BitsAndBytesConfig(
+ load_in_8bit=True,
+ max_memory=max_memory,
+ llm_int8_enable_fp32_cpu_offload=True, # Note: if bitsandbytes has to offload to CPU, inference will be slow
+ )
+ else:
+ # For Gemma 4B, requires at least 6GB of VRAM
+ print("Using bitsandbytes for quantisation to 4 bits")
+ quantization_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_quant_type="nf4", # Use the modern NF4 quantisation for better performance
+ bnb_4bit_compute_dtype=torch_dtype,
+ # bnb_4bit_use_double_quant=True, # Optional: uses a second quantisation step to save even more memory
+ )
+
+ # Prepare load kwargs
+ # Match VLM behavior: always use device_map="auto" for better device handling
+ load_kwargs = {
+ # "max_seq_length": max_context_length,
+ "token": hf_token,
+ "device_map": "auto", # Always use device_map="auto" like VLM
+ }
+
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ print("Loading model with bitsandbytes quantisation")
+ else:
+ # Use "auto" dtype like VLM for better compatibility
+ load_kwargs["dtype"] = "auto" if model_dtype == "auto" else torch_dtype
+ print("Loading model without quantisation")
+
+ # Load tokenizer FIRST to validate the model_id is accessible
+ # This ensures we catch tokenizer errors before loading the (larger) model
+ print(f"Loading tokenizer from {model_id}...")
+ tokenizer = AutoTokenizer.from_pretrained(
+ model_id,
+ token=hf_token,
+ trust_remote_code=True,
+ )
+
+ if not tokenizer.pad_token:
+ tokenizer.pad_token = tokenizer.eos_token
+ print("Tokenizer loaded successfully")
+
+ # Load model from the SAME model_id to ensure compatibility
+
+ if "qwen" in local_model_type.lower() and "3.5" in local_model_type.lower():
+ print(f"Loading Qwen 3.5 model from {model_id}...")
+ from transformers import (
+ Qwen3_5ForCausalLM,
+ )
+
+ model = Qwen3_5ForCausalLM.from_pretrained(
+ model_id,
+ trust_remote_code=True,
+ **load_kwargs,
+ )
+ elif (
+ "qwen" in local_model_type.lower() and "3 " in local_model_type.lower()
+ ):
+ print(f"Loading Qwen 3 model from {model_id}...")
+ from transformers import Qwen3VLForConditionalGeneration
+
+ model = Qwen3VLForConditionalGeneration.from_pretrained(
+ model_id,
+ trust_remote_code=True,
+ **load_kwargs,
+ )
+ else:
+ print(f"Loading model from {model_id}...")
+ model = AutoModelForCausalLM.from_pretrained(
+ model_id,
+ trust_remote_code=True,
+ **load_kwargs,
+ )
+
+ # Set model to evaluation mode (standard transformers approach)
+ # Note: With device_map="auto", don't manually move model - let it handle device placement
+ model.eval()
+ print("Model loaded successfully")
+
+ # Validate that model and tokenizer are from the same source
+ if hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
+ model_source = model.config._name_or_path
+ if hasattr(tokenizer, "name_or_path"):
+ tokenizer_source = tokenizer.name_or_path
+ if model_source != tokenizer_source and model_id not in [
+ model_source,
+ tokenizer_source,
+ ]:
+ print(
+ f"Warning: Model source ({model_source}) and tokenizer source ({tokenizer_source}) may differ. Using model_id: {model_id}"
+ )
+
+ except Exception as e:
+ # If loading fails, ensure both model and tokenizer are None to prevent partial state
+ print(f"Error loading model and tokenizer: {e}")
+ model = None
+ tokenizer = None
+ raise RuntimeError(
+ f"Failed to load model and tokenizer from {model_id}: {e}"
+ ) from e
+
+ # Compile the Model with the selected mode 🚀
+ if COMPILE_TRANSFORMERS:
+ try:
+ model = torch.compile(model, mode=compile_mode, fullgraph=False)
+ except Exception as e:
+ print(f"Could not compile model: {e}. Running in eager mode.")
+
+ print(
+ "Loading with",
+ gpu_context.n_gpu_layers,
+ "model layers sent to GPU and a maximum context length of",
+ gpu_context.n_ctx,
+ )
+
+ # CPU mode
+ else:
+ try:
+ from transformers import AutoTokenizer
+
+ model_id = repo_id
+ tokenizer = AutoTokenizer.from_pretrained(
+ model_id,
+ token=hf_token,
+ trust_remote_code=True,
+ )
+ if not tokenizer.pad_token:
+ tokenizer.pad_token = tokenizer.eos_token
+ print(f"Loaded tokenizer from {model_id} for compatibility")
+ except Exception as e:
+ print(f"Warning: Could not load tokenizer: {e}")
+ tokenizer = None
+
+ print(
+ "Loading with",
+ cpu_context.n_gpu_layers,
+ "model layers sent to GPU and a maximum context length of",
+ cpu_context.n_ctx,
+ )
+
+ print("Finished loading model:", local_model_type)
+ print("GPU layers assigned to cuda:", gpu_layers)
+
+ # Load assistant model for speculative decoding if enabled
+ # Note: Assistant model typically shares the same tokenizer as the main model
+ # for speculative decoding, so we don't load a separate tokenizer for it
+ if speculative_decoding and torch_device == "cuda":
+ print("Loading assistant model for speculative decoding:", ASSISTANT_MODEL)
+ try:
+ from transformers import (
+ AutoModelForCausalLM,
+ BitsAndBytesConfig,
+ )
+
+ # Setup quantization config for assistant model (same as main model)
+ assistant_quantization_config = None
+ if QUANTISE_TRANSFORMERS_LLM_MODELS and torch.cuda.is_available():
+ if INT8_WITH_OFFLOAD_TO_CPU:
+ max_memory = {0: "4GB", "cpu": "32GB"}
+ assistant_quantization_config = BitsAndBytesConfig(
+ load_in_8bit=True,
+ max_memory=max_memory,
+ llm_int8_enable_fp32_cpu_offload=True,
+ )
+ else:
+ assistant_quantization_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_compute_dtype=torch_dtype,
+ bnb_4bit_use_double_quant=True,
+ )
+
+ # Prepare load kwargs for assistant model
+ assistant_load_kwargs = {
+ "token": hf_token,
+ }
+
+ if assistant_quantization_config is not None:
+ assistant_load_kwargs["quantization_config"] = (
+ assistant_quantization_config
+ )
+ assistant_load_kwargs["device_map"] = "auto"
+ print("Loading assistant model with bitsandbytes quantisation")
+ else:
+ assistant_load_kwargs["dtype"] = torch_dtype
+ print("Loading assistant model without quantisation")
+
+ # Load the assistant model from ASSISTANT_MODEL
+ # Note: Assistant model should be compatible with the main model's tokenizer
+ # for speculative decoding to work correctly
+ print(f"Loading assistant model from {ASSISTANT_MODEL}...")
+ assistant_model = AutoModelForCausalLM.from_pretrained(
+ ASSISTANT_MODEL, **assistant_load_kwargs
+ )
+
+ # For non-quantized assistant models, explicitly move to device (matching VLM behavior)
+ if assistant_quantization_config is None:
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ assistant_model = assistant_model.to(device)
+
+ # Validate that assistant model can work with the main tokenizer
+ # For speculative decoding, both models should use compatible tokenizers
+ if hasattr(assistant_model, "config") and hasattr(
+ assistant_model.config, "_name_or_path"
+ ):
+ assistant_source = assistant_model.config._name_or_path
+ if hasattr(tokenizer, "name_or_path"):
+ tokenizer_source = tokenizer.name_or_path
+ if assistant_source != tokenizer_source:
+ print(
+ f"Warning: Assistant model ({assistant_source}) and tokenizer ({tokenizer_source}) are from different sources."
+ )
+ print(
+ "This may cause issues with speculative decoding. Ensure they are compatible."
+ )
+
+ # Compile the assistant model if compilation is enabled
+ if COMPILE_TRANSFORMERS:
+ try:
+ assistant_model = torch.compile(
+ assistant_model, mode=compile_mode, fullgraph=False
+ )
+ except Exception as e:
+ print(
+ f"Could not compile assistant model: {e}. Running in eager mode."
+ )
+
+ print("Successfully loaded assistant model for speculative decoding")
+ print("Note: Assistant model uses the same tokenizer as the main model")
+
+ except Exception as e:
+ print(f"Error loading assistant model: {e}")
+ assistant_model = None
+ else:
+ assistant_model = None
+
+ return model, tokenizer, assistant_model
+
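+# Minimal usage sketch for load_model (illustrative, not executed at import): loads the
+# configured local PII model and tokenizer on demand, e.g. when
+# LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START is False.
+#
+#   model, tokenizer, assistant_model = load_model(
+#       local_model_type=LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE,
+#       repo_id=LOCAL_TRANSFORMERS_LLM_PII_REPO_ID,
+#       max_context_length=context_length,
+#   )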
+
+# Initialize PII model at startup if configured (even if SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS is False)
+# This allows PII model to be loaded independently for PII detection tasks
+if (
+ LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START
+ and SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS
+):
+ try:
+ print("Loading local PII model:", LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE)
+ _pii_model, _pii_tokenizer, _pii_assistant_model = load_model(
+ local_model_type=LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE,
+ max_context_length=context_length,
+ gpu_context=local_gpu_context,
+ cpu_context=local_cpu_context,
+ repo_id=LOCAL_TRANSFORMERS_LLM_PII_REPO_ID,
+ model_filename="",
+ model_dir="",
+ compile_mode=COMPILE_MODE,
+ model_dtype=LLM_MODEL_DTYPE,
+ hf_token=HF_TOKEN,
+ model=_pii_model,
+ tokenizer=_pii_tokenizer,
+ assistant_model=_pii_assistant_model,
+ )
+ except Exception as e:
+ print(f"Warning: Could not load PII model at startup: {e}")
+ print("PII model will be loaded on-demand when needed.")
+
+
+@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
+def call_transformers_model(
+ prompt: str,
+ system_prompt: str,
+ gen_config: LocalLLMGenerationConfig,
+ model=_pii_model,
+ tokenizer=_pii_tokenizer,
+ assistant_model=_pii_assistant_model,
+ speculative_decoding=speculative_decoding,
+ use_vlm_safe_generation=VLM_DEFAULT_DO_SAMPLE,
+):
+ """
+ This function sends a request to a transformers model with the given prompt, system prompt, and generation configuration.
+ When use_vlm_safe_generation is True (e.g. VLM model used for LLM tasks), uses greedy decoding to avoid
+ sampling-related CUDA errors (e.g. invalid probability tensor in multinomial).
+ """
+ import torch
+ from transformers import TextStreamer
+
+ # Custom streamer that reports streamed output to gr.Info when REPORT_LLM_OUTPUTS_TO_GUI is True
+ class _LLMGUIStreamer(TextStreamer):
+ def __init__(self, tokenizer, skip_prompt=True):
+ super().__init__(tokenizer, skip_prompt=skip_prompt)
+ self._line_buffer = ""
+
+ def on_finalized_text(self, text, stream_end=False):
+ super().on_finalized_text(text, stream_end)
+ if not REPORT_LLM_OUTPUTS_TO_GUI:
+ return
+ self._line_buffer += text
+ if "\n" in text or stream_end:
+ parts = self._line_buffer.split("\n")
+ for line in parts[:-1]:
+ if line.strip():
+ _report_llm_output_to_gui(line)
+ self._line_buffer = parts[-1] if parts else ""
+ if stream_end and self._line_buffer.strip():
+ _report_llm_output_to_gui(self._line_buffer)
+
+ # Load model and tokenizer together to ensure they're from the same source
+ # This prevents mismatches that could occur if they're loaded separately
+ if model is None or tokenizer is None:
+ print("Model not found. Loading model and tokenizer...")
+ # Use get_model_and_tokenizer() to ensure both are loaded atomically
+ # This is safer than calling get_pii_model() and get_pii_tokenizer() separately
+ loaded_model, loaded_tokenizer, assistant_model = load_model()
+ if model is None:
+ model = loaded_model
+ if tokenizer is None:
+ tokenizer = loaded_tokenizer
+ # if assistant_model is None and speculative_decoding:
+ # assistant_model = # get_assistant_model()
+
+ if model is None or tokenizer is None:
+ raise ValueError(
+ "No model or tokenizer available. Either pass them as parameters or ensure LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START is True."
+ )
+
+ # Apply reasoning suffix to prompt if configured
+ if REASONING_SUFFIX and REASONING_SUFFIX.strip():
+ prompt = f"{prompt} {REASONING_SUFFIX}".strip()
+
+ # When using VLM as LLM with Qwen3.5 thinking disabled, we append after the generation
+ # prompt so the model continues with the answer (avoids continue_final_message which can fail
+ # when the chat template does not include the final assistant message in the rendered string).
+ add_nothink_assistant_turn = (
+ VLM_DISABLE_QWEN3_5_THINKING
+ and "Qwen 3.5" in LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ ) or (
+ VLM_DISABLE_QWEN3_5_THINKING
+ and USE_TRANSFORMERS_VLM_MODEL_AS_LLM
+ and (
+ "Qwen 3.5" in SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ or "Qwen3.5" in SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ )
+ )
+
+ # 1. Define the conversation as a list of dictionaries
+ # Note: The multimodal format [{"type": "text", "text": text}] is only needed for actual multimodal models
+ # with images/videos. For text-only content, even multimodal models expect plain strings.
+
+ # Check if system_prompt is meaningful (not empty/None)
+ has_system_prompt = system_prompt and str(system_prompt).strip()
+
+ # Always use string format for text-only content, regardless of MULTIMODAL_PROMPT_FORMAT setting
+ # MULTIMODAL_PROMPT_FORMAT should only be used when you actually have multimodal inputs (images, etc.)
+ if MULTIMODAL_PROMPT_FORMAT:
+ conversation = []
+ if has_system_prompt:
+ conversation.append(
+ {
+ "role": "system",
+ "content": [{"type": "text", "text": str(system_prompt)}],
+ }
+ )
+ conversation.append(
+ {"role": "user", "content": [{"type": "text", "text": str(prompt)}]}
+ )
+ else:
+ conversation = []
+ if has_system_prompt:
+ conversation.append({"role": "system", "content": str(system_prompt)})
+ conversation.append({"role": "user", "content": str(prompt)})
+
+ if PRINT_TRANSFORMERS_USER_PROMPT:
+ print("System prompt:", system_prompt)
+ print("User prompt:", prompt)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+
+ if assistant_model is not None:
+ assistant_model = assistant_model.to(device)
+
+ if PRINT_TRANSFORMERS_USER_PROMPT:
+ print("Model device:", device)
+ print("Model device type:", type(device))
+
+ try:
+ # Try applying chat template with system prompt (if present)
+ # Create inputs dict like VLM does - this allows model to handle device placement automatically
+ # From transformers v5, apply_chat_template returns BatchEncoding; extract input_ids tensor
+ _encoded = tokenizer.apply_chat_template(
+ conversation,
+ add_generation_prompt=True,
+ tokenize=True,
+ return_tensors="pt",
+ )
+ input_ids = (
+ _encoded["input_ids"].to(device)
+ if hasattr(_encoded, "keys")
+ else _encoded.to(device)
+ )
+
+ if PRINT_TRANSFORMERS_USER_PROMPT:
+ print("Input IDs:", input_ids)
+ print("Rendered prompt:")
+ rendered = tokenizer.apply_chat_template(
+ conversation,
+ add_generation_prompt=True,
+ tokenize=False,
+ )
+ print(rendered)
+ print("-" * 50)
+
+ except (TypeError, KeyError, IndexError, ValueError) as e:
+ # If chat template fails, try without system prompt (some models don't support it)
+ if has_system_prompt:
+ print(
+ f"Chat template failed with system prompt ({e}), trying without system prompt..."
+ )
+ # Try again with only user prompt
+ user_only_conversation = [{"role": "user", "content": str(prompt)}]
+ try:
+ _encoded = tokenizer.apply_chat_template(
+ user_only_conversation,
+ add_generation_prompt=True,
+ tokenize=True,
+ return_tensors="pt",
+ )
+ input_ids = (
+ _encoded["input_ids"].to(device)
+ if hasattr(_encoded, "keys")
+ else _encoded.to(device)
+ )
+
+ if PRINT_TRANSFORMERS_USER_PROMPT:
+ print("Input IDs:", input_ids)
+ print("Rendered prompt (without system):")
+ rendered = tokenizer.apply_chat_template(
+ user_only_conversation,
+ add_generation_prompt=True,
+ tokenize=False,
+ )
+ print(rendered)
+ print("-" * 50)
+ except Exception as e2:
+ print(
+ f"Chat template failed without system prompt ({e2}), using manual tokenization"
+ )
+ # Combine system and user prompts manually as fallback
+ full_prompt = (
+ f"{system_prompt}\n\n{prompt}" if has_system_prompt else prompt
+ )
+ # Tokenize manually with special tokens (tokenizer() returns BatchEncoding; extract tensor)
+ encoded = tokenizer(
+ full_prompt, return_tensors="pt", add_special_tokens=True
+ )
+ input_ids = encoded["input_ids"].to(device)
+
+ else:
+ # No system prompt, but chat template still failed - use manual tokenization
+ print(f"Chat template failed ({e}), using manual tokenization")
+ full_prompt = str(prompt)
+ encoded = tokenizer(
+ full_prompt, return_tensors="pt", add_special_tokens=True
+ )
+ input_ids = encoded["input_ids"].to(device)
+
+ except Exception as e:
+ print("Error applying chat template:", e)
+ import traceback
+
+ traceback.print_exc()
+ raise
+
+ attention_mask = torch.ones_like(input_ids).to(device)
+
+ # When disabling Qwen3.5 thinking, append suffix to prompt so model continues with the answer (same as run_vlm).
+ if add_nothink_assistant_turn:
+ nothink_tokens = tokenizer.encode(
+ VLM_QWEN3_5_NOTHINK_SUFFIX, add_special_tokens=False, return_tensors="pt"
+ )
+ if nothink_tokens.dim() == 1:
+ nothink_tokens = nothink_tokens.unsqueeze(0)
+ nothink_tokens = nothink_tokens.to(device)
+ input_ids = torch.cat([input_ids, nothink_tokens], dim=1)
+ attention_mask = torch.cat(
+ [
+ attention_mask,
+ torch.ones(
+ (attention_mask.shape[0], nothink_tokens.shape[1]),
+ device=device,
+ dtype=attention_mask.dtype,
+ ),
+ ],
+ dim=1,
+ )
+
+ # Map generation config to transformers parameters.
+ # When use_vlm_safe_generation (VLM model used for LLM tasks), use greedy decoding to avoid
+ # "probability tensor contains inf/nan or element < 0" errors in torch.multinomial on some setups.
+ if use_vlm_safe_generation:
+ generation_kwargs = {
+ "max_new_tokens": gen_config.max_tokens,
+ "do_sample": False,
+ "attention_mask": attention_mask,
+ }
+ else:
+ generation_kwargs = {
+ "max_new_tokens": gen_config.max_tokens,
+ "temperature": gen_config.temperature,
+ "top_p": gen_config.top_p,
+ "top_k": gen_config.top_k,
+ "do_sample": True,
+ "attention_mask": attention_mask,
+ }
+
+ if gen_config.stream:
+ streamer = (
+ _LLMGUIStreamer(tokenizer, skip_prompt=True)
+ if REPORT_LLM_OUTPUTS_TO_GUI
+ else TextStreamer(tokenizer, skip_prompt=True)
+ )
+ else:
+ streamer = None
+
+    # repetition_penalty is supported by transformers for both sampling and greedy decoding, so pass it through when set
+ if hasattr(gen_config, "repeat_penalty") and gen_config.repeat_penalty is not None:
+ generation_kwargs["repetition_penalty"] = gen_config.repeat_penalty
+
+ if PRINT_TRANSFORMERS_USER_PROMPT:
+ print("Generation kwargs:", generation_kwargs)
+
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+
+ model.config.pad_token_id = tokenizer.pad_token_id
+
+ # --- Timed Inference Test ---
+ print("\nStarting model inference...")
+ start_time = time.time()
+
+ # Use speculative decoding if assistant model is available
+ try:
+ if speculative_decoding and assistant_model is not None:
+ if PRINT_TRANSFORMERS_USER_PROMPT:
+ print("Using speculative decoding with assistant model")
+ outputs = model.generate(
+ input_ids,
+ assistant_model=assistant_model,
+ **generation_kwargs,
+ streamer=streamer,
+ )
+ else:
+ if PRINT_TRANSFORMERS_USER_PROMPT:
+ print("Generating without speculative decoding")
+ outputs = model.generate(input_ids, **generation_kwargs, streamer=streamer)
+ except Exception as e:
+ error_msg = str(e)
+ # Check if this is a CUDA compilation error
+ if (
+ "sm_120" in error_msg
+ or "LLVM ERROR" in error_msg
+ or "Cannot select" in error_msg
+ ):
+ print("\n" + "=" * 80)
+ print("CUDA COMPILATION ERROR DETECTED")
+ print("=" * 80)
+ print(
+ "\nThe error is caused by torch.compile() trying to compile CUDA kernels"
+ )
+ print(
+ "with incompatible settings. This is a known issue with certain CUDA/PyTorch"
+ )
+ print("combinations.\n")
+ print(
+ "SOLUTION: Disable model compilation by setting COMPILE_TRANSFORMERS=False"
+ )
+ print("in your config file (config/app_config.env).")
+ print(
+ "\nThe model will still work without compilation, just slightly slower."
+ )
+ print("=" * 80 + "\n")
+ raise RuntimeError(
+ "CUDA compilation error detected. Please set COMPILE_TRANSFORMERS=False "
+ "in your config file to disable model compilation and avoid this error."
+ ) from e
+ else:
+ # Re-raise other errors as-is
+ raise
+
+ end_time = time.time()
+
+ # --- Decode and Display Results ---
+ # Extract only the newly generated tokens (exclude input tokens)
+ input_length = input_ids.shape[-1]
+
+ # Handle different output formats from model.generate()
+ # model.generate() returns a tensor with shape [batch_size, sequence_length]
+ # that includes both input and generated tokens
+ if isinstance(outputs, torch.Tensor):
+ # If outputs is a tensor, extract the new tokens
+ if outputs.dim() == 2:
+ # Shape: [batch_size, sequence_length]
+ new_tokens = outputs[0, input_length:].clone()
+ elif outputs.dim() == 1:
+ # Shape: [sequence_length] (single sequence)
+ new_tokens = outputs[input_length:].clone()
+ else:
+ raise ValueError(f"Unexpected output tensor shape: {outputs.shape}")
+ else:
+ # If outputs is a sequence or other format
+ if hasattr(outputs, "__getitem__"):
+ new_tokens = (
+ outputs[0][input_length:]
+ if len(outputs) > 0
+ else outputs[input_length:]
+ )
+ else:
+ raise ValueError(f"Unexpected output type: {type(outputs)}")
+
+ # Ensure new_tokens is a tensor and on CPU for decoding
+ if isinstance(new_tokens, torch.Tensor):
+ new_tokens = new_tokens.cpu().clone()
+ # Convert to list for decoding (some tokenizers prefer lists)
+ new_tokens_list = new_tokens.tolist()
+ else:
+ new_tokens_list = (
+ list(new_tokens) if hasattr(new_tokens, "__iter__") else [new_tokens]
+ )
+
+ if PRINT_TRANSFORMERS_USER_PROMPT:
+ print(f"Input length: {input_length}")
+ print(f"Output shape: {outputs.shape if hasattr(outputs, 'shape') else 'N/A'}")
+ print(f"New tokens count: {len(new_tokens_list)}")
+ print(f"First 20 new token IDs: {new_tokens_list[:20]}")
+
+ # Decode the tokens
+ # Use the token list for decoding (more reliable than tensor)
+ try:
+ assistant_reply = tokenizer.decode(
+ new_tokens_list, skip_special_tokens=True, clean_up_tokenization_spaces=True
+ )
+ except Exception as e:
+ print(f"Warning: Error decoding tokens: {e}")
+ print(f"New tokens count: {len(new_tokens_list)}")
+ print(f"New tokens (first 20): {new_tokens_list[:20]}")
+ # Try alternative decoding methods
+ try:
+ # Try with tensor directly
+ if isinstance(new_tokens, torch.Tensor):
+ assistant_reply = tokenizer.decode(
+ new_tokens,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=True,
+ )
+ else:
+ raise e
+ except Exception as e2:
+ print(f"Error with tensor decoding: {e2}")
+ # Last resort: try to decode each token individually to see which ones fail
+ try:
+ decoded_parts = []
+ failed_tokens = []
+ for i, token_id in enumerate(
+ new_tokens_list[:200]
+ ): # Limit to first 200 to avoid issues
+ try:
+ decoded = tokenizer.decode([token_id], skip_special_tokens=True)
+ decoded_parts.append(decoded)
+ except Exception as token_error:
+ failed_tokens.append((i, token_id, str(token_error)))
+ decoded_parts.append(f"")
+ if failed_tokens:
+ print(
+ f"Warning: {len(failed_tokens)} tokens failed to decode individually"
+ )
+ print(f"First few failed tokens: {failed_tokens[:5]}")
+ assistant_reply = "".join(decoded_parts)
+ except Exception as e3:
+ print(f"Error with individual token decoding: {e3}")
+ assistant_reply = f""
+
+ num_input_tokens = input_length
+ num_generated_tokens = (
+ len(new_tokens_list) if hasattr(new_tokens_list, "__len__") else 0
+ )
+ duration = end_time - start_time
+ tokens_per_second = num_generated_tokens / duration if duration > 0 else 0
+
+ if PRINT_TRANSFORMERS_USER_PROMPT:
+ print(f"\nDecoded output length: {len(assistant_reply)} characters")
+ print(f"First 200 chars of output: {assistant_reply[:200]}")
+
+ print("\n--- Performance ---")
+ print(f"Time taken: {duration:.2f} seconds")
+ print(f"Generated tokens: {num_generated_tokens}")
+ print(f"Tokens per second: {tokens_per_second:.2f}")
+
+ return assistant_reply, num_input_tokens, num_generated_tokens
+
+
+# Function to send a request and update history
+def send_request(
+ prompt: str,
+ conversation_history: List[dict],
+ client: ai.Client | OpenAI,
+ config: types.GenerateContentConfig,
+ model_choice: str,
+ system_prompt: str,
+ temperature: float,
+ bedrock_runtime: boto3.Session.client,
+ model_source: str,
+ local_model=_pii_model,
+ tokenizer=_pii_tokenizer,
+ assistant_model=_pii_assistant_model,
+ assistant_prefill="",
+ progress=Progress(track_tqdm=True),
+ api_url: str = None,
+) -> Tuple[object, List[dict], str, int, int]:
+ """Sends a request to a language model and manages the conversation history.
+
+ This function constructs the full prompt by appending the new user prompt to the conversation history,
+ generates a response from the model, and updates the conversation history with the new prompt and response.
+ It handles different model sources (Gemini, AWS, Local, inference-server) and includes retry logic for API calls.
+
+ Args:
+ prompt (str): The user's input prompt to be sent to the model.
+ conversation_history (List[dict]): A list of dictionaries representing the ongoing conversation.
+ Each dictionary should have 'role' and 'parts' keys.
+ client (ai.Client): The API client object for the chosen model (e.g., Gemini `ai.Client`, or Azure/OpenAI `OpenAI`).
+ config (types.GenerateContentConfig): Configuration settings for content generation (e.g., Gemini `types.GenerateContentConfig`).
+ model_choice (str): The specific model identifier to use (e.g., "gemini-pro", "claude-v2").
+ system_prompt (str): An optional system-level instruction or context for the model.
+ temperature (float): Controls the randomness of the model's output, with higher values leading to more diverse responses.
+ bedrock_runtime (boto3.Session.client): The boto3 Bedrock runtime client object for AWS models.
+ model_source (str): Indicates the source/provider of the model (e.g., "Gemini", "AWS", "Local", "inference-server").
+        local_model (object, optional): The local model object to use when `model_source` is "Local". Defaults to the preloaded PII model.
+ tokenizer (object, optional): The tokenizer object for local models. Defaults to None.
+ assistant_model (object, optional): An optional assistant model used for speculative decoding with local models. Defaults to None.
+ assistant_prefill (str, optional): A string to pre-fill the assistant's response, useful for certain models like Claude. Defaults to "".
+ progress (Progress, optional): A progress object for tracking the operation, typically from `tqdm`. Defaults to Progress(track_tqdm=True).
+ api_url (str, optional): The API URL for inference-server calls. Required when model_source is 'inference-server'.
+
+ Returns:
+        Tuple[object, List[dict], str, int, int]: The raw model response, the updated conversation history, the extracted response text, and the input/output token counts recorded for local transformers calls.
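+
+    Example:
+        # Illustrative sketch only; the Bedrock model ID below is an assumption and
+        # must be enabled in your AWS account.
+        response, history, text, in_toks, out_toks = send_request(
+            prompt="Summarise the document.",
+            conversation_history=[],
+            client=None,  # not used for AWS calls
+            config=None,  # not used for AWS calls
+            model_choice="anthropic.claude-3-haiku-20240307-v1:0",
+            system_prompt="You are a redaction assistant.",
+            temperature=0.0,
+            bedrock_runtime=boto3.client("bedrock-runtime"),
+            model_source="AWS",
+        )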
+ """
+ # Constructing the full prompt from the conversation history
+ full_prompt = "Conversation history:\n"
+ num_transformer_input_tokens = 0
+ num_transformer_generated_tokens = 0
+ response_text = ""
+ if not model_choice or model_choice == "":
+ model_choice = None
+
+ for entry in conversation_history:
+ role = entry[
+ "role"
+ ].capitalize() # Assuming the history is stored with 'role' and 'parts'
+ message = " ".join(entry["parts"]) # Combining all parts of the message
+ full_prompt += f"{role}: {message}\n"
+
+ # Adding the new user prompt
+ full_prompt += f"\nUser: {prompt}"
+
+ # Clear any existing progress bars
+ tqdm._instances.clear()
+
+ progress_bar = range(0, number_of_api_retry_attempts)
+
+ # Generate the model's response
+ if "Gemini" in model_source:
+
+ for i in progress_bar:
+ try:
+ print("Calling Gemini model, attempt", i + 1)
+
+ response = client.models.generate_content(
+ model=model_choice, contents=full_prompt, config=config
+ )
+
+ # print("Successful call to Gemini model.")
+ break
+ except Exception as e:
+ # If fails, try again after X seconds in case there is a throttle limit
+ print(
+ "Call to Gemini model failed:",
+ e,
+ " Waiting for ",
+ str(timeout_wait),
+ "seconds and trying again.",
+ )
+
+ time.sleep(timeout_wait)
+
+        if response is None:
+ return (
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
+ conversation_history,
+ response_text,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ )
+
+ elif "AWS" in model_source:
+ for i in progress_bar:
+ try:
+ # print("Calling AWS Bedrock model, attempt", i + 1)
+ response = call_aws_bedrock(
+ prompt,
+ system_prompt,
+ temperature,
+ max_tokens,
+ model_choice,
+ bedrock_runtime=bedrock_runtime,
+ assistant_prefill=assistant_prefill,
+ )
+
+ # print("Successful call to Claude model.")
+ break
+ except Exception as e:
+ # If fails, try again after X seconds in case there is a throttle limit
+ print(
+ "Call to Bedrock model failed:",
+ e,
+ " Waiting for ",
+ str(timeout_wait),
+ "seconds and trying again.",
+ )
+ time.sleep(timeout_wait)
+
+        if response is None:
+ return (
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
+ conversation_history,
+ response_text,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ )
+ elif "Azure/OpenAI" in model_source:
+ for i in progress_bar:
+ try:
+ print("Calling Azure/OpenAI inference model, attempt", i + 1)
+
+ messages = [
+ {
+ "role": "system",
+ "content": system_prompt,
+ },
+ {
+ "role": "user",
+ "content": prompt,
+ },
+ ]
+
+ response_raw = client.chat.completions.create(
+ messages=messages,
+ model=model_choice,
+ temperature=temperature,
+ max_completion_tokens=max_tokens,
+ )
+
+ response_text = response_raw.choices[0].message.content
+ usage = getattr(response_raw, "usage", None)
+ input_tokens = 0
+ output_tokens = 0
+ if usage is not None:
+ input_tokens = getattr(
+ usage, "input_tokens", getattr(usage, "prompt_tokens", 0)
+ )
+ output_tokens = getattr(
+ usage, "output_tokens", getattr(usage, "completion_tokens", 0)
+ )
+ response = ResponseObject(
+ text=response_text,
+ usage_metadata={
+ "inputTokens": input_tokens,
+ "outputTokens": output_tokens,
+ },
+ )
+ break
+ except Exception as e:
+ print(
+ "Call to Azure/OpenAI model failed:",
+ e,
+ " Waiting for ",
+ str(timeout_wait),
+ "seconds and trying again.",
+ )
+ time.sleep(timeout_wait)
+        if response is None:
+            return (
+                ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
+                conversation_history,
+                response_text,
+                num_transformer_input_tokens,
+                num_transformer_generated_tokens,
+            )
+ elif "Local" in model_source:
+ # This is the local model. When USE_TRANSFORMERS_VLM_MODEL_AS_LLM and model_choice is the VLM model, use the loaded VLM model/tokenizer.
+ vlm_model, vlm_tokenizer = None, None
+ if (
+ USE_TRANSFORMERS_VLM_MODEL_AS_LLM
+ and model_choice == SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL
+ ):
+ try:
+ from tools.run_vlm import get_loaded_vlm_model_and_tokenizer
+
+ vlm_model, vlm_tokenizer = get_loaded_vlm_model_and_tokenizer()
+ except Exception as e:
+ print(
+ f"Could not get VLM model for LLM task (USE_TRANSFORMERS_VLM_MODEL_AS_LLM): {e}"
+ )
+
+ for i in progress_bar:
+ try:
+ print("Calling local model, attempt", i + 1)
+
+ gen_config = LocalLLMGenerationConfig()
+ gen_config.update_temp(temperature)
+
+ # Call transformers model; use VLM model/tokenizer when USE_TRANSFORMERS_VLM_MODEL_AS_LLM and available
+ if vlm_model is not None and vlm_tokenizer is not None:
+ (
+ response,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ ) = call_transformers_model(
+ prompt,
+ system_prompt,
+ gen_config,
+ model=vlm_model,
+ tokenizer=vlm_tokenizer,
+ use_vlm_safe_generation=VLM_DEFAULT_DO_SAMPLE,
+ )
+ else:
+ (
+ response,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ ) = call_transformers_model(
+ prompt,
+ system_prompt,
+ gen_config,
+ )
+ response_text = response
+
+ break
+ except Exception as e:
+ # If fails, try again after X seconds in case there is a throttle limit
+ print(
+ "Call to local model failed:",
+ e,
+ " Waiting for ",
+ str(timeout_wait),
+ "seconds and trying again.",
+ )
+
+ time.sleep(timeout_wait)
+
+        if response is None:
+ return (
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
+ conversation_history,
+ response_text,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ )
+ elif "inference-server" in model_source:
+ # This is the inference-server API
+ for i in progress_bar:
+ try:
+ print("Calling inference-server API, attempt", i + 1)
+
+ if api_url is None:
+ raise ValueError(
+ "api_url is required when model_source is 'inference-server'"
+ )
+
+ gen_config = LocalLLMGenerationConfig()
+ gen_config.update_temp(temperature)
+
+ response = call_inference_server_api(
+ prompt,
+ system_prompt,
+ gen_config,
+ api_url=api_url,
+ model_name=model_choice,
+ use_llama_swap=USE_LLAMA_SWAP,
+ )
+
+ break
+ except Exception as e:
+ # If fails, try again after X seconds in case there is a throttle limit
+ print(
+ "Call to inference-server API failed:",
+ e,
+ " Waiting for ",
+ str(timeout_wait),
+ "seconds and trying again.",
+ )
+
+ time.sleep(timeout_wait)
+
+        if response is None:
+ return (
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
+ conversation_history,
+ response_text,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ )
+ else:
+ print("Model source not recognised")
+ return (
+ ResponseObject(text="", usage_metadata={"RequestId": "FAILED"}),
+ conversation_history,
+ response_text,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ )
+
+ # Update the conversation history with the new prompt and response
+ conversation_history.append({"role": "user", "parts": [prompt]})
+
+ # Check if is a LLama.cpp model response or inference-server response
+ if isinstance(response, ResponseObject):
+ response_text = response.text
+ elif "choices" in response: # LLama.cpp model response or inference-server response
+ # Check for GPT-OSS thinking models (case-insensitive, handle both hyphen and underscore)
+ if "gpt-oss" in model_choice.lower() or "gpt_oss" in model_choice.lower():
+ content = _stringify_openai_message_content(
+ response["choices"][0]["message"].get("content")
+ )
+ # Split on the final channel marker to extract only the final output (not thinking tokens)
+ parts = content.split("<|start|>assistant<|channel|>final<|message|>")
+ if len(parts) > 1:
+ response_text = parts[1]
+ # Following format may be from llama.cpp inference-server response
+ elif len(parts) == 1:
+ parts = content.split("<|end|>")
+ if len(parts) > 1:
+ response_text = parts[1]
+ else:
+ print(
+ "Warning: Could not find final channel marker in GPT-OSS response. Using full content."
+ )
+ response_text = content
+ else:
+ # Fallback: if marker not found, use the full content (may include thinking tokens)
+ print(
+ "Warning: Could not find final channel marker in GPT-OSS response. Using full content."
+ )
+ response_text = content
+ else:
+ response_text = _extract_choice_message_text(response["choices"][0])
+ elif model_source == "Gemini":
+ response_text = response.text
+ else: # Assume transformers model response
+ # Check for GPT-OSS thinking models (case-insensitive, handle both hyphen and underscore)
+ if "gpt-oss" in model_choice.lower() or "gpt_oss" in model_choice.lower():
+ # Split on the final channel marker to extract only the final output (not thinking tokens)
+ parts = response.split("<|start|>assistant<|channel|>final<|message|>")
+ if len(parts) > 1:
+ response_text = parts[1]
+ else:
+ # Fallback: if marker not found, use the full content (may include thinking tokens)
+ print(
+ "Warning: Could not find final channel marker in GPT-OSS response. Using full content."
+ )
+ response_text = response
+ else:
+ response_text = response
+
+ # Strip <|end|> tags (used by GPT-OSS thinking models to mark end of thinking)
+ response_text = response_text or ""
+ response_text = re.sub(r"<\|end\|>", "", response_text)
+
+ # Replace multiple spaces with single space
+ response_text = re.sub(r" {2,}", " ", response_text)
+ response_text = response_text.strip()
+
+ conversation_history.append({"role": "assistant", "parts": [response_text]})
+
+ return (
+ response,
+ conversation_history,
+ response_text,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ )
+
+
+def process_requests(
+ prompts: List[str],
+ system_prompt: str,
+ conversation_history: List[dict],
+ whole_conversation: List[str],
+ whole_conversation_metadata: List[str],
+ client: ai.Client | OpenAI,
+ config: types.GenerateContentConfig,
+ model_choice: str,
+ temperature: float,
+ bedrock_runtime: boto3.Session.client,
+ model_source: str,
+ batch_no: int = 1,
+ local_model=_pii_model,
+ tokenizer=_pii_tokenizer,
+ assistant_model=_pii_assistant_model,
+ master: bool = False,
+ assistant_prefill="",
+ api_url: str = None,
+) -> Tuple[List[ResponseObject], List[dict], List[str], List[str], str]:
+ """
+ Processes a list of prompts by sending them to the model, appending the responses to the conversation history, and updating the whole conversation and metadata.
+
+ Args:
+ prompts (List[str]): A list of prompts to be processed.
+ system_prompt (str): The system prompt.
+ conversation_history (List[dict]): The history of the conversation.
+ whole_conversation (List[str]): The complete conversation including prompts and responses.
+ whole_conversation_metadata (List[str]): Metadata about the whole conversation.
+ client (object): The client to use for processing the prompts, from either Gemini or OpenAI client.
+ config (dict): Configuration for the model.
+ model_choice (str): The choice of model to use.
+ temperature (float): The temperature parameter for the model.
+ model_source (str): Source of the model, whether local, AWS, Gemini, or inference-server
+ batch_no (int): Batch number of the large language model request.
+ local_model: Local gguf model (if loaded)
+ master (bool): Is this request for the master table.
+ assistant_prefill (str, optional): Is there a prefill for the assistant response. Currently only working for AWS model calls
+ bedrock_runtime: The client object for boto3 Bedrock runtime
+ api_url (str, optional): The API URL for inference-server calls. Required when model_source is 'inference-server'.
+
+ Returns:
+        Tuple[List[ResponseObject], List[dict], List[str], List[str], str]: A tuple containing the list of responses, the updated conversation history, the updated whole conversation, the updated whole conversation metadata, and the text of the last response.
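+
+    Example:
+        # Illustrative sketch for a single-prompt AWS batch; the model ID below is
+        # an assumption and must be available in your AWS account.
+        responses, history, conversation, metadata, last_text = process_requests(
+            prompts=["Summarise the document."],
+            system_prompt="You are a redaction assistant.",
+            conversation_history=[],
+            whole_conversation=[],
+            whole_conversation_metadata=[],
+            client=None,
+            config=None,
+            model_choice="anthropic.claude-3-haiku-20240307-v1:0",
+            temperature=0.0,
+            bedrock_runtime=boto3.client("bedrock-runtime"),
+            model_source="AWS",
+        )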
+ """
+ responses = list()
+
+ # Clear any existing progress bars
+ tqdm._instances.clear()
+
+ for prompt in prompts:
+
+ (
+ response,
+ conversation_history,
+ response_text,
+ num_transformer_input_tokens,
+ num_transformer_generated_tokens,
+ ) = send_request(
+ prompt,
+ conversation_history,
+ client=client,
+ config=config,
+ model_choice=model_choice,
+ system_prompt=system_prompt,
+ temperature=temperature,
+ local_model=local_model,
+ tokenizer=tokenizer,
+ assistant_model=assistant_model,
+ assistant_prefill=assistant_prefill,
+ bedrock_runtime=bedrock_runtime,
+ model_source=model_source,
+ api_url=api_url,
+ )
+
+ responses.append(response)
+ whole_conversation.append(system_prompt)
+ whole_conversation.append(prompt)
+ whole_conversation.append(response_text)
+
+ whole_conversation_metadata.append(f"Batch {batch_no}:")
+
+ try:
+ if "AWS" in model_source:
+ output_tokens = response.usage_metadata.get("outputTokens", 0)
+ input_tokens = response.usage_metadata.get("inputTokens", 0)
+
+ elif "Gemini" in model_source:
+ output_tokens = response.usage_metadata.candidates_token_count
+ input_tokens = response.usage_metadata.prompt_token_count
+
+ elif "Azure/OpenAI" in model_source:
+ input_tokens = response.usage_metadata.get("inputTokens", 0)
+ output_tokens = response.usage_metadata.get("outputTokens", 0)
+
+ elif "Local" in model_source:
+ input_tokens = num_transformer_input_tokens
+ output_tokens = num_transformer_generated_tokens
+
+ elif "inference-server" in model_source:
+ # inference-server returns the same format as llama-cpp
+ output_tokens = response["usage"].get("completion_tokens", 0)
+ input_tokens = response["usage"].get("prompt_tokens", 0)
+
+ else:
+ input_tokens = 0
+ output_tokens = 0
+
+ whole_conversation_metadata.append(
+ "input_tokens: "
+ + str(input_tokens)
+ + " output_tokens: "
+ + str(output_tokens)
+ )
+
+ except KeyError as e:
+ print(f"Key error: {e} - Check the structure of response.usage_metadata")
+
+ return (
+ responses,
+ conversation_history,
+ whole_conversation,
+ whole_conversation_metadata,
+ response_text,
+ )
+
+
+def call_inference_server_api(
+ formatted_string: str,
+ system_prompt: str,
+ gen_config: LocalLLMGenerationConfig,
+ api_url: str = "http://localhost:8080",
+ model_name: str = None,
+ use_llama_swap: bool = USE_LLAMA_SWAP,
+):
+ """
+    Calls an inference-server API endpoint with a formatted user message and system prompt,
+ using generation parameters from the LocalLLMGenerationConfig object.
+
+ This function provides the same interface as call_transformers_model but calls
+ a remote inference-server instance instead of a local model.
+
+ Args:
+ formatted_string (str): The formatted input text for the user's message.
+ system_prompt (str): The system-level instructions for the model.
+ gen_config (LocalLLMGenerationConfig): An object containing generation parameters.
+ api_url (str): The base URL of the inference-server API (default: "http://localhost:8080").
+ model_name (str): Optional model name to use. If None, uses the default model.
+ use_llama_swap (bool): Whether to use llama-swap for the model.
+ Returns:
+ dict: Response in the same format as the inference-server chat completions API
+
+ Example:
+ # Create generation config
+ gen_config = LocalLLMGenerationConfig(temperature=0.7, max_tokens=100)
+
+ # Call the API
+ response = call_inference_server_api(
+ formatted_string="Hello, how are you?",
+ system_prompt="You are a helpful assistant.",
+ gen_config=gen_config,
+ api_url="http://localhost:8080"
+ )
+
+ # Extract the response text
+ response_text = response['choices'][0]['message']['content']
+
+ Integration Example:
+ # To use inference-server instead of local model:
+ # 1. Set model_source to "inference-server"
+ # 2. Provide api_url parameter
+ # 3. Call your existing functions as normal
+
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(
+ batch_prompts=["Your prompt here"],
+ system_prompt="Your system prompt",
+ conversation_history=[],
+ whole_conversation=[],
+ whole_conversation_metadata=[],
+ client=None, # Not used for inference-server
+ client_config=None, # Not used for inference-server
+ model_choice="your-model-name", # Model name on the server
+ temperature=0.7,
+ reported_batch_no=1,
+ local_model=None, # Not used for inference-server
+ tokenizer=None, # Not used for inference-server
+ bedrock_runtime=None, # Not used for inference-server
+ model_source="inference-server",
+ MAX_OUTPUT_VALIDATION_ATTEMPTS=3,
+ api_url="http://localhost:8080"
+ )
+ """
+ # Extract parameters from the gen_config object
+ temperature = gen_config.temperature
+ top_k = gen_config.top_k
+ top_p = gen_config.top_p
+ repeat_penalty = gen_config.repeat_penalty
+ seed = gen_config.seed
+ max_tokens = gen_config.max_tokens
+ stream = gen_config.stream
+
+ # Prepare the request payload
+ messages = [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": formatted_string},
+ ]
+
+ payload = {
+ "messages": messages,
+ "temperature": temperature,
+ "top_k": top_k,
+ "top_p": top_p,
+ "repeat_penalty": repeat_penalty,
+ "seed": seed,
+ "max_tokens": max_tokens,
+ "stream": stream,
+ "stop": LLM_STOP_STRINGS if LLM_STOP_STRINGS else [],
+ }
+ # Include model in payload when set (vLLM/OpenAI-compatible servers; llama-swap or not).
+    if model_name:
+ payload["model"] = model_name
+
+ # Match VLM path: Qwen3 / Qwen3.5 on vLLM may stream only "thinking" unless disabled.
+ if INFERENCE_SERVER_DISABLE_THINKING:
+ payload["chat_template_kwargs"] = {"enable_thinking": False}
+
+    # Chat completions endpoint (the same path serves streaming and non-streaming requests)
+    endpoint = f"{api_url}/v1/chat/completions"
+
+ try:
+ if stream:
+ # Handle streaming response
+ response = requests.post(
+ endpoint,
+ json=payload,
+ headers={"Content-Type": "application/json"},
+ stream=True,
+ timeout=timeout_wait,
+ )
+ response.raise_for_status()
+
+ final_tokens = []
+ output_tokens = 0
+ line_buffer = ""
+
+ for line in response.iter_lines():
+ if line:
+ line = line.decode("utf-8")
+ if line.startswith("data: "):
+ data = line[6:] # Remove 'data: ' prefix
+ if data.strip() == "[DONE]":
+ if REPORT_LLM_OUTPUTS_TO_GUI and line_buffer.strip():
+ _report_llm_output_to_gui(line_buffer)
+ break
+ try:
+ chunk = json.loads(data)
+ if "choices" in chunk and len(chunk["choices"]) > 0:
+ delta = chunk["choices"][0].get("delta", {})
+ token = delta.get("content")
+ token = _stringify_openai_message_content(token)
+ if not token:
+ for alt in (
+ "reasoning_content",
+ "reasoning",
+ ):
+ t = delta.get(alt)
+ if isinstance(t, str) and t:
+ token = t
+ break
+ if token:
+ print(token, end="", flush=True)
+ final_tokens.append(token)
+ output_tokens += 1
+ if REPORT_LLM_OUTPUTS_TO_GUI:
+ line_buffer += token
+ if "\n" in token:
+ parts = line_buffer.split("\n")
+ for complete_line in parts[:-1]:
+ if complete_line.strip():
+ _report_llm_output_to_gui(
+ complete_line
+ )
+ line_buffer = parts[-1] if parts else ""
+ except json.JSONDecodeError:
+ continue
+
+ if REPORT_LLM_OUTPUTS_TO_GUI and line_buffer.strip():
+ _report_llm_output_to_gui(line_buffer)
+ print() # newline after stream finishes
+
+ text = "".join(final_tokens)
+
+ # Estimate input tokens (rough approximation)
+ input_tokens = len((system_prompt + "\n" + formatted_string).split())
+
+ return {
+ "choices": [
+ {
+ "index": 0,
+ "finish_reason": "stop",
+ "message": {"role": "assistant", "content": text},
+ }
+ ],
+ "usage": {
+ "prompt_tokens": input_tokens,
+ "completion_tokens": output_tokens,
+ "total_tokens": input_tokens + output_tokens,
+ },
+ }
+ else:
+ # Handle non-streaming response
+ response = requests.post(
+ endpoint,
+ json=payload,
+ headers={"Content-Type": "application/json"},
+ timeout=timeout_wait,
+ )
+ response.raise_for_status()
+
+ result = response.json()
+
+ # Ensure the response has the expected format
+ if "choices" not in result:
+ raise ValueError("Invalid response format from inference-server")
+
+ return result
+
+ except requests.exceptions.RequestException as e:
+ raise ConnectionError(
+ f"Failed to connect to inference-server at {api_url}: {str(e)}"
+ )
+ except json.JSONDecodeError as e:
+ raise ValueError(f"Invalid JSON response from inference-server: {str(e)}")
+ except Exception as e:
+ raise RuntimeError(f"Error calling inference-server API: {str(e)}")
+
+
+###
+# LLM FUNCTIONS
+###
+
+
+def construct_gemini_generative_model(
+ in_api_key: str,
+ temperature: float,
+ model_choice: str,
+ system_prompt: str,
+ max_tokens: int,
+ random_seed=seed,
+) -> Tuple[object, dict]:
+ """
+ Constructs a GenerativeModel for Gemini API calls.
+ ...
+ """
+ # Construct a GenerativeModel
+ try:
+ if in_api_key:
+ # print("Getting API key from textbox")
+ api_key = in_api_key
+ client = ai.Client(api_key=api_key)
+ elif "GOOGLE_API_KEY" in os.environ:
+ # print("Searching for API key in environmental variables")
+ api_key = os.environ["GOOGLE_API_KEY"]
+ client = ai.Client(api_key=api_key)
+ else:
+ print("No Gemini API key found")
+ raise Warning("No Gemini API key found.")
+ except Exception as e:
+ print("Error constructing Gemini generative model:", e)
+ raise Warning("Error constructing Gemini generative model:", e)
+
+ config = types.GenerateContentConfig(
+ temperature=temperature, max_output_tokens=max_tokens, seed=random_seed
+ )
+
+ return client, config
+
+
+def construct_azure_client(in_api_key: str, endpoint: str) -> Tuple[object, dict]:
+ """
+ Constructs an OpenAI client for Azure/OpenAI AI Inference.
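+
+    Example:
+        # Illustrative only; the endpoint URL is a placeholder, and the key is read
+        # from AZURE_OPENAI_API_KEY when not passed in.
+        client, _ = construct_azure_client(
+            in_api_key="", endpoint="https://my-resource.openai.azure.com/openai/v1"
+        )
+        # The client can then be passed to send_request with model_source="Azure/OpenAI".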
+ """
+ try:
+ key = None
+ if in_api_key:
+ key = in_api_key
+ elif os.environ.get("AZURE_OPENAI_API_KEY"):
+ key = os.environ["AZURE_OPENAI_API_KEY"]
+ if not key:
+ raise Warning("No Azure/OpenAI API key found.")
+
+ if not endpoint:
+ endpoint = os.environ.get("AZURE_OPENAI_INFERENCE_ENDPOINT", "")
+ if not endpoint:
+ # Assume using OpenAI API
+ client = OpenAI(
+ api_key=key,
+ )
+ else:
+ # Use the provided endpoint
+ client = OpenAI(
+ api_key=key,
+ base_url=f"{endpoint}",
+ )
+
+ return client, dict()
+ except Exception as e:
+ print("Error constructing Azure/OpenAI client:", e)
+ raise
+
+
+def call_aws_bedrock(
+ prompt: str,
+ system_prompt: str,
+ temperature: float,
+ max_tokens: int,
+ model_choice: str,
+ bedrock_runtime: boto3.Session.client,
+ assistant_prefill: str = "",
+ max_retries: int = 5,
+ retry_delay_seconds: float = 2.0,
+) -> ResponseObject:
+ """
+ This function sends a request to AWS Bedrock with the following parameters:
+ - prompt: The user's input prompt to be processed by the model.
+ - system_prompt: A system-defined prompt that provides context or instructions for the model.
+ - temperature: A value that controls the randomness of the model's output, with higher values resulting in more diverse responses.
+ - max_tokens: The maximum number of tokens (words or characters) in the model's response.
+ - model_choice: The specific model to use for processing the request.
+ - bedrock_runtime: The client object for boto3 Bedrock runtime
+ - assistant_prefill: A string indicating the text that the response should start with.
+ - max_retries: Maximum number of retry attempts on failure (default 5).
+ - retry_delay_seconds: Delay in seconds between retries (default 2.0).
+
+ The function constructs the request configuration, invokes the model, extracts the response text, and returns a ResponseObject containing the text and metadata.
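+
+    Example:
+        # Illustrative only; the model ID below is an assumption and must be enabled
+        # in your AWS account.
+        bedrock_runtime = boto3.client("bedrock-runtime")
+        response = call_aws_bedrock(
+            prompt="List any names in this text: John Smith visited Leeds.",
+            system_prompt="You identify personally identifiable information.",
+            temperature=0.0,
+            max_tokens=512,
+            model_choice="anthropic.claude-3-haiku-20240307-v1:0",
+            bedrock_runtime=bedrock_runtime,
+        )
+        print(response.text, response.usage_metadata)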
+ """
+
+ inference_config = {
+ "maxTokens": max_tokens,
+ "temperature": temperature,
+ }
+
+ # Using an assistant prefill only works for Anthropic models.
+ if assistant_prefill and "anthropic" in model_choice:
+ assistant_prefill_added = True
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"text": prompt},
+ ],
+ },
+ {
+ "role": "assistant",
+ # Pre-filling with '|'
+ "content": [{"text": assistant_prefill}],
+ },
+ ]
+ else:
+ assistant_prefill_added = False
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"text": prompt},
+ ],
+ }
+ ]
+
+ system_prompt_list = [{"text": system_prompt}]
+
+ last_error = None
+ for attempt in range(1, max_retries + 1):
+ try:
+ # The converse API call.
+ api_response = bedrock_runtime.converse(
+ modelId=model_choice,
+ messages=messages,
+ system=system_prompt_list,
+ inferenceConfig=inference_config,
+ )
+
+ output_message = api_response["output"]["message"]
+
+ if "reasoningContent" in output_message["content"][0]:
+ # Extract the reasoning text
+ output_message["content"][0]["reasoningContent"]["reasoningText"][
+ "text"
+ ]
+
+ # Extract the output text
+ if assistant_prefill_added:
+ text = assistant_prefill + output_message["content"][1]["text"]
+ else:
+ text = output_message["content"][1]["text"]
+ else:
+ if assistant_prefill_added:
+ text = assistant_prefill + output_message["content"][0]["text"]
+ else:
+ text = output_message["content"][0]["text"]
+
+ # The usage statistics are neatly provided in the 'usage' key.
+ usage = api_response["usage"]
+
+            # The full API response metadata remains available under
+            # api_response["ResponseMetadata"] if it is ever needed.
+
+ # Create ResponseObject with the cleanly extracted data.
+ response = ResponseObject(text=text, usage_metadata=usage)
+
+ return response
+
+ except Exception as e:
+ last_error = e
+ if attempt < max_retries:
+ print(
+ f"Bedrock converse API attempt {attempt}/{max_retries} failed: {e}. "
+ f"Retrying in {retry_delay_seconds}s..."
+ )
+ time.sleep(retry_delay_seconds)
+ else:
+ raise RuntimeError(
+ f"Failed to call Bedrock API after {max_retries} attempts: {str(last_error)}"
+ ) from last_error
+
+
+def calculate_tokens_from_metadata(
+ metadata_string: str, model_choice: str, model_name_map: dict
+):
+ """
+ Calculate the number of input and output tokens for given queries based on metadata strings.
+
+ Args:
+ metadata_string (str): A string containing all relevant metadata from the string.
+ model_choice (str): A string describing the model name
+ model_name_map (dict): A dictionary mapping model name to source
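+
+    Example:
+        # Illustrative only: two recorded calls are summed into query totals.
+        metadata = (
+            "Batch 1: input_tokens: 120 output_tokens: 45 "
+            "Batch 2: input_tokens: 130 output_tokens: 55"
+        )
+        # Returns (250, 100, 2): total input tokens, total output tokens, call count.
+        calculate_tokens_from_metadata(metadata, "any-model", {})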
+ """
+
+    # Regex to find every input/output token count recorded in the metadata string.
+    # Summing all matches gives the totals across every LLM call in the query.
+ input_regex = r"input_tokens: (\d+)"
+ output_regex = r"output_tokens: (\d+)"
+
+ # re.findall returns a list of all matching strings (the captured groups).
+ input_token_strings = re.findall(input_regex, metadata_string)
+ output_token_strings = re.findall(output_regex, metadata_string)
+
+ # Convert the lists of strings to lists of integers and sum them up
+ total_input_tokens = sum([int(token) for token in input_token_strings])
+ total_output_tokens = sum([int(token) for token in output_token_strings])
+
+ number_of_calls = len(input_token_strings)
+
+ print(f"Found {number_of_calls} LLM call entries in metadata.")
+ print("-" * 20)
+ print(f"Total Input Tokens: {total_input_tokens}")
+ print(f"Total Output Tokens: {total_output_tokens}")
+
+ return total_input_tokens, total_output_tokens, number_of_calls
diff --git a/tools/load_spacy_model_custom_recognisers.py b/tools/load_spacy_model_custom_recognisers.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c746a9ab8490f1b956030049159217599aa1870
--- /dev/null
+++ b/tools/load_spacy_model_custom_recognisers.py
@@ -0,0 +1,974 @@
+from typing import List
+
+import spacy
+from presidio_analyzer import (
+ AnalyzerEngine,
+ EntityRecognizer,
+ Pattern,
+ PatternRecognizer,
+ RecognizerResult,
+)
+from presidio_analyzer.nlp_engine import (
+ NerModelConfiguration,
+ NlpArtifacts,
+ SpacyNlpEngine,
+)
+from spacy.matcher import Matcher
+from spaczz.matcher import FuzzyMatcher
+
+spacy.prefer_gpu()
+import os
+import re
+
+import gradio as gr
+import Levenshtein
+import requests
+from spacy.cli.download import download
+
+from tools.config import (
+ CUSTOM_ENTITIES,
+ DEFAULT_LANGUAGE,
+ SPACY_MODEL_PATH,
+ TESSERACT_DATA_FOLDER,
+)
+
+score_threshold = 0.001
+custom_entities = CUSTOM_ENTITIES
+
+
+# Create a class inheriting from SpacyNlpEngine
+class LoadedSpacyNlpEngine(SpacyNlpEngine):
+ def __init__(self, loaded_spacy_model, language_code: str):
+ super().__init__(
+ ner_model_configuration=NerModelConfiguration(
+ labels_to_ignore=["CARDINAL", "ORDINAL"]
+ )
+ ) # Ignore non-relevant labels
+ self.nlp = {language_code: loaded_spacy_model}
+
+
+def _base_language_code(language: str) -> str:
+ lang = _normalize_language_input(language)
+ if "_" in lang:
+ return lang.split("_")[0]
+ return lang
+
+
+def load_spacy_model(language: str = DEFAULT_LANGUAGE):
+ """
+ Load a spaCy model for the requested language and return it as `nlp`.
+
+ Accepts common inputs like: "en", "en_lg", "en_sm", "de", "fr", "es", "it", "nl", "pt", "zh", "ja", "xx".
+ Falls back through sensible candidates and will download if missing.
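+
+    Example:
+        # Illustrative only.
+        nlp = load_spacy_model("fr")  # tries fr_core_news_lg, then _md, then _sm
+        doc = nlp("Jean habite à Paris.")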
+ """
+
+ # Set spaCy data path for custom model storage (only if specified)
+ import os
+
+ if SPACY_MODEL_PATH and SPACY_MODEL_PATH.strip():
+ os.environ["SPACY_DATA"] = SPACY_MODEL_PATH
+ print(f"Setting spaCy model path to: {SPACY_MODEL_PATH}")
+ else:
+ print("Using default spaCy model storage location")
+
+ synonyms = {
+ "english": "en",
+ "catalan": "ca",
+ "danish": "da",
+ "german": "de",
+ "french": "fr",
+ "greek": "el",
+ "finnish": "fi",
+ "croatian": "hr",
+ "lithuanian": "lt",
+ "macedonian": "mk",
+ "norwegian_bokmaal": "nb",
+ "polish": "pl",
+ "russian": "ru",
+ "slovenian": "sl",
+ "swedish": "sv",
+ "dutch": "nl",
+ "portuguese": "pt",
+ "chinese": "zh",
+ "japanese": "ja",
+ "multilingual": "xx",
+ }
+
+ lang_norm = _normalize_language_input(language)
+ lang_norm = synonyms.get(lang_norm, lang_norm)
+ base_lang = _base_language_code(lang_norm)
+
+ candidates_by_lang = {
+ # English - prioritize lg, then trf, then md, then sm
+ "en": [
+ "en_core_web_lg",
+ "en_core_web_trf",
+ "en_core_web_md",
+ "en_core_web_sm",
+ ],
+ "en_lg": ["en_core_web_lg"],
+ "en_trf": ["en_core_web_trf"],
+ "en_md": ["en_core_web_md"],
+ "en_sm": ["en_core_web_sm"],
+ # Major languages (news pipelines) - prioritize lg, then md, then sm
+ "ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
+ "da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
+ "de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
+ "el": ["el_core_news_lg", "el_core_news_md", "el_core_news_sm"], # Greek
+ "es": ["es_core_news_lg", "es_core_news_md", "es_core_news_sm"], # Spanish
+ "fi": ["fi_core_news_lg", "fi_core_news_md", "fi_core_news_sm"], # Finnish
+ "fr": ["fr_core_news_lg", "fr_core_news_md", "fr_core_news_sm"], # French
+ "hr": ["hr_core_news_lg", "hr_core_news_md", "hr_core_news_sm"], # Croatian
+ "it": ["it_core_news_lg", "it_core_news_md", "it_core_news_sm"], # Italian
+ "ja": ["ja_core_news_lg", "ja_core_news_md", "ja_core_news_sm"], # Japanese
+ "ko": ["ko_core_news_lg", "ko_core_news_md", "ko_core_news_sm"], # Korean
+ "lt": ["lt_core_news_lg", "lt_core_news_md", "lt_core_news_sm"], # Lithuanian
+ "mk": ["mk_core_news_lg", "mk_core_news_md", "mk_core_news_sm"], # Macedonian
+ "nb": [
+ "nb_core_news_lg",
+ "nb_core_news_md",
+ "nb_core_news_sm",
+ ], # Norwegian Bokmål
+ "nl": ["nl_core_news_lg", "nl_core_news_md", "nl_core_news_sm"], # Dutch
+ "pl": ["pl_core_news_lg", "pl_core_news_md", "pl_core_news_sm"], # Polish
+ "pt": ["pt_core_news_lg", "pt_core_news_md", "pt_core_news_sm"], # Portuguese
+ "ro": ["ro_core_news_lg", "ro_core_news_md", "ro_core_news_sm"], # Romanian
+ "ru": ["ru_core_news_lg", "ru_core_news_md", "ru_core_news_sm"], # Russian
+ "sl": ["sl_core_news_lg", "sl_core_news_md", "sl_core_news_sm"], # Slovenian
+ "sv": ["sv_core_news_lg", "sv_core_news_md", "sv_core_news_sm"], # Swedish
+ "uk": ["uk_core_news_lg", "uk_core_news_md", "uk_core_news_sm"], # Ukrainian
+ "zh": [
+ "zh_core_web_lg",
+ "zh_core_web_mod",
+ "zh_core_web_sm",
+ "zh_core_web_trf",
+ ], # Chinese
+ # Multilingual NER
+ "xx": ["xx_ent_wiki_sm"],
+ }
+
+ if lang_norm in candidates_by_lang:
+ candidates = candidates_by_lang[lang_norm]
+ elif base_lang in candidates_by_lang:
+ candidates = candidates_by_lang[base_lang]
+ else:
+ # Fallback to multilingual if unknown
+ candidates = candidates_by_lang["xx"]
+
+ last_error = None
+ if language != "en":
+ print(
+ f"Attempting to load spaCy model for language '{language}' with candidates: {candidates}"
+ )
+ print(
+ "Note: Models are prioritized by size (lg > md > sm) - will stop after first successful load"
+ )
+
+ for i, candidate in enumerate(candidates):
+ if language != "en":
+ print(f"Trying candidate {i+1}/{len(candidates)}: {candidate}")
+
+ # Try importable package first (fast-path when installed as a package)
+ try:
+ module = __import__(candidate)
+ print(f"✓ Successfully imported spaCy model: {candidate}")
+ return module.load()
+ except Exception as e:
+ last_error = e
+
+ # Try spacy.load if package is linked/installed
+ try:
+ nlp = spacy.load(candidate)
+ print(f"✓ Successfully loaded spaCy model via spacy.load: {candidate}")
+ return nlp
+ except OSError:
+ # Model not found, proceed with download
+ print(f"Model {candidate} not found, attempting to download...")
+ try:
+ download(candidate)
+ print(f"✓ Successfully downloaded spaCy model: {candidate}")
+
+ # Refresh spaCy's model registry after download
+ import importlib
+ import sys
+
+ importlib.reload(spacy)
+
+ # Clear any cached imports that might interfere
+ if candidate in sys.modules:
+ del sys.modules[candidate]
+
+ # Small delay to ensure model is fully registered
+ import time
+
+ time.sleep(0.5)
+
+ # Try to load the downloaded model
+ nlp = spacy.load(candidate)
+ print(f"✓ Successfully loaded downloaded spaCy model: {candidate}")
+ return nlp
+ except Exception as download_error:
+ print(f"✗ Failed to download or load {candidate}: {download_error}")
+ # Try alternative loading methods
+ try:
+ # Try importing the module directly after download
+ module = __import__(candidate)
+ print(
+ f"✓ Successfully loaded {candidate} via direct import after download"
+ )
+ return module.load()
+ except Exception as import_error:
+ print(f"✗ Direct import also failed: {import_error}")
+
+ # Try one more approach - force spaCy to refresh its model registry
+ try:
+                        from spacy.util import get_package_path
+
+                        model_path = get_package_path(candidate)
+ if model_path and os.path.exists(model_path):
+ print(f"Found model at path: {model_path}")
+ nlp = spacy.load(model_path)
+ print(
+ f"✓ Successfully loaded {candidate} from path: {model_path}"
+ )
+ return nlp
+ except Exception as path_error:
+ print(f"✗ Path-based loading also failed: {path_error}")
+
+ last_error = download_error
+ continue
+ except Exception as e:
+ print(f"✗ Failed to load {candidate}: {e}")
+ last_error = e
+ continue
+
+ # Provide more helpful error message
+ error_msg = f"Failed to load spaCy model for language '{language}'"
+ if last_error:
+ error_msg += f". Last error: {last_error}"
+ error_msg += f". Tried candidates: {candidates}"
+
+ raise RuntimeError(error_msg)
+
+
+# Normalise language input strings (lowercase, hyphens to underscores)
+def _normalize_language_input(language: str) -> str:
+ return language.strip().lower().replace("-", "_")
+
+
+# Update the global variables to use the new function
+ACTIVE_LANGUAGE_CODE = _base_language_code(DEFAULT_LANGUAGE)
+nlp = None # Placeholder, will be loaded in the create_nlp_analyser function below #load_spacy_model(DEFAULT_LANGUAGE)
+
+
+def get_tesseract_lang_code(short_code: str):
+ """
+ Maps a two-letter language code to the corresponding Tesseract OCR code.
+
+ Args:
+ short_code (str): The two-letter language code (e.g., "en", "de").
+
+ Returns:
+ str or None: The Tesseract language code (e.g., "eng", "deu"),
+ or None if no mapping is found.
+ """
+ # Mapping from 2-letter codes to Tesseract 3-letter codes
+ # Based on ISO 639-2/T codes.
+ lang_map = {
+ "en": "eng",
+ "de": "deu",
+ "fr": "fra",
+ "es": "spa",
+ "it": "ita",
+ "nl": "nld",
+ "pt": "por",
+ "zh": "chi_sim", # Mapping to Simplified Chinese by default
+ "ja": "jpn",
+ "ko": "kor",
+ "lt": "lit",
+ "mk": "mkd",
+ "nb": "nor",
+ "pl": "pol",
+ "ro": "ron",
+ "ru": "rus",
+ "sl": "slv",
+ "sv": "swe",
+ "uk": "ukr",
+ }
+
+ return lang_map.get(short_code)
+
+
+def download_tesseract_lang_pack(
+ short_lang_code: str, tessdata_dir=TESSERACT_DATA_FOLDER
+):
+ """
+ Downloads a Tesseract language pack to a local directory.
+
+ Args:
+        short_lang_code (str): The two-letter language code (e.g., "en", "fr").
+        tessdata_dir (str, optional): The directory to save the language pack.
+            Defaults to TESSERACT_DATA_FOLDER.
+ """
+
+ # Create the directory if it doesn't exist
+ if not os.path.exists(tessdata_dir):
+ os.makedirs(tessdata_dir)
+
+ # Get the Tesseract language code
+ lang_code = get_tesseract_lang_code(short_lang_code)
+
+ if lang_code is None:
+ raise ValueError(
+ f"Language code {short_lang_code} not found in Tesseract language map"
+ )
+
+ # Set the local file path
+ file_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
+
+ # Check if the file already exists
+ if os.path.exists(file_path):
+ print(f"Language pack {lang_code}.traineddata already exists at {file_path}")
+ return file_path
+
+ # Construct the URL for the language pack
+ url = f"https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/{lang_code}.traineddata"
+
+ # Download the file
+ try:
+ response = requests.get(url, stream=True, timeout=60)
+ response.raise_for_status() # Raise an exception for bad status codes
+
+ with open(file_path, "wb") as f:
+ for chunk in response.iter_content(chunk_size=8192):
+ f.write(chunk)
+
+ print(f"Successfully downloaded {lang_code}.traineddata to {file_path}")
+ return file_path
+
+ except requests.exceptions.RequestException as e:
+ print(f"Error downloading {lang_code}.traineddata: {e}")
+ return None
+
+
+#### Custom recognisers
+def _is_regex_pattern(term: str) -> bool:
+ """
+ Detect if a term is intended to be a regex pattern or a literal string.
+
+ Args:
+ term: The term to check
+
+ Returns:
+ True if the term appears to be a regex pattern, False if it's a literal string
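+
+    Example:
+        # Illustrative only.
+        _is_regex_pattern(r"\d{3}-\d{4}")  # True: regex escapes and quantifiers
+        _is_regex_pattern("John Smith")    # False: plain literal text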
+ """
+ term = term.strip()
+ if not term:
+ return False
+
+ # First, try to compile as regex to validate it
+ # This catches patterns like \d\d\d-\d\d\d that use regex escape sequences
+ try:
+ re.compile(term)
+ is_valid_regex = True
+ except re.error:
+ # If it doesn't compile as regex, treat as literal
+ return False
+
+ # If it compiles, check if it contains regex-like features
+ # Regex metacharacters that suggest a pattern (excluding escaped literals)
+ regex_metacharacters = [
+ "+",
+ "*",
+ "?",
+ "{",
+ "}",
+ "[",
+ "]",
+ "(",
+ ")",
+ "|",
+ "^",
+ "$",
+ ".",
+ ]
+
+ # Common regex escape sequences that indicate regex intent
+ regex_escape_sequences = [
+ "\\d",
+ "\\w",
+ "\\s",
+ "\\D",
+ "\\W",
+ "\\S",
+ "\\b",
+ "\\B",
+ "\\n",
+ "\\t",
+ "\\r",
+ ]
+
+ # Check if term contains regex metacharacters or escape sequences
+ has_metacharacters = False
+ has_escape_sequences = False
+
+ i = 0
+ while i < len(term):
+ if term[i] == "\\" and i + 1 < len(term):
+ # Check if it's a regex escape sequence
+ escape_seq = term[i : i + 2]
+ if escape_seq in regex_escape_sequences:
+ has_escape_sequences = True
+ # Skip the escape sequence (backslash + next char)
+ i += 2
+ continue
+ if term[i] in regex_metacharacters:
+ has_metacharacters = True
+ i += 1
+
+ # If it's a valid regex and contains regex features, treat as regex pattern
+ if is_valid_regex and (has_metacharacters or has_escape_sequences):
+ return True
+
+ # If it compiles but has no regex features, it might be a literal that happens to compile
+ # (e.g., "test" compiles as regex but is just literal text)
+ # In this case, if it has escape sequences, it's definitely regex
+ if has_escape_sequences:
+ return True
+
+ # Otherwise, treat as literal
+ return False
+
+
+def custom_word_list_recogniser(custom_list: List[str] = list()):
+ # Create regex pattern, handling quotes carefully
+ # Supports both literal strings and regex patterns
+
+ quote_str = '"'
+    replace_str = '(?:"|“|”)'
+
+ regex_patterns = []
+ literal_patterns = []
+
+ # Separate regex patterns from literal strings
+ for term in custom_list:
+ term = term.strip()
+ if not term:
+ continue
+
+ if _is_regex_pattern(term):
+ # Use regex pattern as-is (but wrap with word boundaries if appropriate)
+ # Note: Word boundaries might not be appropriate for all regex patterns
+ # (e.g., email patterns), so we'll add them conditionally
+ regex_patterns.append(term)
+ else:
+ # Escape literal strings and add word boundaries
+ escaped_term = re.escape(term).replace(quote_str, replace_str)
+ literal_patterns.append(rf"(? str:
+ """
+ Extracts the street name and preceding word (that should contain at least one number) from the given text.
+
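+    Example (illustrative):
+        starts, ends = extract_street_name("I live at 221b Baker Street in London")
+        # One match covering "221b Baker Street"; the lists hold its start/end character positions.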
+ """
+
+ street_types = [
+ "Street",
+ "St",
+ "Boulevard",
+ "Blvd",
+ "Highway",
+ "Hwy",
+ "Broadway",
+ "Freeway",
+ "Causeway",
+ "Cswy",
+ "Expressway",
+ "Way",
+ "Walk",
+ "Lane",
+ "Ln",
+ "Road",
+ "Rd",
+ "Avenue",
+ "Ave",
+ "Circle",
+ "Cir",
+ "Cove",
+ "Cv",
+ "Drive",
+ "Dr",
+ "Parkway",
+ "Pkwy",
+ "Park",
+ "Court",
+ "Ct",
+ "Square",
+ "Sq",
+ "Loop",
+ "Place",
+ "Pl",
+ "Parade",
+ "Estate",
+ "Alley",
+ "Arcade",
+ "Avenue",
+ "Ave",
+ "Bay",
+ "Bend",
+ "Brae",
+ "Byway",
+ "Close",
+ "Corner",
+ "Cove",
+ "Crescent",
+ "Cres",
+ "Cul-de-sac",
+ "Dell",
+ "Drive",
+ "Dr",
+ "Esplanade",
+ "Glen",
+ "Green",
+ "Grove",
+ "Heights",
+ "Hts",
+ "Mews",
+ "Parade",
+ "Path",
+ "Piazza",
+ "Promenade",
+ "Quay",
+ "Ridge",
+ "Row",
+ "Terrace",
+ "Ter",
+ "Track",
+ "Trail",
+ "View",
+ "Villas",
+ "Marsh",
+ "Embankment",
+ "Cut",
+ "Hill",
+ "Passage",
+ "Rise",
+ "Vale",
+ "Side",
+ ]
+
+ # Dynamically construct the regex pattern with all possible street types
+ street_types_pattern = "|".join(
+ rf"{re.escape(street_type)}" for street_type in street_types
+ )
+
+ # The overall regex pattern to capture the street name and preceding word(s)
+
+ pattern = r"(?P\w*\d\w*)\s*"
+ pattern += rf"(?P\w+\s*\b(?:{street_types_pattern})\b)"
+
+ # Find all matches in text
+ matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
+
+ start_positions = list()
+ end_positions = list()
+
+ for match in matches:
+ match.group("preceding_word").strip()
+ match.group("street_name").strip()
+ start_pos = match.start()
+ end_pos = match.end()
+ # print(f"Start: {start_pos}, End: {end_pos}")
+ # print(f"Preceding words: {preceding_word}")
+ # print(f"Street name: {street_name}")
+
+ start_positions.append(start_pos)
+ end_positions.append(end_pos)
+
+ return start_positions, end_positions
+
+
+class StreetNameRecognizer(EntityRecognizer):
+
+ def load(self) -> None:
+ """No loading is required."""
+ pass
+
+ def analyze(
+ self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
+ ) -> List[RecognizerResult]:
+ """
+ Logic for detecting a specific PII
+ """
+
+ start_pos, end_pos = extract_street_name(text)
+
+ results = list()
+
+ for i in range(0, len(start_pos)):
+
+ result = RecognizerResult(
+ entity_type="STREETNAME", start=start_pos[i], end=end_pos[i], score=1
+ )
+
+ results.append(result)
+
+ return results
+
+
+street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
+
+
+## Custom fuzzy match recogniser for list of strings
+def custom_fuzzy_word_list_regex(text: str, custom_list: List[str] = list()):
+ # Create regex pattern, handling quotes carefully
+
+ quote_str = '"'
+    replace_str = '(?:"|“|”)'
+
+ custom_regex_pattern = "|".join(
+ rf"(? None:
+ """No loading is required."""
+ pass
+
+ def analyze(
+ self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
+ ) -> List[RecognizerResult]:
+ """
+ Logic for detecting a specific PII
+ """
+ start_pos, end_pos = spacy_fuzzy_search(
+ text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase
+ ) # Pass new parameters
+
+ results = list()
+
+ for i in range(0, len(start_pos)):
+ result = RecognizerResult(
+ entity_type="CUSTOM_FUZZY", start=start_pos[i], end=end_pos[i], score=1
+ )
+ results.append(result)
+
+ return results
+
+
+custom_list_default = list()
+custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
+ supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default
+)
+
+# Pass the loaded model to the new LoadedSpacyNlpEngine
+loaded_nlp_engine = LoadedSpacyNlpEngine(
+ loaded_spacy_model=nlp, language_code=ACTIVE_LANGUAGE_CODE
+)
+
+
+def create_nlp_analyser(
+ language: str = DEFAULT_LANGUAGE,
+ custom_list: List[str] = None,
+ spelling_mistakes_max: int = 1,
+ search_whole_phrase: bool = True,
+ existing_nlp_analyser: AnalyzerEngine = None,
+ return_also_model: bool = False,
+):
+ """
+ Create an nlp_analyser object based on the specified language input.
+
+ Args:
+ language (str): Language code (e.g., "en", "de", "fr", "es", etc.)
+ custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
+ spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
+ search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
+ existing_nlp_analyser (AnalyzerEngine, optional): Existing nlp_analyser object to use. Defaults to None.
+ return_also_model (bool, optional): Whether to return the nlp_model object as well. Defaults to False.
+
+ Returns:
+ AnalyzerEngine: Configured nlp_analyser object with custom recognizers
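+
+    Example:
+        # Illustrative only.
+        analyser = create_nlp_analyser("en", custom_list=["Project Alpha"])
+        results = analyser.analyze(
+            text="Project Alpha is run by John Smith", language="en"
+        )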
+ """
+
+    if (
+        existing_nlp_analyser is not None
+        and existing_nlp_analyser.supported_languages[0] == language
+    ):
+        print(f"Using existing nlp_analyser for {language}")
+        return existing_nlp_analyser
+
+ # Load spaCy model for the specified language
+ nlp_model = load_spacy_model(language)
+
+ # Get base language code
+ base_lang_code = _base_language_code(language)
+
+ # Create custom recognizers
+ if custom_list is None:
+ custom_list = list()
+
+ custom_recogniser = custom_word_list_recogniser(custom_list)
+ custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
+ supported_entities=["CUSTOM_FUZZY"],
+ custom_list=custom_list,
+ spelling_mistakes_max=spelling_mistakes_max,
+ search_whole_phrase=search_whole_phrase,
+ )
+
+ # Create NLP engine with loaded model
+ loaded_nlp_engine = LoadedSpacyNlpEngine(
+ loaded_spacy_model=nlp_model, language_code=base_lang_code
+ )
+
+ # Create analyzer engine
+ nlp_analyser = AnalyzerEngine(
+ nlp_engine=loaded_nlp_engine,
+ default_score_threshold=score_threshold,
+ supported_languages=[base_lang_code],
+ log_decision_process=False,
+ )
+
+ # Add custom recognizers to nlp_analyser
+ nlp_analyser.registry.add_recognizer(custom_recogniser)
+ nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
+
+ # Add language-specific recognizers for English
+ if base_lang_code == "en":
+ nlp_analyser.registry.add_recognizer(street_recogniser)
+ nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
+ nlp_analyser.registry.add_recognizer(titles_recogniser)
+
+ if return_also_model:
+ return nlp_analyser, nlp_model
+
+ return nlp_analyser
+
+
+# Create the default nlp_analyser using the new function
+nlp_analyser, nlp = create_nlp_analyser(DEFAULT_LANGUAGE, return_also_model=True)
+
+
+def spacy_fuzzy_search(
+ text: str,
+ custom_query_list: List[str] = list(),
+ spelling_mistakes_max: int = 1,
+ search_whole_phrase: bool = True,
+ nlp=nlp,
+ progress=gr.Progress(track_tqdm=True),
+):
+ """Conduct fuzzy match on a list of text data."""
+
+ all_matches = list()
+ all_start_positions = list()
+ all_end_positions = list()
+ all_ratios = list()
+
+ # print("custom_query_list:", custom_query_list)
+
+ if not text:
+ out_message = "No text data found. Skipping page."
+ print(out_message)
+ return all_start_positions, all_end_positions
+
+ for string_query in custom_query_list:
+
+ query = nlp(string_query)
+
+ if search_whole_phrase is False:
+ # Keep only words that are not stop words
+ token_query = [
+ token.text
+ for token in query
+ if not token.is_space and not token.is_stop and not token.is_punct
+ ]
+
+ spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+ if len(token_query) > 1:
+ # pattern_lemma = [{"LEMMA": {"IN": query}}]
+ pattern_fuzz = [
+ {"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}
+ ]
+ else:
+ # pattern_lemma = [{"LEMMA": query[0]}]
+ pattern_fuzz = [
+ {"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}
+ ]
+
+ matcher = Matcher(nlp.vocab)
+ matcher.add(string_query, [pattern_fuzz])
+ # matcher.add(string_query, [pattern_lemma])
+
+ else:
+ # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
+ # If you want to match the whole phrase, use phrase matcher
+ matcher = FuzzyMatcher(nlp.vocab)
+ patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
+ matcher.add("PHRASE", patterns, [{"ignore_case": True}])
+
+ batch_size = 256
+ docs = nlp.pipe([text], batch_size=batch_size)
+
+ # Get number of matches per doc
+ for doc in docs: # progress.tqdm(docs, desc = "Searching text", unit = "rows"):
+ matches = matcher(doc)
+ match_count = len(matches)
+
+ # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
+ if search_whole_phrase is False:
+ all_matches.append(match_count)
+
+ for match_id, start, end in matches:
+ span = str(doc[start:end]).strip()
+ query_search = str(query).strip()
+
+ # Convert word positions to character positions
+ start_char = doc[start].idx # Start character position
+ end_char = doc[end - 1].idx + len(
+ doc[end - 1]
+ ) # End character position
+
+ # The positions here are word position, not character position
+ all_matches.append(match_count)
+ all_start_positions.append(start_char)
+ all_end_positions.append(end_char)
+
+ else:
+ for match_id, start, end, ratio, pattern in matches:
+ span = str(doc[start:end]).strip()
+ query_search = str(query).strip()
+
+ # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
+ distance = Levenshtein.distance(query_search.lower(), span.lower())
+
+ # print("Levenshtein distance:", distance)
+
+ if distance > spelling_mistakes_max:
+ match_count = match_count - 1
+ else:
+ # Convert word positions to character positions
+ start_char = doc[start].idx # Start character position
+ end_char = doc[end - 1].idx + len(
+ doc[end - 1]
+ ) # End character position
+
+ all_matches.append(match_count)
+ all_start_positions.append(start_char)
+ all_end_positions.append(end_char)
+ all_ratios.append(ratio)
+
+ return all_start_positions, all_end_positions
diff --git a/tools/presidio_analyzer_custom.py b/tools/presidio_analyzer_custom.py
new file mode 100644
index 0000000000000000000000000000000000000000..560701fbf6c6f68f493039c38a829119931693d3
--- /dev/null
+++ b/tools/presidio_analyzer_custom.py
@@ -0,0 +1,142 @@
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
+
+import gradio as gr
+
+# from tqdm import tqdm
+from presidio_analyzer import DictAnalyzerResult, RecognizerResult
+from presidio_analyzer.nlp_engine import NlpArtifacts
+
+
+def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
+ """
+ Create RecognizerResult from a dictionary.
+
+    :param data: e.g. {
+        "Type": "NAME",
+        "BeginOffset": 24,
+        "EndOffset": 32,
+        "Score": 0.8
+    }
+ :return: RecognizerResult
+ """
+
+ entity_type = data.get("Type")
+ start = data.get("BeginOffset")
+ end = data.get("EndOffset")
+ score = data.get("Score")
+ analysis_explanation = None
+ recognition_metadata = None
+
+ return RecognizerResult(
+ entity_type, start, end, score, analysis_explanation, recognition_metadata
+ )
+
+
+def analyze_iterator_custom(
+ self,
+ texts: Iterable[Union[str, bool, float, int]],
+ language: str,
+ list_length: int,
+ progress=gr.Progress(),
+ **kwargs,
+) -> List[List[RecognizerResult]]:
+ """
+ Analyze an iterable of strings.
+
+    :param texts: A list of strings to be analyzed.
+ :param language: Input language
+ :param list_length: Length of the input list.
+ :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
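+
+    Example:
+        # Illustrative only; `batch_analyzer` is assumed to be a presidio
+        # BatchAnalyzerEngine instance passed explicitly as `self`.
+        results = analyze_iterator_custom(
+            batch_analyzer,
+            texts=["John lives in Leeds", "no pii here"],
+            language="en",
+            list_length=2,
+        )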
+ """
+
+ # validate types
+ texts = self._validate_types(texts)
+
+ # Process the texts as batch for improved performance
+ nlp_artifacts_batch: Iterator[Tuple[str, NlpArtifacts]] = (
+ self.analyzer_engine.nlp_engine.process_batch(texts=texts, language=language)
+ )
+
+ list_results = list()
+
+ # Uncomment this if you want to show progress within a file
+ # for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total = list_length, desc = "Analysing text for personal information", unit = "rows"):
+ for text, nlp_artifacts in nlp_artifacts_batch:
+ results = self.analyzer_engine.analyze(
+ text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
+ )
+
+ list_results.append(results)
+
+ return list_results
+
+
+def analyze_dict(
+ self,
+ input_dict: Dict[str, Union[Any, Iterable[Any]]],
+ language: str,
+ keys_to_skip: Optional[List[str]] = None,
+ **kwargs,
+) -> Iterator[DictAnalyzerResult]:
+ """
+ Analyze a dictionary of keys (strings) and values/iterable of values.
+
+ Non-string values are returned as is.
+
+ :param input_dict: The input dictionary for analysis
+ :param language: Input language
+ :param keys_to_skip: Keys to ignore during analysis
+ :param kwargs: Additional keyword arguments
+ for the `AnalyzerEngine.analyze` method.
+ Use this to pass arguments to the analyze method,
+ such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
+ See `AnalyzerEngine.analyze` for the full list.
+ """
+
+ context = list()
+ if "context" in kwargs:
+ context = kwargs["context"]
+ del kwargs["context"]
+
+ if not keys_to_skip:
+ keys_to_skip = list()
+
+ for key, value in input_dict.items():
+ if not value or key in keys_to_skip:
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
+ continue # skip this key as requested
+
+ # Add the key as an additional context
+ specific_context = context[:]
+ specific_context.append(key)
+
+ if type(value) in (str, int, bool, float):
+ results: List[RecognizerResult] = self.analyzer_engine.analyze(
+ text=str(value), language=language, context=[key], **kwargs
+ )
+ elif isinstance(value, dict):
+ new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
+ results = self.analyze_dict(
+ input_dict=value,
+ language=language,
+ context=specific_context,
+ keys_to_skip=new_keys_to_skip,
+ **kwargs,
+ )
+ elif isinstance(value, Iterable):
+ # Analyse every item in the iterable (e.g. a list of cell values) as a batch
+ list_length = len(value)
+
+ results: List[List[RecognizerResult]] = analyze_iterator_custom(
+ self,
+ texts=value,
+ language=language,
+ context=specific_context,
+ list_length=list_length,
+ **kwargs,
+ )
+ else:
+ raise ValueError(f"type {type(value)} is unsupported.")
+
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)
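+
+
+ # Minimal usage sketch (illustrative only): these helpers are written to be bound onto a
+ # presidio BatchAnalyzerEngine instance (they take `self` as the first argument), e.g.:
+ # from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
+ # batch_analyzer = BatchAnalyzerEngine(analyzer_engine=AnalyzerEngine())
+ # results = list(analyze_dict(batch_analyzer, {"comments": ["John lives in London"]}, language="en"))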
diff --git a/tools/quickstart.py b/tools/quickstart.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c3448cb266b0661da8042fade4ee2998dfdddb7
--- /dev/null
+++ b/tools/quickstart.py
@@ -0,0 +1,934 @@
+"""Helper functions for the quickstart walkthrough in the redaction app."""
+
+import os
+
+import gradio as gr
+import pandas as pd
+
+from tools.config import (
+ AWS_LLM_PII_OPTION,
+ AWS_PII_OPTION,
+ CHOSEN_COMPREHEND_ENTITIES,
+ CHOSEN_LLM_ENTITIES,
+ CHOSEN_REDACT_ENTITIES,
+ DEFAULT_PII_DETECTION_MODEL,
+ INFERENCE_SERVER_PII_OPTION,
+ LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION,
+ LOCAL_PII_OPTION,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ NO_REDACTION_PII_OPTION,
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS,
+ SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS,
+ SHOW_OCR_GUI_OPTIONS,
+ SHOW_PII_IDENTIFICATION_OPTIONS,
+ TEXTRACT_TEXT_EXTRACT_OPTION,
+)
+from tools.helper_functions import put_columns_in_df
+
+
+def is_data_file_type_walkthrough(files):
+ """Check if files are data file types (xlsx, xls, csv, parquet, docx)."""
+ if not files:
+ return False
+ data_file_extensions = {".xlsx", ".xls", ".csv", ".parquet", ".docx"}
+ for file in files:
+ if file:
+ file_path = file.name if hasattr(file, "name") else str(file)
+ file_ext = os.path.splitext(file_path)[1].lower()
+ if file_ext in data_file_extensions:
+ return True
+ return False
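+
+
+ # For example (illustrative only): is_data_file_type_walkthrough(["survey.xlsx"]) -> True,
+ # while is_data_file_type_walkthrough(["scan.pdf"]) -> False.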
+
+
+def route_walkthrough_files(files):
+ """Route files from walkthrough to appropriate component and determine if data file.
+
+ Also returns visibility updates for the Step 2 text extraction components: when the
+ upload is a data file (CSV/Excel) they are hidden; when it is a document, the radio
+ follows SHOW_OCR_GUI_OPTIONS and the accordions are left unchanged.
+ """
+ if not files:
+ show_text_extract = SHOW_OCR_GUI_OPTIONS
+ return (
+ None,
+ None,
+ False,
+ gr.Walkthrough(selected=2),
+ gr.update(visible=show_text_extract),
+ gr.update(),
+ gr.update(),
+ )
+
+ is_data = is_data_file_type_walkthrough(files)
+ doc_files = []
+ data_files = []
+
+ data_file_extensions = {".xlsx", ".xls", ".csv", ".parquet", ".docx"}
+
+ for file in files:
+ if file:
+ file_path = file.name if hasattr(file, "name") else str(file)
+ file_ext = os.path.splitext(file_path)[1].lower()
+ if file_ext in data_file_extensions:
+ data_files.append(file)
+ else:
+ doc_files.append(file)
+
+ # Hide text extraction options on step 2 when CSV/Excel (data file) was uploaded
+ show_text_extract = (not is_data) and SHOW_OCR_GUI_OPTIONS
+ if is_data:
+ text_extract_updates = (
+ gr.update(visible=False),
+ gr.update(visible=False),
+ gr.update(visible=False),
+ )
+ else:
+ # Document: show radio if enabled; leave accordions unchanged (they follow radio selection)
+ text_extract_updates = (
+ gr.update(visible=show_text_extract),
+ gr.update(),
+ gr.update(),
+ )
+
+ if is_data:
+ return None, data_files, True, gr.Walkthrough(selected=2), *text_extract_updates
+ else:
+ return doc_files, None, False, gr.Walkthrough(selected=2), *text_extract_updates
+
+
+def handle_step_2_next(
+ files,
+ is_data_file,
+ walkthrough_colnames_val,
+ walkthrough_excel_sheets_val,
+ text_extract_method_val,
+):
+ """Handle step 2 next button - populate dropdowns if data files and sync with main components."""
+ # Show text extraction method radio in Step 2 if SHOW_OCR_GUI_OPTIONS is True
+ show_text_extract_method = SHOW_OCR_GUI_OPTIONS
+
+ if is_data_file and files:
+ # Use put_columns_in_df to populate dropdowns
+ colnames_dropdown, excel_sheets_dropdown = put_columns_in_df(files)
+ # Use the selected values from walkthrough if available, otherwise use the populated values
+ if (
+ walkthrough_colnames_val
+ and len(walkthrough_colnames_val) > 0
+ and walkthrough_colnames_val[0] != "Choose columns to anonymise"
+ ):
+ main_colnames_update = gr.Dropdown(value=walkthrough_colnames_val)
+ else:
+ main_colnames_update = colnames_dropdown
+
+ if (
+ walkthrough_excel_sheets_val
+ and len(walkthrough_excel_sheets_val) > 0
+ and walkthrough_excel_sheets_val[0] != "Choose Excel sheets to anonymise"
+ ):
+ main_excel_sheets_update = gr.Dropdown(
+ value=walkthrough_excel_sheets_val, visible=True
+ )
+ else:
+ main_excel_sheets_update = excel_sheets_dropdown
+
+ # Preserve user's column selection in walkthrough_colnames; do not overwrite with colnames_dropdown
+ # (colnames_dropdown defaults its value to all columns, which would reset a narrower selection back to everything)
+ if (
+ walkthrough_colnames_val
+ and len(walkthrough_colnames_val) > 0
+ and walkthrough_colnames_val[0] != "Choose columns to anonymise"
+ ):
+ walkthrough_colnames_update = gr.update(
+ value=walkthrough_colnames_val, visible=True
+ )
+ else:
+ walkthrough_colnames_update = colnames_dropdown
+
+ # Preserve user's sheet selection in walkthrough_excel_sheets when they have made one
+ if (
+ walkthrough_excel_sheets_val
+ and len(walkthrough_excel_sheets_val) > 0
+ and walkthrough_excel_sheets_val[0] != "Choose Excel sheets to anonymise"
+ ):
+ walkthrough_excel_sheets_update = gr.update(
+ value=walkthrough_excel_sheets_val, visible=True
+ )
+ else:
+ walkthrough_excel_sheets_update = excel_sheets_dropdown
+
+ # Return updates for both walkthrough and main components, and advance walkthrough
+ # Note: walkthrough_local_ocr_method_radio and walkthrough_handwrite_signature_checkbox visibility
+ # are controlled by event handler on walkthrough_text_extract_method_radio
+ # Note: Step 3 PII components visibility is controlled by event handler on walkthrough_redaction_method_dropdown
+ return (
+ walkthrough_colnames_update, # walkthrough_colnames
+ walkthrough_excel_sheets_update, # walkthrough_excel_sheets
+ main_colnames_update, # in_colnames
+ main_excel_sheets_update, # in_excel_sheets (defined in "Word or Excel/CSV files" tab)
+ gr.Radio(
+ visible=show_text_extract_method
+ ), # walkthrough_text_extract_method_radio
+ gr.Walkthrough(selected=3), # walkthrough
+ )
+ else:
+ # Return unchanged dropdowns and advance
+ # Note: walkthrough_local_ocr_method_radio and walkthrough_handwrite_signature_checkbox visibility
+ # are controlled by event handler on walkthrough_text_extract_method_radio
+ # Note: Step 3 PII components visibility is controlled by event handler on walkthrough_redaction_method_dropdown
+ return (
+ gr.Dropdown(visible=False), # walkthrough_colnames
+ gr.Dropdown(visible=False), # walkthrough_excel_sheets
+ gr.Dropdown(), # in_colnames (no change)
+ gr.Dropdown(visible=False), # in_excel_sheets (no change)
+ gr.Radio(
+ visible=show_text_extract_method
+ ), # walkthrough_text_extract_method_radio
+ gr.Walkthrough(selected=3), # walkthrough
+ )
+
+
+def _data_files_fingerprint(files):
+ """Return a stable key for the current file list to detect redundant updates."""
+ if not files:
+ return ()
+ return tuple(getattr(f, "name", str(f)) for f in files if f is not None)
+
+
+def update_step_2_on_data_file_upload(files, is_data_file, last_processed_keys=None):
+ """Update Step 2 components when data files are uploaded.
+
+ When last_processed_keys is provided (from gr.State), returns (colnames, sheets, new_keys)
+ and skips recomputation if files are unchanged to avoid Gradio re-firing change and
+ causing an infinite loading loop on the column dropdown.
+ """
+ keys = _data_files_fingerprint(files)
+ if last_processed_keys is not None and keys == last_processed_keys:
+ return gr.update(), gr.update(), last_processed_keys
+
+ if is_data_file and files:
+ colnames_dropdown, excel_sheets_dropdown = put_columns_in_df(files)
+ return colnames_dropdown, excel_sheets_dropdown, keys
+ else:
+ # No data file: hide both dropdowns and pass the current fingerprint through unchanged
+ return gr.Dropdown(visible=False), gr.Dropdown(visible=False), keys
+
+
+def handle_text_extract_method_selection(text_extract_method: str):
+ """Handle text extraction method selection - show local OCR radio only if Local OCR model is selected,
+ and show AWS Textract settings only if AWS Textract is selected.
+
+ Args:
+ text_extract_method: Selected text extraction method
+
+ Returns:
+ Tuple of visibility updates for local OCR radio, and AWS Textract accordion
+ """
+ # Normalize (Gradio can send None when .change() fires before sync); default so something stays visible
+ if isinstance(text_extract_method, str):
+ text_extract_method = text_extract_method.strip()
+ if text_extract_method is None or text_extract_method == "":
+ text_extract_method = LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+
+ # Show local OCR method radio only if "Local OCR model - PDFs without selectable text" is selected
+ # When "AWS Bedrock VLM OCR" is selected, the local OCR method is automatically set to "bedrock-vlm" but the component is hidden
+ show_local_ocr = text_extract_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ # Show AWS Textract settings accordion only if "AWS Textract service - all PDF types" is selected
+ show_aws_textract = (
+ text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and SHOW_AWS_TEXT_EXTRACTION_OPTIONS
+ )
+
+ return (
+ gr.update(visible=show_local_ocr), # walkthrough_local_ocr_method_radio
+ gr.update(
+ visible=show_aws_textract
+ ), # walkthrough_handwrite_signature_checkbox
+ )
+
+
+def _is_llm_pii_method_shown_for_redaction(redaction_method, pii_method):
+ """Whether LLM PII controls (entity dropdown + custom instructions accordion) should be visible.
+
+ Mirrors the logic in handle_redaction_method_selection for is_llm_method_init.
+ """
+ if isinstance(redaction_method, str):
+ redaction_method = redaction_method.strip()
+ if redaction_method is None or redaction_method == "":
+ redaction_method = "Redact all PII"
+ if isinstance(pii_method, str):
+ pii_method = pii_method.strip()
+ if pii_method is None or pii_method == "":
+ pii_method = DEFAULT_PII_DETECTION_MODEL
+
+ is_redact_all_pii = redaction_method == "Redact all PII"
+ is_redact_selected_terms = redaction_method == "Redact selected terms"
+ is_redact_all_pii_or_selected_terms = is_redact_all_pii or is_redact_selected_terms
+ show_pii_method = (
+ is_redact_all_pii_or_selected_terms
+ ) and SHOW_PII_IDENTIFICATION_OPTIONS
+
+ pii_method_for_visibility = pii_method
+ if is_redact_all_pii_or_selected_terms and pii_method == NO_REDACTION_PII_OPTION:
+ pii_method_for_visibility = DEFAULT_PII_DETECTION_MODEL
+
+ return show_pii_method and (
+ pii_method_for_visibility == LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ or pii_method_for_visibility == INFERENCE_SERVER_PII_OPTION
+ or pii_method_for_visibility == AWS_LLM_PII_OPTION
+ )
+
+
+def handle_redaction_method_selection(redaction_method: str, pii_method: str):
+ """Handle redaction method selection in Step 3 - show appropriate components based on selection."""
+ # Normalize inputs (Gradio can send whitespace or None when .change() fires before sync)
+ if isinstance(redaction_method, str):
+ redaction_method = redaction_method.strip()
+ if redaction_method is None or redaction_method == "":
+ redaction_method = "Redact all PII"
+ if isinstance(pii_method, str):
+ pii_method = pii_method.strip()
+ if pii_method is None or pii_method == "":
+ pii_method = DEFAULT_PII_DETECTION_MODEL
+
+ # Check which redaction method is selected
+ is_redact_all_pii = redaction_method == "Redact all PII"
+ is_redact_selected_terms = redaction_method == "Redact selected terms"
+ is_redact_all_pii_or_selected_terms = is_redact_all_pii or is_redact_selected_terms
+ is_extract_text_only = (
+ isinstance(redaction_method, str)
+ and redaction_method.strip() == "Extract text only"
+ )
+
+ # When switching from "Extract text only", the PII dropdown may still be
+ # NO_REDACTION_PII_OPTION; use DEFAULT_PII_DETECTION_MODEL so exactly one
+ # entity dropdown is visible and the UI doesn’t show all three or none.
+ pii_method_for_visibility = pii_method
+ if is_redact_all_pii_or_selected_terms and pii_method == NO_REDACTION_PII_OPTION:
+ pii_method_for_visibility = DEFAULT_PII_DETECTION_MODEL
+
+ # Show PII detection settings if "Redact all PII" OR "Redact selected terms" is selected
+ # Both options need PII detection method to determine what to redact
+ show_pii_method = (
+ is_redact_all_pii_or_selected_terms
+ ) and SHOW_PII_IDENTIFICATION_OPTIONS
+
+ # Determine visibility of entity dropdowns based on PII method
+ show_local_entities_init = show_pii_method and (
+ pii_method_for_visibility == LOCAL_PII_OPTION
+ )
+ show_comprehend_entities_init = show_pii_method and (
+ pii_method_for_visibility == AWS_PII_OPTION
+ )
+ is_llm_method_init = _is_llm_pii_method_shown_for_redaction(
+ redaction_method, pii_method
+ )
+
+ # For "Extract text only", hide all components
+ # For "Redact all PII", show PII detection components
+ # For "Redact selected terms", show both PII detection components AND deny/allow/fully redacted list components
+
+ # When we overrode pii_method for visibility, also update the PII dropdown value
+ pii_drop_value = None
+ if is_redact_all_pii_or_selected_terms and pii_method == NO_REDACTION_PII_OPTION:
+ pii_drop_value = DEFAULT_PII_DETECTION_MODEL
+
+ # Set entity values based on redaction method
+ if is_redact_selected_terms:
+ # For "Redact selected terms", only show CUSTOM entity
+ local_entities_update = gr.Dropdown(
+ visible=show_local_entities_init, value=["CUSTOM"]
+ )
+ comprehend_entities_update = gr.Dropdown(
+ visible=show_comprehend_entities_init, value=["CUSTOM"]
+ )
+ llm_entities_update = gr.Dropdown(visible=is_llm_method_init, value=["CUSTOM"])
+ walkthrough_pii_identification_method_drop_update = (
+ gr.update(visible=show_pii_method, value=pii_drop_value)
+ if pii_drop_value is not None
+ else gr.update(visible=show_pii_method)
+ )
+
+ elif is_redact_all_pii:
+ # For "Redact all PII", use default entities
+ # Ensure entities are lists (they should already be parsed in config.py)
+ local_entities_val = (
+ CHOSEN_REDACT_ENTITIES
+ if isinstance(CHOSEN_REDACT_ENTITIES, list)
+ else ["CUSTOM"]
+ )
+ comprehend_entities_val = (
+ CHOSEN_COMPREHEND_ENTITIES
+ if isinstance(CHOSEN_COMPREHEND_ENTITIES, list)
+ else ["CUSTOM"]
+ )
+ llm_entities_val = (
+ CHOSEN_LLM_ENTITIES if isinstance(CHOSEN_LLM_ENTITIES, list) else ["CUSTOM"]
+ )
+ local_entities_update = gr.Dropdown(
+ visible=show_local_entities_init, value=local_entities_val
+ )
+ comprehend_entities_update = gr.Dropdown(
+ visible=show_comprehend_entities_init, value=comprehend_entities_val
+ )
+ llm_entities_update = gr.Dropdown(
+ visible=is_llm_method_init, value=llm_entities_val
+ )
+ walkthrough_pii_identification_method_drop_update = (
+ gr.update(visible=show_pii_method, value=pii_drop_value)
+ if pii_drop_value is not None
+ else gr.update(visible=show_pii_method)
+ )
+ else:
+ # Covers "Extract text only" and any unrecognised value: just update visibility without changing entity values
+ local_entities_update = gr.Dropdown(visible=show_local_entities_init)
+ comprehend_entities_update = gr.Dropdown(visible=show_comprehend_entities_init)
+ llm_entities_update = gr.Dropdown(visible=is_llm_method_init)
+ walkthrough_pii_identification_method_drop_update = gr.update(
+ visible=show_pii_method, value=NO_REDACTION_PII_OPTION
+ )
+
+ return (
+ walkthrough_pii_identification_method_drop_update, # walkthrough_pii_identification_method_drop
+ local_entities_update, # walkthrough_in_redact_entities
+ comprehend_entities_update, # walkthrough_in_redact_comprehend_entities
+ gr.update(visible=is_llm_method_init), # walkthrough_llm_entities_accordion
+ llm_entities_update, # walkthrough_in_redact_llm_entities
+ gr.update(
+ visible=is_redact_all_pii_or_selected_terms
+ ), # walkthrough_list_accordion
+ gr.update(
+ visible=is_redact_all_pii_or_selected_terms
+ ), # walkthrough_max_fuzzy_spelling_mistakes_num
+ )
+
+
+# Update visibility of PII-related components and accordions when general redaction method is selected
+def handle_main_redaction_method_selection(redaction_method, pii_method):
+ """Wrapper that applies handle_redaction_method_selection and updates accordion visibility.
+
+ handle_redaction_method_selection returns (for walkthrough): pii_drop, local_entities,
+ comprehend_entities, llm_accordion_visible, llm_entities, list_accordion, checkbox, num.
+ The main app expects: pii_drop, local_entities, comprehend_entities, llm_entities,
+ custom_llm_entities_accordion, list_accordion, checkbox, num, entity_accordion, terms_accordion.
+ So we remap: use inner[4] for in_redact_llm_entities and set custom_llm_entities_accordion
+ visibility (same rule as handle_main_pii_method_selection; inner gr.update() would hide the box).
+ """
+ raw = list(handle_redaction_method_selection(redaction_method, pii_method))
+ is_redact_all_pii = redaction_method == "Redact all PII"
+ is_redact_selected_terms = redaction_method == "Redact selected terms"
+ is_extract_text_only = (
+ isinstance(redaction_method, str)
+ and redaction_method.strip() == "Extract text only"
+ )
+ show_pii_method = (
+ is_redact_all_pii or is_redact_selected_terms
+ ) and SHOW_PII_IDENTIFICATION_OPTIONS
+ show_selected_terms_lists = is_redact_selected_terms
+ is_llm_method_for_custom_instructions = _is_llm_pii_method_shown_for_redaction(
+ redaction_method, pii_method
+ )
+ # Map to main app outputs: pii_drop, local_entities, comprehend_entities, llm_entities,
+ # custom_llm_entities_accordion, list_accordion, checkbox, num,
+ # then entity/terms accordions, then only_extract_text_radio.
+ # raw[3] is walkthrough llm accordion visibility (unused here); raw[4] is llm_entities.
+ # When "Extract text only" is selected, force "Only extract text (no redaction)" checkbox to True.
+ results = [
+ raw[0], # pii_identification_method_drop
+ raw[1], # in_redact_entities
+ raw[2], # in_redact_comprehend_entities
+ raw[
+ 4
+ ], # in_redact_llm_entities (was wrongly going to textbox as str(["CUSTOM"]) -> "['CUSTOM']")
+ gr.update(
+ visible=is_llm_method_for_custom_instructions
+ ), # custom_llm_entities_accordion
+ raw[5], # walkthrough_list_accordion
+ raw[6], # max_fuzzy_spelling_mistakes_num
+ gr.update(visible=show_pii_method), # entity_types_to_redact_accordion
+ gr.update(visible=show_selected_terms_lists), # terms_accordion
+ gr.update(value=is_extract_text_only), # only_extract_text_radio
+ ]
+ return results
+
+
+def handle_pii_method_selection(pii_method: str):
+ """Handle PII method selection - show appropriate entity dropdowns."""
+ # When value is None/empty (e.g. first .change() after loading an example sets the
+ # component programmatically), avoid hiding all entity selectors by defaulting to Local.
+ if pii_method is None or (isinstance(pii_method, str) and not pii_method.strip()):
+ show_local_entities = True
+ show_comprehend_entities = False
+ is_llm_method = False
+ else:
+ # Check if method is Local
+ show_local_entities = pii_method == LOCAL_PII_OPTION
+ # Check if method is AWS Comprehend
+ show_comprehend_entities = pii_method == AWS_PII_OPTION
+ # Check if method is an LLM option
+ is_llm_method = (
+ pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ or pii_method == INFERENCE_SERVER_PII_OPTION
+ or pii_method == AWS_LLM_PII_OPTION
+ )
+
+ # Use gr.update(visible=...) only to avoid value resets that can trigger change
+ # events on the target components and cause loading loops (e.g. tabular PII -> LLM).
+ # Only return to the two entity dropdowns and the accordion; components inside the
+ # accordion (walkthrough_in_redact_llm_entities, walkthrough_custom_llm_instructions_textbox)
+ # are shown/hidden by the accordion visibility.
+ return (
+ gr.update(visible=show_local_entities), # walkthrough_in_redact_entities
+ gr.update(
+ visible=show_comprehend_entities
+ ), # walkthrough_in_redact_comprehend_entities
+ gr.update(visible=is_llm_method), # walkthrough_llm_entities_accordion
+ )
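+
+
+ # Illustrative wiring sketch (the .change() registration below is an assumption about how the
+ # app hooks up this handler, not taken from app.py; component names are those noted above):
+ # walkthrough_pii_identification_method_drop.change(
+ # handle_pii_method_selection,
+ # inputs=walkthrough_pii_identification_method_drop,
+ # outputs=[walkthrough_in_redact_entities, walkthrough_in_redact_comprehend_entities, walkthrough_llm_entities_accordion],
+ # )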
+
+
+def handle_pii_method_selection_tabular(pii_method: str):
+ """Handle tabular PII method selection. Updates only accordion visibility for the
+ LLM block; leaves walkthrough_in_redact_llm_entities and
+ walkthrough_custom_llm_instructions_textbox as no-ops to avoid loading spinners
+ hanging on those nested components when switching to LLM (AWS Bedrock).
+ """
+ if pii_method is None or (isinstance(pii_method, str) and not pii_method.strip()):
+ show_local_entities = True
+ show_comprehend_entities = False
+ is_llm_method = False
+ else:
+ show_local_entities = pii_method == LOCAL_PII_OPTION
+ show_comprehend_entities = pii_method == AWS_PII_OPTION
+ is_llm_method = (
+ pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ or pii_method == INFERENCE_SERVER_PII_OPTION
+ or pii_method == AWS_LLM_PII_OPTION
+ )
+ return (
+ gr.update(visible=show_local_entities),
+ gr.update(visible=show_comprehend_entities),
+ gr.update(visible=is_llm_method), # accordion controls visibility of LLM block
+ )
+
+
+def handle_step_3_next(
+ text_extract_method_val,
+ local_ocr_method_val,
+ handwrite_signature_val,
+ pii_method_val,
+ redact_entities_val,
+ redact_comprehend_entities_val,
+ redact_llm_entities_val,
+ custom_llm_instructions_val,
+ deny_list_val,
+ allow_list_val,
+ fully_redacted_list_val,
+ pii_method_tabular_val,
+ anon_strategy_val,
+ do_initial_clean_val,
+ redact_duplicate_pages_val,
+ max_fuzzy_spelling_mistakes_num_val,
+):
+ """Handle step 3 next button - write values to main components."""
+ # Update text extraction method with walkthrough value
+ text_extract_method_update = (
+ gr.Radio(value=text_extract_method_val)
+ if text_extract_method_val
+ else gr.Radio()
+ )
+
+ # Update OCR components with walkthrough values
+ local_ocr_update = (
+ gr.Radio(value=local_ocr_method_val) if local_ocr_method_val else gr.Radio()
+ )
+ handwrite_signature_update = (
+ gr.CheckboxGroup(value=handwrite_signature_val)
+ if handwrite_signature_val
+ else gr.CheckboxGroup()
+ )
+
+ # Update PII components with walkthrough values
+ pii_method_update = gr.Radio(value=pii_method_val) if pii_method_val else gr.Radio()
+ # Always update dropdowns with the value, even if it's an empty list
+ # This ensures that empty selections are correctly written to main components
+ redact_entities_update = (
+ gr.Dropdown(value=redact_entities_val)
+ if redact_entities_val is not None
+ else gr.Dropdown()
+ )
+ redact_comprehend_entities_update = (
+ gr.Dropdown(value=redact_comprehend_entities_val)
+ if redact_comprehend_entities_val is not None
+ else gr.Dropdown()
+ )
+ redact_llm_entities_update = (
+ gr.Dropdown(value=redact_llm_entities_val)
+ if redact_llm_entities_val is not None
+ else gr.Dropdown()
+ )
+ custom_llm_instructions_update = (
+ gr.Textbox(value=custom_llm_instructions_val)
+ if custom_llm_instructions_val is not None
+ else gr.Textbox()
+ )
+
+ # Update deny/allow/fully redacted list components with walkthrough values
+ # Convert DataFrame to list if needed (for backward compatibility)
+ # Ensure all items are strings for Dropdown components
+ if deny_list_val is not None:
+ if isinstance(deny_list_val, pd.DataFrame):
+ deny_list_val = (
+ deny_list_val.iloc[:, 0].tolist() if not deny_list_val.empty else []
+ )
+ # Ensure all items are strings
+ if isinstance(deny_list_val, list):
+ deny_list_val = (
+ [str(item) for item in deny_list_val if item] if deny_list_val else []
+ )
+ deny_list_update = (
+ gr.Dropdown(value=deny_list_val) if deny_list_val else gr.Dropdown()
+ )
+ else:
+ deny_list_update = gr.Dropdown()
+
+ if allow_list_val is not None:
+ if isinstance(allow_list_val, pd.DataFrame):
+ allow_list_val = (
+ allow_list_val.iloc[:, 0].tolist() if not allow_list_val.empty else []
+ )
+ # Ensure all items are strings
+ if isinstance(allow_list_val, list):
+ allow_list_val = (
+ [str(item) for item in allow_list_val if item] if allow_list_val else []
+ )
+ allow_list_update = (
+ gr.Dropdown(value=allow_list_val) if allow_list_val else gr.Dropdown()
+ )
+ else:
+ allow_list_update = gr.Dropdown()
+
+ if fully_redacted_list_val is not None:
+ if isinstance(fully_redacted_list_val, pd.DataFrame):
+ fully_redacted_list_val = (
+ fully_redacted_list_val.iloc[:, 0].tolist()
+ if not fully_redacted_list_val.empty
+ else []
+ )
+ # Ensure all items are strings
+ if isinstance(fully_redacted_list_val, list):
+ fully_redacted_list_val = (
+ [str(item) for item in fully_redacted_list_val if item]
+ if fully_redacted_list_val
+ else []
+ )
+ fully_redacted_list_update = (
+ gr.Dropdown(value=fully_redacted_list_val)
+ if fully_redacted_list_val
+ else gr.Dropdown()
+ )
+ else:
+ fully_redacted_list_update = gr.Dropdown()
+
+ # Update tabular data components with walkthrough values
+ pii_method_tabular_update = (
+ gr.Radio(value=pii_method_tabular_val)
+ if pii_method_tabular_val is not None
+ else gr.Radio()
+ )
+ anon_strategy_update = (
+ gr.Radio(value=anon_strategy_val)
+ if anon_strategy_val is not None
+ else gr.Radio()
+ )
+ do_initial_clean_update = (
+ gr.Checkbox(value=do_initial_clean_val)
+ if do_initial_clean_val is not None
+ else gr.Checkbox()
+ )
+
+ # Update redact duplicate pages checkbox with walkthrough value
+ redact_duplicate_pages_update = (
+ gr.Checkbox(value=redact_duplicate_pages_val)
+ if redact_duplicate_pages_val is not None
+ else gr.Checkbox()
+ )
+
+ # Update max fuzzy spelling mistakes number with walkthrough value
+ max_fuzzy_spelling_mistakes_num_update = (
+ gr.Number(value=max_fuzzy_spelling_mistakes_num_val)
+ if max_fuzzy_spelling_mistakes_num_val is not None
+ else gr.Number()
+ )
+
+ return (
+ text_extract_method_update, # text_extract_method_radio
+ local_ocr_update, # local_ocr_method_radio
+ handwrite_signature_update, # handwrite_signature_checkbox
+ pii_method_update, # pii_identification_method_drop
+ redact_entities_update, # in_redact_entities
+ redact_comprehend_entities_update, # in_redact_comprehend_entities
+ redact_llm_entities_update, # in_redact_llm_entities
+ custom_llm_instructions_update, # custom_llm_instructions_textbox
+ deny_list_update, # in_deny_list_state
+ allow_list_update, # in_allow_list_state
+ fully_redacted_list_update, # in_fully_redacted_list_state
+ pii_method_tabular_update, # pii_identification_method_drop_tabular
+ anon_strategy_update, # anon_strategy
+ do_initial_clean_update, # do_initial_clean
+ redact_duplicate_pages_update, # redact_duplicate_pages_checkbox
+ gr.Walkthrough(selected=4), # walkthrough
+ max_fuzzy_spelling_mistakes_num_update, # max_fuzzy_spelling_mistakes_num
+ )
+
+
+def handle_step_4_next(
+ page_min_val,
+ page_max_val,
+ textract_output_found_val,
+ relevant_ocr_output_with_words_found_val,
+ total_pdf_page_count_val,
+ estimated_aws_costs_val,
+ estimated_time_taken_val,
+ cost_code_dataframe_val,
+ cost_code_choice_val,
+):
+ """Handle step 4 next button - write values to main components."""
+ # Update page selection components
+ page_min_update = (
+ gr.Number(value=page_min_val) if page_min_val is not None else gr.Number()
+ )
+ page_max_update = (
+ gr.Number(value=page_max_val) if page_max_val is not None else gr.Number()
+ )
+
+ # Update cost-related components (if SHOW_COSTS is True)
+ textract_output_found_update = (
+ gr.Checkbox(value=textract_output_found_val)
+ if textract_output_found_val is not None
+ else gr.Checkbox()
+ )
+ relevant_ocr_output_with_words_found_update = (
+ gr.Checkbox(value=relevant_ocr_output_with_words_found_val)
+ if relevant_ocr_output_with_words_found_val is not None
+ else gr.Checkbox()
+ )
+ total_pdf_page_count_update = (
+ gr.Number(value=total_pdf_page_count_val)
+ if total_pdf_page_count_val is not None
+ else gr.Number()
+ )
+ estimated_aws_costs_update = (
+ gr.Number(value=estimated_aws_costs_val)
+ if estimated_aws_costs_val is not None
+ else gr.Number()
+ )
+ estimated_time_taken_update = (
+ gr.Number(value=estimated_time_taken_val)
+ if estimated_time_taken_val is not None
+ else gr.Number()
+ )
+
+ # Update cost code components (if GET_COST_CODES or ENFORCE_COST_CODES is True)
+ cost_code_dataframe_update = (
+ gr.Dataframe(value=cost_code_dataframe_val)
+ if cost_code_dataframe_val is not None
+ else gr.Dataframe()
+ )
+ cost_code_choice_update = (
+ gr.Dropdown(value=cost_code_choice_val)
+ if cost_code_choice_val is not None
+ else gr.Dropdown()
+ )
+
+ return (
+ page_min_update, # page_min
+ page_max_update, # page_max
+ textract_output_found_update, # textract_output_found_checkbox
+ relevant_ocr_output_with_words_found_update, # relevant_ocr_output_with_words_found_checkbox
+ total_pdf_page_count_update, # total_pdf_page_count
+ estimated_aws_costs_update, # estimated_aws_costs_number
+ estimated_time_taken_update, # estimated_time_taken_number
+ cost_code_dataframe_update, # cost_code_dataframe
+ cost_code_choice_update, # cost_code_choice_drop
+ gr.Walkthrough(selected=5), # walkthrough
+ )
+
+
+def sync_walkthrough_outputs_to_original(summary_text, output_file_value):
+ """Sync walkthrough output components to original components.
+
+ This function takes the outputs from the redaction process and duplicates
+ them to both walkthrough and original output components.
+
+ Args:
+ summary_text: The output summary text
+ output_file_value: The output file value
+
+ Returns:
+ Tuple of (walkthrough_summary, walkthrough_file, original_summary, original_file)
+ """
+ return (
+ summary_text, # walkthrough_redaction_output_summary_textbox
+ output_file_value, # walkthrough_output_file
+ summary_text, # redaction_output_summary_textbox (original)
+ output_file_value, # output_file (original)
+ )
+
+
+def sync_walkthrough_tabular_outputs_to_original(summary_text, output_file_value):
+ """Sync walkthrough tabular output components to original components.
+
+ This function takes the outputs from the tabular redaction process and duplicates
+ them to both walkthrough and original output components.
+
+ Args:
+ summary_text: The output summary text
+ output_file_value: The output file value
+
+ Returns:
+ Tuple of (walkthrough_summary, walkthrough_file, original_summary, original_file)
+ """
+ return (
+ summary_text, # walkthrough_text_output_summary
+ output_file_value, # walkthrough_text_output_file
+ summary_text, # text_output_summary (original)
+ output_file_value, # text_output_file (original)
+ )
+
+
+def update_step_3_tabular_visibility(is_data_file):
+ """Update visibility of Step 3 components based on file type.
+
+ When a data file (CSV/Excel) is chosen: show tabular options, hide document-only options.
+ When a document is chosen: show document options (PII method, duplicate pages, etc.), hide tabular options.
+
+ Args:
+ is_data_file: Boolean indicating if uploaded file is a data file
+
+ Returns:
+ Tuple of visibility updates for document-only and tabular components
+ """
+ show_doc = not is_data_file
+ return (
+ gr.update(visible=show_doc), # walkthrough_local_ocr_method_radio
+ gr.update(visible=show_doc), # walkthrough_pii_identification_method_drop
+ gr.update(visible=show_doc), # walkthrough_fully_redacted_list_state
+ gr.update(visible=show_doc), # walkthrough_redact_duplicate_pages_checkbox
+ gr.update(
+ visible=is_data_file
+ ), # walkthrough_pii_identification_method_drop_tabular
+ gr.update(visible=is_data_file), # walkthrough_anon_strategy
+ gr.update(visible=is_data_file), # walkthrough_do_initial_clean
+ )
+
+
+def update_step_4_visibility(is_data_file):
+ """Update visibility of Step 4 components based on file type.
+
+ Args:
+ is_data_file: Boolean indicating if uploaded file is a data file
+
+ Returns:
+ Tuple of visibility updates for document and tabular components
+ """
+ # For Row components, we need to update visibility of children
+ # Return updates for button and both output components in each row
+ return (
+ gr.update(visible=not is_data_file), # step_4_next_document_redact_btn
+ gr.update(visible=is_data_file), # step_4_next_tabular_redact_btn
+ )
+
+
+def handle_main_text_extract_method_selection(text_extract_method: str):
+ """Handle text extraction method selection for main components - show local OCR options only if Local OCR model is selected,
+ and show AWS Textract settings only if AWS Textract is selected.
+
+ Args:
+ text_extract_method: Selected text extraction method
+
+ Returns:
+ Tuple of visibility updates for local OCR accordion, inference server accordion, and AWS Textract accordion
+ """
+ # Normalize (Gradio can send None when .change() fires before sync); default so something stays visible
+ if isinstance(text_extract_method, str):
+ text_extract_method = text_extract_method.strip()
+ if text_extract_method is None or text_extract_method == "":
+ text_extract_method = LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+
+ # Show local OCR method accordion only if "Local OCR model - PDFs without selectable text" is selected
+ # When "AWS Bedrock VLM OCR" is selected, the local OCR method is automatically set to "bedrock-vlm" but the component is hidden
+ show_local_ocr = text_extract_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ # Show AWS Textract settings accordion only if "AWS Textract service - all PDF types" is selected
+ show_aws_textract = (
+ text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION
+ and SHOW_AWS_TEXT_EXTRACTION_OPTIONS
+ )
+ # Show inference server VLM model accordion only if local OCR is selected (not Bedrock VLM) and the option is enabled
+ show_inference_server = (
+ text_extract_method == LOCAL_OCR_MODEL_TEXT_EXTRACT_OPTION
+ and SHOW_INFERENCE_SERVER_VLM_MODEL_OPTIONS
+ )
+
+ return (
+ gr.update(visible=show_local_ocr), # local_ocr_method_accordion
+ gr.update(
+ visible=show_inference_server
+ ), # inference_server_vlm_model_accordion
+ gr.update(visible=show_aws_textract), # aws_textract_signature_accordion
+ )
+
+
+def handle_main_pii_method_selection(pii_method):
+ """Handle PII method selection for main components - show appropriate entity dropdowns and hide all if No PII redaction is selected.
+
+ Args:
+ pii_method: Selected PII detection method
+
+ Returns:
+ Tuple of visibility updates for PII method dropdown, local entities accordion, comprehend entities accordion,
+ LLM entities accordion, and LLM custom instructions accordion
+ """
+ # Normalize string (Gradio can send whitespace)
+ if isinstance(pii_method, str):
+ pii_method = pii_method.strip()
+ # When value is None/empty (e.g. .change() fired before component synced), default to Local so at least one section is visible (e.g. when user clicked Local)
+ if pii_method is None or pii_method == "":
+ return (
+ gr.update(visible=True), # local_entities
+ gr.update(visible=False), # comprehend_entities
+ gr.update(visible=False), # llm_entities
+ gr.update(visible=False), # llm_custom_instructions
+ )
+
+ # Check if "No PII redaction" is selected
+ is_no_redaction = pii_method == NO_REDACTION_PII_OPTION
+
+ # If no redaction, hide all PII-related components
+ if is_no_redaction:
+ return (
+ gr.update(visible=False), # local_entities
+ gr.update(visible=False), # comprehend_entities
+ gr.update(visible=False), # llm_entities
+ gr.update(visible=False), # llm_custom_instructions
+ )
+
+ # Check if method is Local
+ show_local_entities = pii_method == LOCAL_PII_OPTION
+ # Check if method is AWS Comprehend
+ show_comprehend_entities = pii_method == AWS_PII_OPTION
+ # Check if method is an LLM option
+ is_llm_method = (
+ pii_method == LOCAL_TRANSFORMERS_LLM_PII_OPTION
+ or pii_method == INFERENCE_SERVER_PII_OPTION
+ or pii_method == AWS_LLM_PII_OPTION
+ )
+
+ return (
+ gr.update(visible=show_local_entities), # local_entities
+ gr.update(visible=show_comprehend_entities), # comprehend_entities
+ gr.update(visible=is_llm_method), # llm_entities
+ gr.update(visible=is_llm_method), # llm_custom_instructions
+ )
diff --git a/tools/redaction_review.py b/tools/redaction_review.py
new file mode 100644
index 0000000000000000000000000000000000000000..0829270afdb688a19501525fb4796cab29c97ab4
--- /dev/null
+++ b/tools/redaction_review.py
@@ -0,0 +1,4040 @@
+import gc
+import os
+import re
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timedelta, timezone
+from typing import Dict, List, Tuple
+from xml.etree.ElementTree import Element, SubElement, tostring
+
+import defusedxml
+import defusedxml.ElementTree as defused_etree
+import defusedxml.minidom as defused_minidom
+
+# Defuse the standard library XML modules for security
+defusedxml.defuse_stdlib()
+
+import gradio as gr
+import numpy as np
+import pandas as pd
+import polars as pl
+import pymupdf
+from gradio_image_annotation.image_annotator import AnnotatedImageData
+from PIL import Image, ImageDraw
+from pymupdf import Document, Rect
+
+from tools.config import (
+ COMPRESS_REDACTED_PDF,
+ CUSTOM_BOX_COLOUR,
+ ENABLE_PARALLEL_FILES_APPLY_REDACTIONS,
+ ENABLE_REVIEW_CSV_PARALLELISM,
+ INPUT_FOLDER,
+ MAX_IMAGE_PIXELS,
+ MAX_WORKERS,
+ OUTPUT_FOLDER,
+ PROFILE_REDACTION_APPLY,
+ RETURN_PDF_FOR_REVIEW,
+ TWO_PASS_REVIEW_PDF_LOW_MEMORY,
+ USE_POLARS_FOR_REVIEW,
+)
+from tools.file_conversion import (
+ convert_annotation_data_to_dataframe,
+ convert_annotation_json_to_review_df,
+ convert_review_df_to_annotation_json,
+ divide_coordinates_by_page_sizes,
+ divide_coordinates_by_page_sizes_pl,
+ fill_missing_ids,
+ is_pdf,
+ multiply_coordinates_by_page_sizes,
+ process_single_page_for_image_conversion,
+ remove_duplicate_images_with_blank_boxes,
+ save_pdf_with_or_without_compression,
+)
+from tools.file_redaction import redact_page_with_pymupdf, set_cropbox_safely
+from tools.helper_functions import (
+ _generate_unique_ids,
+ detect_file_type,
+ get_file_name_without_type,
+)
+from tools.secure_path_utils import (
+ secure_file_write,
+)
+
+if not MAX_IMAGE_PIXELS:
+ Image.MAX_IMAGE_PIXELS = None
+
+# Chunked review CSV: minimum number of pages to enable parallel annotation->DF build
+REVIEW_CSV_PARALLEL_MIN_PAGES = 20
+# Pages per chunk when building review DF from annotations in parallel
+REVIEW_CSV_PAGES_PER_CHUNK = 15
+
+
+def _concat_frames_without_all_na_warning(
+ dfs: List[pd.DataFrame], *, ignore_index: bool = True
+) -> pd.DataFrame:
+ """
+ Vertically concat frames while avoiding pandas FutureWarning about concat dtype
+ rules when some inputs have all-NA columns (pandas >= 2.2).
+
+ Skips void frames (no rows and no columns). For 0-row frames that still define
+ columns, does not run dropna(axis=1): on empty DataFrames that would remove
+ every column.
+ """
+ usable = [df for df in dfs if not df.empty or len(df.columns) > 0]
+ if not usable:
+ return pd.DataFrame()
+ # No concat → no dtype FutureWarning; keep a single frame unchanged.
+ if len(usable) == 1:
+ return usable[0].copy()
+
+ union_cols = list(dict.fromkeys(c for df in usable for c in df.columns))
+
+ def _strip_all_na_columns(df: pd.DataFrame) -> pd.DataFrame:
+ if df.empty:
+ return df.copy()
+ return df.dropna(axis=1, how="all")
+
+ cleaned = [_strip_all_na_columns(df) for df in usable]
+ return pd.concat(cleaned, ignore_index=ignore_index).reindex(columns=union_cols)
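+
+
+ # Illustrative sketch of the intended behaviour:
+ # a = pd.DataFrame({"x": [1], "y": [None]})  # "y" is all-NA, so it is dropped before concat
+ # b = pd.DataFrame({"x": [2], "y": ["ok"]})
+ # _concat_frames_without_all_na_warning([a, b])  # columns ["x", "y"]; "y" becomes [NaN, "ok"]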
+
+
+def _ensure_box_colour_string(colour):
+ """Ensure colour is a string for gradio_image_annotation (JS expects .startsWith)."""
+ if colour is None:
+ return "(0, 0, 0)"
+ if isinstance(colour, str):
+ return colour
+ if isinstance(colour, (tuple, list)) and len(colour) >= 3:
+ return f"({int(colour[0])}, {int(colour[1])}, {int(colour[2])})"
+ return "(0, 0, 0)"
+
+
+def decrease_page(number: int, all_annotations: dict):
+ """
+ Decrease page number for review redactions page.
+ """
+ if not all_annotations:
+ raise Warning("No annotator object loaded")
+
+ if number > 1:
+ return number - 1, number - 1
+ else:
+ gr.Info("At first page", duration=5)
+ raise gr.Error(
+ message="At first page",
+ title="At first page...",
+ visible=False,
+ print_exception=False,
+ )
+
+
+def increase_page(number: int, all_annotations: dict):
+ """
+ Increase page number for review redactions page.
+ """
+
+ if not all_annotations:
+ raise Warning("No annotator object loaded")
+ # return 1, 1
+
+ max_pages = len(all_annotations)
+
+ if number < max_pages:
+ return number + 1, number + 1
+ else:
+ gr.Info("At last page", duration=5)
+ raise gr.Error(
+ message="At last page",
+ title="At last page...",
+ visible=False,
+ print_exception=False,
+ )
+
+
+def update_zoom(
+ current_zoom_level: int, annotate_current_page: int, decrease: bool = True
+):
+ if decrease is False:
+ if current_zoom_level >= 70:
+ current_zoom_level -= 10
+ else:
+ if current_zoom_level < 110:
+ current_zoom_level += 10
+
+ return current_zoom_level, annotate_current_page
+
+
+ def update_dropdown_list_based_on_dataframe(
+ df: pd.DataFrame, column: str
+ ) -> List[str]:
+ """
+ Gather unique elements from a pandas Series of strings, insert 'ALL' at the start, and return the list.
+ """
+ if isinstance(df, pd.DataFrame):
+ # Check if the Series is empty or all NaN
+ if column not in df.columns or df[column].empty or df[column].isna().all():
+ return ["ALL"]
+ elif column != "page":
+ entities = df[column].astype(str).unique().tolist()
+ entities_for_drop = sorted(entities)
+ entities_for_drop.insert(0, "ALL")
+ else:
+ # Ensure the column can be converted to int - assumes it is the page column
+ try:
+ entities = df[column].astype(int).unique()
+ entities_for_drop = sorted(entities)
+ entities_for_drop = [
+ str(e) for e in entities_for_drop
+ ] # Convert back to string
+ entities_for_drop.insert(0, "ALL")
+ except ValueError:
+ return ["ALL"] # Handle case where conversion fails
+
+ return entities_for_drop # Ensure to return the list
+ else:
+ return ["ALL"]
+
+
+def get_filtered_recogniser_dataframe_and_dropdowns(
+ page_image_annotator_object: AnnotatedImageData,
+ recogniser_dataframe_base: pd.DataFrame,
+ recogniser_dropdown_value: str,
+ text_dropdown_value: str,
+ page_dropdown_value: str,
+ review_df: pd.DataFrame = list(),
+ page_sizes: List[str] = list(),
+):
+ """
+ Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
+ """
+
+ recogniser_entities_list = ["Redaction"]
+ recogniser_dataframe_out = recogniser_dataframe_base
+ review_dataframe = review_df
+
+ try:
+
+ review_dataframe = convert_annotation_json_to_review_df(
+ page_image_annotator_object, review_df, page_sizes
+ )
+
+ recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ review_dataframe, "label"
+ )
+ recogniser_entities_drop_spec = dict(
+ value=recogniser_dropdown_value,
+ choices=recogniser_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ )
+
+ # This is the choice list for entities when creating a new redaction box
+ recogniser_entities_list = [
+ entity
+ for entity in recogniser_entities_for_drop.copy()
+ if entity != "Redaction" and entity != "ALL"
+ ] # Remove any existing 'Redaction'
+ recogniser_entities_list.insert(
+ 0, "Redaction"
+ ) # Add 'Redaction' to the start of the list
+
+ text_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ review_dataframe, "text"
+ )
+ text_entities_drop_spec = dict(
+ value=text_dropdown_value,
+ choices=text_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ )
+
+ page_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ review_dataframe, "page"
+ )
+ page_entities_drop_spec = dict(
+ value=page_dropdown_value,
+ choices=page_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ )
+
+ recogniser_dataframe_out = review_dataframe.loc[
+ :, ["page", "label", "text", "id"]
+ ]
+
+ except Exception as e:
+ print("Could not extract recogniser information:", e)
+ recogniser_dataframe_out = recogniser_dataframe_base.loc[
+ :, ["page", "label", "text", "id"]
+ ]
+
+ label_choices = review_dataframe["label"].astype(str).unique().tolist()
+ text_choices = review_dataframe["text"].astype(str).unique().tolist()
+ page_choices = review_dataframe["page"].astype(str).unique().tolist()
+
+ recogniser_entities_drop_spec = dict(
+ value=recogniser_dropdown_value,
+ choices=label_choices,
+ allow_custom_value=True,
+ interactive=True,
+ )
+ recogniser_entities_list = ["Redaction"]
+ text_entities_drop_spec = dict(
+ value=text_dropdown_value,
+ choices=text_choices,
+ allow_custom_value=True,
+ interactive=True,
+ )
+ page_entities_drop_spec = dict(
+ value=page_dropdown_value,
+ choices=page_choices,
+ allow_custom_value=True,
+ interactive=True,
+ )
+
+ return (
+ recogniser_dataframe_out,
+ recogniser_dataframe_out,
+ recogniser_entities_drop_spec,
+ recogniser_entities_list,
+ text_entities_drop_spec,
+ page_entities_drop_spec,
+ )
+
+
+def update_recogniser_dataframes(
+ page_image_annotator_object: AnnotatedImageData,
+ recogniser_dataframe_base: pd.DataFrame,
+ recogniser_entities_dropdown_value: str = "ALL",
+ text_dropdown_value: str = "ALL",
+ page_dropdown_value: str = "ALL",
+ review_df: pd.DataFrame = list(),
+ page_sizes: list[str] = list(),
+):
+ """
+ Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
+ """
+ recogniser_entities_list = ["Redaction"]
+ recogniser_dataframe_out = pd.DataFrame()
+ recogniser_dataframe_out_gr = pd.DataFrame()
+
+ # If the base recogniser dataframe is empty, it needs to be created.
+ if recogniser_dataframe_base.empty:
+ (
+ recogniser_dataframe_out_gr,
+ recogniser_dataframe_out,
+ recogniser_entities_drop_spec,
+ recogniser_entities_list,
+ text_entities_drop_spec,
+ page_entities_drop_spec,
+ ) = get_filtered_recogniser_dataframe_and_dropdowns(
+ page_image_annotator_object,
+ recogniser_dataframe_base,
+ recogniser_entities_dropdown_value,
+ text_dropdown_value,
+ page_dropdown_value,
+ review_df,
+ page_sizes,
+ )
+ return (
+ recogniser_entities_list,
+ recogniser_dataframe_out_gr,
+ recogniser_dataframe_out,
+ gr.update(**recogniser_entities_drop_spec),
+ gr.update(**text_entities_drop_spec),
+ gr.update(**page_entities_drop_spec),
+ )
+ elif recogniser_dataframe_base.iloc[0, 0] == "":
+ (
+ recogniser_dataframe_out_gr,
+ recogniser_dataframe_out,
+ recogniser_entities_drop_spec,
+ recogniser_entities_list,
+ text_entities_drop_spec,
+ page_entities_drop_spec,
+ ) = get_filtered_recogniser_dataframe_and_dropdowns(
+ page_image_annotator_object,
+ recogniser_dataframe_base,
+ recogniser_entities_dropdown_value,
+ text_dropdown_value,
+ page_dropdown_value,
+ review_df,
+ page_sizes,
+ )
+ return (
+ recogniser_entities_list,
+ recogniser_dataframe_out_gr,
+ recogniser_dataframe_out,
+ gr.update(**recogniser_entities_drop_spec),
+ gr.update(**text_entities_drop_spec),
+ gr.update(**page_entities_drop_spec),
+ )
+ else:
+ (
+ recogniser_dataframe_out_gr,
+ recogniser_dataframe_out,
+ _recogniser_drop_spec,
+ recogniser_entities_list,
+ _text_drop_spec,
+ _page_drop_spec,
+ ) = get_filtered_recogniser_dataframe_and_dropdowns(
+ page_image_annotator_object,
+ recogniser_dataframe_base,
+ recogniser_entities_dropdown_value,
+ text_dropdown_value,
+ page_dropdown_value,
+ review_df,
+ page_sizes,
+ )
+
+ review_dataframe, text_entities_drop, page_entities_drop = (
+ update_entities_df_recogniser_entities(
+ recogniser_entities_dropdown_value,
+ recogniser_dataframe_out,
+ page_dropdown_value,
+ text_dropdown_value,
+ )
+ )
+
+ # recogniser_dataframe_out_gr = gr.Dataframe(
+ # review_dataframe[["page", "label", "text", "id"]],
+ # show_search="filter",
+ # type="pandas",
+ # headers=["page", "label", "text", "id"],
+ # wrap=True,
+ # max_height=400,
+ # )
+
+ recogniser_dataframe_out_gr = review_dataframe[["page", "label", "text", "id"]]
+
+ recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ recogniser_dataframe_out, "label"
+ )
+
+ recogniser_entities_list_base = (
+ recogniser_dataframe_out["label"].astype(str).unique().tolist()
+ )
+
+ # Recogniser entities list is the list of choices that appear when you make a new redaction box
+ recogniser_entities_list = [
+ entity for entity in recogniser_entities_list_base if entity != "Redaction"
+ ]
+ recogniser_entities_list.insert(0, "Redaction")
+
+ return (
+ recogniser_entities_list,
+ recogniser_dataframe_out_gr,
+ recogniser_dataframe_out,
+ gr.update(
+ value=recogniser_entities_dropdown_value,
+ choices=recogniser_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ text_entities_drop,
+ page_entities_drop,
+ )
+
+
+def undo_last_removal(
+ backup_review_state: pd.DataFrame,
+ backup_image_annotations_state: list[dict],
+ backup_recogniser_entity_dataframe_base: pd.DataFrame,
+):
+
+ if backup_image_annotations_state:
+ return (
+ backup_review_state,
+ backup_image_annotations_state,
+ backup_recogniser_entity_dataframe_base,
+ )
+ else:
+ raise Warning("No actions have been taken to undo")
+
+
+def update_annotator_page_from_review_df(
+ review_df: pd.DataFrame,
+ image_file_paths: List[str],
+ page_sizes: List[dict],
+ current_image_annotations_state: List[dict],
+ current_page_annotator: object,
+ selected_recogniser_entity_df_row: pd.DataFrame,
+ input_folder: str,
+ doc_full_file_name_textbox: str,
+) -> Tuple[object, List[dict], int, List[dict], pd.DataFrame, int]:
+ """
+ Update the visible annotation object and related objects with the latest review file information,
+ optimising by processing only the current page's data.
+
+ Args:
+ review_df (pd.DataFrame): The DataFrame containing review information for all annotations.
+ image_file_paths (List[str]): List of image file paths, one per document page.
+ page_sizes (List[dict]): List of dictionaries holding page size metadata (width/height etc) for each page.
+ current_image_annotations_state (List[dict]): Annotation state for all pages; typically a list of dicts, one per page.
+ current_page_annotator (object): The annotation object for the currently visible page, usually a dict or a custom annotation object.
+ selected_recogniser_entity_df_row (pd.DataFrame): DataFrame row of the currently selected recogniser/entity, used to extract current page info.
+ input_folder (str): Folder containing input source data.
+ doc_full_file_name_textbox (str): The full filename of the document as displayed in the textbox/UI.
+
+ Returns:
+ Tuple[object, List[dict], int, List[dict], pd.DataFrame, int]:
+ A tuple containing:
+ - The updated annotation object for the current page.
+ - The updated annotation state for all pages.
+ - The current page number being displayed (1-based).
+ - The annotation state for all pages after any updates.
+ - The possibly updated recogniser/entity DataFrame row.
+ - The previous page number to annotate (for navigation/state logic).
+ """
+ # Assume current_image_annotations_state is List[dict] and current_page_annotator is dict
+ out_image_annotations_state: List[dict] = list(
+ current_image_annotations_state
+ ) # Make a copy to avoid modifying input in place
+ out_current_page_annotator: dict = current_page_annotator
+
+ # Get the target page number from the selected row
+ # Safely access the page number, handling potential errors or empty DataFrame
+ gradio_annotator_current_page_number: int = 1
+ annotate_previous_page: int = (
+ 0 # Renaming for clarity if needed, matches original output
+ )
+
+ if (
+ not selected_recogniser_entity_df_row.empty
+ and "page" in selected_recogniser_entity_df_row.columns
+ ):
+ try:
+ selected_page = selected_recogniser_entity_df_row["page"].iloc[0]
+ gradio_annotator_current_page_number = int(selected_page)
+ annotate_previous_page = (
+ gradio_annotator_current_page_number # Store original page number
+ )
+ except (IndexError, ValueError, TypeError):
+ print(
+ "Warning: Could not extract valid page number from selected_recogniser_entity_df_row. Defaulting to page 1."
+ )
+ gradio_annotator_current_page_number = (
+ 1 # Or 0 depending on 1-based vs 0-based indexing elsewhere
+ )
+
+ # Ensure page number is valid and 1-based for external display/logic
+ if gradio_annotator_current_page_number <= 0:
+ gradio_annotator_current_page_number = 1
+
+ page_max_reported = len(page_sizes) # len(out_image_annotations_state)
+ if gradio_annotator_current_page_number > page_max_reported:
+ print("current page is greater than highest page:", page_max_reported)
+ gradio_annotator_current_page_number = page_max_reported # Cap at max pages
+
+ page_num_reported_zero_indexed = gradio_annotator_current_page_number - 1
+
+ # Process page sizes DataFrame early, as it's needed for image path handling and potentially coordinate multiplication
+ page_sizes_df = pd.DataFrame(page_sizes)
+ if not page_sizes_df.empty:
+ # Safely convert page column to numeric and then int
+ page_sizes_df["page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
+ page_sizes_df.dropna(subset=["page"], inplace=True)
+ if not page_sizes_df.empty:
+ page_sizes_df["page"] = page_sizes_df["page"].astype(int)
+ else:
+ print("Warning: Page sizes DataFrame became empty after processing.")
+
+ if not review_df.empty:
+ # Filter review_df for the current page
+ # Ensure 'page' column in review_df is comparable to page_num_reported
+ if "page" in review_df.columns:
+ review_df["page"] = (
+ pd.to_numeric(review_df["page"], errors="coerce").fillna(-1).astype(int)
+ )
+
+ current_image_path = out_image_annotations_state[
+ page_num_reported_zero_indexed
+ ]["image"]
+
+ replaced_image_path, page_sizes_df = (
+ replace_placeholder_image_with_real_image(
+ doc_full_file_name_textbox,
+ current_image_path,
+ page_sizes_df,
+ gradio_annotator_current_page_number,
+ input_folder,
+ )
+ )
+
+ # page_sizes_df has been changed - save back to page_sizes_object
+ page_sizes = page_sizes_df.to_dict(orient="records")
+ review_df.loc[
+ review_df["page"] == gradio_annotator_current_page_number, "image"
+ ] = replaced_image_path
+ images_list = list(page_sizes_df["image_path"])
+ images_list[page_num_reported_zero_indexed] = replaced_image_path
+ out_image_annotations_state[page_num_reported_zero_indexed][
+ "image"
+ ] = replaced_image_path
+
+ current_page_review_df = review_df[
+ review_df["page"] == gradio_annotator_current_page_number
+ ].copy()
+ current_page_review_df = multiply_coordinates_by_page_sizes(
+ current_page_review_df, page_sizes_df
+ )
+
+ else:
+ print(
+ f"Warning: 'page' column not found in review_df. Cannot filter for page {gradio_annotator_current_page_number}. Skipping update from review_df."
+ )
+ current_page_review_df = pd.DataFrame() # Empty dataframe if filter fails
+
+ if not current_page_review_df.empty:
+ # Convert the current page's review data to annotation list format for *this page*
+
+ current_page_annotations_list = list()
+ # Define expected annotation dict keys, including 'image', 'page', coords, 'label', 'text', 'color' etc.
+ # Assuming review_df has compatible columns
+ expected_annotation_keys = [
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "text",
+ "id",
+ ] # Add/remove as needed
+
+ # Ensure necessary columns exist in current_page_review_df before converting rows
+ for key in expected_annotation_keys:
+ if key not in current_page_review_df.columns:
+ # Add missing column with default value. Use 0.0 for coords so
+ # gradio_image_annotation never receives None/NaN (causes TypeError in preprocess_boxes).
+ default_value = (
+ 0.0 if key in ["xmin", "ymin", "xmax", "ymax"] else ""
+ )
+ current_page_review_df[key] = default_value
+
+ # Ensure coord columns have no NaN/None so image_annotator preprocess_boxes doesn't raise TypeError
+ for coord in ["xmin", "ymin", "xmax", "ymax"]:
+ if coord in current_page_review_df.columns:
+ current_page_review_df[coord] = pd.to_numeric(
+ current_page_review_df[coord], errors="coerce"
+ ).fillna(0.0)
+
+ # Convert filtered DataFrame rows to list of dicts
+ # Using .to_dict(orient='records') is efficient for this
+ current_page_annotations_list_raw = current_page_review_df[
+ expected_annotation_keys
+ ].to_dict(orient="records")
+
+ current_page_annotations_list = current_page_annotations_list_raw
+
+ # Update the annotations state for the current page
+ page_state_entry_found = False
+ for i, page_state_entry in enumerate(out_image_annotations_state):
+ # Derive the page number for this entry from its image filename (compared zero-indexed below)
+
+ from tools.secure_regex_utils import (
+ safe_extract_page_number_from_filename,
+ )
+
+ page_no = safe_extract_page_number_from_filename(
+ page_state_entry["image"]
+ )
+ if page_no is None:
+ page_no = 0
+
+ if (
+ "image" in page_state_entry
+ and page_no == page_num_reported_zero_indexed
+ ):
+ # Replace the annotations list for this page with the new list from review_df
+ out_image_annotations_state[i][
+ "boxes"
+ ] = current_page_annotations_list
+
+ # Update the image path as well, based on review_df if available, or keep existing
+ # Assuming review_df has an 'image' column for this page
+ if (
+ "image" in current_page_review_df.columns
+ and not current_page_review_df.empty
+ ):
+ # Use the image path from the first row of the filtered review_df
+ out_image_annotations_state[i]["image"] = (
+ current_page_review_df["image"].iloc[0]
+ )
+ page_state_entry_found = True
+ break
+
+ if not page_state_entry_found:
+ print(
+ f"Warning: Entry for page {gradio_annotator_current_page_number} not found in current_image_annotations_state. Cannot update page annotations."
+ )
+
+ # --- Image Path and Page Size Handling ---
+ # Get the image path for the current page from the updated state
+ current_image_path = None
+ if (
+ len(out_image_annotations_state) > page_num_reported_zero_indexed
+ and "image" in out_image_annotations_state[page_num_reported_zero_indexed]
+ ):
+ current_image_path = out_image_annotations_state[
+ page_num_reported_zero_indexed
+ ]["image"]
+ else:
+ print(
+ f"Warning: Could not get image path from state for page index {page_num_reported_zero_indexed}."
+ )
+
+ # Replace placeholder image with real image path if needed
+ if current_image_path and not page_sizes_df.empty:
+ try:
+ replaced_image_path, page_sizes_df = (
+ replace_placeholder_image_with_real_image(
+ doc_full_file_name_textbox,
+ current_image_path,
+ page_sizes_df,
+ gradio_annotator_current_page_number, # Use 1-based page number
+ input_folder,
+ )
+ )
+
+ # Update state and review_df with the potentially replaced image path
+ if len(out_image_annotations_state) > page_num_reported_zero_indexed:
+ out_image_annotations_state[page_num_reported_zero_indexed][
+ "image"
+ ] = replaced_image_path
+
+ if "page" in review_df.columns and "image" in review_df.columns:
+ review_df.loc[
+ review_df["page"] == gradio_annotator_current_page_number, "image"
+ ] = replaced_image_path
+
+ except Exception as e:
+ print(
+ f"Error during image path replacement for page {gradio_annotator_current_page_number}: {e}"
+ )
+ else:
+ print(
+ f"Warning: Page index {page_num_reported_zero_indexed} out of bounds for all_image_annotations list."
+ )
+
+ # Save back page_sizes_df to page_sizes list format
+ if not page_sizes_df.empty:
+ page_sizes = page_sizes_df.to_dict(orient="records")
+ else:
+ page_sizes = list() # Ensure page_sizes is a list if df is empty
+
+ # --- Re-evaluate Coordinate Multiplication and Duplicate Removal ---
+ # remove_duplicate_images_with_blank_boxes expects the raw list-of-dicts state format:
+ try:
+ out_image_annotations_state = remove_duplicate_images_with_blank_boxes(
+ out_image_annotations_state
+ )
+ except Exception as e:
+ print(
+ f"Error during duplicate removal: {e}. Proceeding without duplicate removal."
+ )
+
+ # Select the current page's annotation object from the (potentially updated) state
+ if len(out_image_annotations_state) > page_num_reported_zero_indexed:
+ out_current_page_annotator = out_image_annotations_state[
+ page_num_reported_zero_indexed
+ ]
+ else:
+ print(
+ f"Warning: Cannot select current page annotator object for index {page_num_reported_zero_indexed}."
+ )
+ out_current_page_annotator = {} # Or None, depending on expected output type
+
+ # Return final page number
+ final_page_number_returned = gradio_annotator_current_page_number
+
+ return (
+ out_current_page_annotator,
+ out_image_annotations_state,
+ final_page_number_returned,
+ page_sizes,
+ review_df, # review_df might have its 'page' column type changed, keep it as is or revert if necessary
+ annotate_previous_page,
+ ) # The original page number from selected_recogniser_entity_df_row
+
+
+def _merge_horizontally_adjacent_boxes(
+ df: pd.DataFrame,
+ x_merge_threshold: float = 0.02,
+ y_merge_threshold: float = 0.01,
+) -> pd.DataFrame:
+ """
+ Merges horizontally adjacent bounding boxes within the same visual line.
+
+ Only merges boxes that share the same page and OCR 'line' value, so that
+ merged boxes do not span multiple lines and pick up an incorrect ymax
+ (e.g. 1.0 when the OCR "line" field is shared across the page).
+
+ Args:
+ df (pd.DataFrame): DataFrame containing annotation boxes with columns
+ like 'page', 'line', 'xmin', 'xmax', 'ymin', 'ymax', etc.
+ x_merge_threshold (float): The maximum gap on the x-axis (normalised 0-1)
+ to consider two boxes as adjacent.
+ y_merge_threshold (float): The maximum vertical distance (normalised 0-1)
+ to consider two boxes on the same visual line. Currently unused: same-line
+ grouping relies on the OCR 'line' field instead.
+
+ Returns:
+ pd.DataFrame: A new DataFrame with adjacent boxes merged.
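+
+ Example (illustrative values, assuming normalised 0-1 coordinates):
+ Two words on the same page and line, "John" (xmin=0.10, xmax=0.14) and
+ "Smith" (xmin=0.15, xmax=0.22), have a horizontal gap of 0.01, which is
+ within x_merge_threshold, so they are merged into a single box spanning
+ xmin=0.10 to xmax=0.22 with text "John Smith".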
+ """
+ if df.empty:
+ return df
+
+ # 1. Sort by page, then OCR line, then horizontal position (xmin)
+ # so that we compare consecutive words on the same visual line.
+ df_sorted = df.sort_values(by=["page", "line", "xmin"]).copy()
+
+ # 2. Identify groups of boxes to merge using shift() and cumsum()
+ # Get properties of the 'previous' box in the sorted list
+ prev_xmax = df_sorted["xmax"].shift(1)
+ prev_page = df_sorted["page"].shift(1)
+ prev_line = df_sorted["line"].shift(1)
+
+ # Same text line
+ same_visual_line = (df_sorted["page"] == prev_page) & (
+ df_sorted["line"] == prev_line
+ )
+
+ # A box should be merged with the previous one if it's on the same page and
+ # the same OCR line, and the horizontal gap is within the x threshold.
+ is_adjacent = same_visual_line & (
+ df_sorted["xmin"] - prev_xmax <= x_merge_threshold
+ )
+
+ # A new group starts wherever a box is NOT adjacent to the previous one.
+ # cumsum() on this boolean series creates a unique ID for each group.
+ df_sorted["merge_group"] = (~is_adjacent).cumsum()
+
+ # 3. Aggregate each group into a single bounding box
+ # Define how to aggregate each column
+ agg_funcs = {
+ "xmin": "min",
+ "ymin": "min", # To get the highest point of the combined box
+ "xmax": "max",
+ "ymax": "max", # To ensure we cover all text
+ "text": lambda s: " ".join(s.astype(str)), # Join the text
+ # Carry over the first value for columns that are constant within a group
+ "page": "first",
+ "line": "first",
+ "image": "first",
+ "label": "first",
+ "color": "first",
+ }
+
+ merged_df = df_sorted.groupby("merge_group").agg(agg_funcs).reset_index(drop=True)
+
+ return merged_df
+
+
+def get_and_merge_current_page_annotations(
+ page_sizes: List[Dict],
+ annotate_current_page: int,
+ existing_annotations_list: List[Dict],
+ existing_annotations_df: pd.DataFrame,
+) -> pd.DataFrame:
+ """
+ Extract the annotations for the current page from existing_annotations_list and
+ merge them into existing_annotations_df, deduplicating on non-null annotation ids
+ and returning the combined DataFrame sorted by page and position.
+ """
+ current_page_image = page_sizes[annotate_current_page - 1]["image_path"]
+
+ existing_annotations_current_page = [
+ item
+ for item in existing_annotations_list
+ if item["image"] == current_page_image
+ ]
+
+ current_page_annotations_df = convert_annotation_data_to_dataframe(
+ existing_annotations_current_page
+ )
+
+ # Concatenate and clean, ensuring no duplicates and sorted order.
+ # Deduplicate only on non-null ids: drop_duplicates(subset=["id"]) treats NaN values as equal,
+ # which would collapse all rows with a missing id into one and drop annotations on other pages.
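+ # e.g. two boxes that both lack an id must both be kept; only rows sharing the same non-null id are collapsed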
+ dfs_to_concat = [
+ df
+ for df in [existing_annotations_df, current_page_annotations_df]
+ if not df.empty
+ ]
+ if dfs_to_concat:
+ combined = _concat_frames_without_all_na_warning(
+ dfs_to_concat, ignore_index=True
+ )
+ if "id" in combined.columns:
+ has_id = combined["id"].notna()
+ if has_id.any():
+ deduped = combined.loc[has_id].drop_duplicates(
+ subset=["id"], keep="first"
+ )
+ no_id = combined.loc[~has_id]
+ parts = [p for p in [no_id, deduped] if not p.empty]
+ if len(parts) == 1:
+ updated_df = parts[0].sort_values(by=["page", "xmin", "ymin"])
+ else:
+ updated_df = _concat_frames_without_all_na_warning(
+ parts, ignore_index=True
+ ).sort_values(by=["page", "xmin", "ymin"])
+ else:
+ updated_df = combined.sort_values(by=["page", "xmin", "ymin"])
+ else:
+ updated_df = combined.sort_values(by=["page", "xmin", "ymin"])
+ else:
+ # Return empty DataFrame with expected columns from convert_annotation_data_to_dataframe
+ updated_df = pd.DataFrame(
+ columns=[
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "text",
+ "id",
+ ]
+ )
+
+ # Cap ymax at 1.0 (keeping ymax > ymin) so that boxes from OCR rows where the "line"
+ # field is shared across the page cannot extend past the bottom of the page.
+ if (
+ not updated_df.empty
+ and "ymax" in updated_df.columns
+ and "ymin" in updated_df.columns
+ ):
+ ymax_cap = 1.0
+ ymax_vals = pd.to_numeric(updated_df["ymax"], errors="coerce")
+ need_cap = ymax_vals >= 1.0
+ if need_cap.any():
+ updated_df = updated_df.copy()
+ updated_df.loc[need_cap, "ymax"] = ymax_vals.loc[need_cap].clip(
+ upper=ymax_cap
+ )
+ # Keep box valid: ymax must remain > ymin
+ ymin_vals = pd.to_numeric(updated_df.loc[need_cap, "ymin"], errors="coerce")
+ invalid = updated_df.loc[need_cap, "ymax"].values <= ymin_vals.values
+ if invalid.any():
+ idx = updated_df.index[need_cap][invalid]
+ updated_df.loc[idx, "ymax"] = (
+ pd.to_numeric(updated_df.loc[idx, "ymin"], errors="coerce") + 1e-6
+ )
+
+ return updated_df
+
+
+def create_annotation_objects_from_filtered_ocr_results_with_words(
+ filtered_ocr_results_with_words_df: pd.DataFrame,
+ ocr_results_with_words_df_base: pd.DataFrame,
+ page_sizes: List[Dict],
+ existing_annotations_df: pd.DataFrame,
+ existing_annotations_list: List[Dict],
+ existing_recogniser_entity_df: pd.DataFrame,
+ redaction_label: str = "Redaction",
+ colour_label: str = str(CUSTOM_BOX_COLOUR),
+ annotate_current_page: int = 1,
+ progress: gr.Progress = gr.Progress(),
+) -> Tuple[
+ List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame
+]:
+ """
+ This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
+
+ Args:
+ filtered_ocr_results_with_words_df (pd.DataFrame): A DataFrame containing filtered OCR results with words.
+ ocr_results_with_words_df_base (pd.DataFrame): The base DataFrame of OCR results with words.
+ page_sizes (List[Dict]): A list of dictionaries containing page sizes.
+ existing_annotations_df (pd.DataFrame): A DataFrame of existing annotations.
+ existing_annotations_list (List[Dict]): A list of dictionaries representing existing annotations.
+ existing_recogniser_entity_df (pd.DataFrame): A DataFrame of existing recogniser entities.
+ redaction_label (str, optional): Label to assign to the new annotation boxes. Defaults to "Redaction".
+ colour_label (str, optional): Colour for the new annotation boxes. Defaults to str(CUSTOM_BOX_COLOUR).
+ annotate_current_page (int, optional): The current page number (1-based). Defaults to 1.
+ progress (gr.Progress, optional): A progress tracker. Defaults to gr.Progress().
+
+ Returns:
+ Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
+ """
+
+ existing_annotations_df = get_and_merge_current_page_annotations(
+ page_sizes,
+ annotate_current_page,
+ existing_annotations_list,
+ existing_annotations_df,
+ )
+
+ # Validate colour_label: accept an "(r, g, b)" string (0-255), a 3-value tuple/list,
+ # a hex string, or an rgba(...) string. If invalid, fall back to str(CUSTOM_BOX_COLOUR).
+ fallback_colour = str(CUSTOM_BOX_COLOUR)
+ colour_label = str(colour_label)
+
+ def _parse_hex_to_rgb(s: str):
+ """Parse #RGB, #RRGGBB, RGB or RRGGBB to (r, g, b) or None."""
+ s = s.strip()
+ if s.startswith("#"):
+ s = s[1:].strip()
+ if len(s) not in (3, 6):
+ return None
+ if not all(c in "0123456789aAbBcCdDeEfF" for c in s):
+ return None
+ try:
+ if len(s) == 3:
+ r_val = int(s[0] * 2, 16)
+ g_val = int(s[1] * 2, 16)
+ b_val = int(s[2] * 2, 16)
+ else:
+ r_val = int(s[0:2], 16)
+ g_val = int(s[2:4], 16)
+ b_val = int(s[4:6], 16)
+ return (r_val, g_val, b_val)
+ except ValueError:
+ return None
+
+ def _parse_rgba_to_rgb(s: str):
+ """Parse rgba(r, g, b, a) to (r, g, b) in 0-255, or None. Handles 0-1 and 0-255 RGB."""
+ s = s.strip()
+ if len(s) > 120:
+ return None
+ if not s.lower().startswith("rgba(") or ")" not in s:
+ return None
+ match = re.match(
+ r"rgba\s*\(\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*[\d.]+\s*\)",
+ s,
+ re.IGNORECASE,
+ )
+ if not match:
+ return None
+ try:
+ r_val = float(match.group(1))
+ g_val = float(match.group(2))
+ b_val = float(match.group(3))
+ if max(r_val, g_val, b_val) > 1:
+ r_val, g_val, b_val = (
+ int(round(r_val)),
+ int(round(g_val)),
+ int(round(b_val)),
+ )
+ else:
+ r_val = int(round(r_val * 255))
+ g_val = int(round(g_val * 255))
+ b_val = int(round(b_val * 255))
+ if 0 <= r_val <= 255 and 0 <= g_val <= 255 and 0 <= b_val <= 255:
+ return (r_val, g_val, b_val)
+ except (ValueError, TypeError):
+ pass
+ return None
+
+ try:
+ valid = False
+ if isinstance(colour_label, str):
+ label_str = colour_label.strip()
+ from tools.secure_regex_utils import safe_extract_rgb_values
+
+ rgb_values = safe_extract_rgb_values(label_str)
+ if not rgb_values:
+ rgb_values = _parse_hex_to_rgb(label_str)
+ if not rgb_values:
+ rgb_values = _parse_rgba_to_rgb(label_str)
+ if rgb_values:
+ r_val, g_val, b_val = rgb_values
+ if 0 <= r_val <= 255 and 0 <= g_val <= 255 and 0 <= b_val <= 255:
+ colour_label = f"({r_val}, {g_val}, {b_val})"
+ valid = True
+ elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
+ r_val, g_val, b_val = colour_label
+ if all(isinstance(v, int) for v in (r_val, g_val, b_val)) and all(
+ 0 <= v <= 255 for v in (r_val, g_val, b_val)
+ ):
+ colour_label = f"({r_val}, {g_val}, {b_val})"
+ valid = True
+ if not valid:
+ colour_label = fallback_colour
+ except Exception:
+ colour_label = fallback_colour
+
+ progress(0.2, desc="Identifying new redactions to add")
+ print("Identifying new redactions to add")
+ if filtered_ocr_results_with_words_df.empty:
+ print("No new annotations to add.")
+ updated_annotations_df = existing_annotations_df.copy()
+ else:
+ # Use the 'index' column of the filtered results to look up the corresponding rows in the base OCR DataFrame
+ filtered_ocr_results_with_words_df.index = filtered_ocr_results_with_words_df[
+ "index"
+ ]
+ new_annotations_df = ocr_results_with_words_df_base.loc[
+ filtered_ocr_results_with_words_df.index
+ ].copy()
+
+ if new_annotations_df.empty:
+ print("No new annotations to add.")
+ updated_annotations_df = existing_annotations_df.copy()
+ else:
+ page_to_image_map = {
+ item["page"]: item["image_path"] for item in page_sizes
+ }
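+ # Map page number (1-based) -> image path so each new annotation row can be assigned its page image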
+
+ # Prepare the initial new annotations DataFrame
+ new_annotations_df = new_annotations_df.assign(
+ image=lambda df: df["page"].map(page_to_image_map),
+ label=redaction_label,
+ color=colour_label,
+ ).rename(
+ columns={
+ "word_x0": "xmin",
+ "word_y0": "ymin",
+ "word_x1": "xmax",
+ "word_y1": "ymax",
+ "word_text": "text",
+ }
+ )
+
+ # Clip box to line-level bounds (all four coordinates) when available
+ _eps = 1e-6
+ line_cols = ["line_x0", "line_x1", "line_y0", "line_y1"]
+ has_line = all(c in new_annotations_df.columns for c in line_cols)
+ if has_line:
+ ymax_fallback = 1.0 - _eps
+ lx0 = pd.to_numeric(new_annotations_df["line_x0"], errors="coerce")
+ lx1 = pd.to_numeric(new_annotations_df["line_x1"], errors="coerce")
+ ly0 = pd.to_numeric(new_annotations_df["line_y0"], errors="coerce")
+ ly1 = pd.to_numeric(new_annotations_df["line_y1"], errors="coerce")
+ valid = (
+ lx0.notna()
+ & lx1.notna()
+ & ly0.notna()
+ & ly1.notna()
+ & (lx0 >= 0)
+ & (lx1 <= 1)
+ & (ly0 >= 0)
+ & (ly1 <= 1)
+ & (lx0 < lx1)
+ & (ly0 < ly1)
+ )
+ if valid.any():
+ new_annotations_df = new_annotations_df.copy()
+ ly1_safe = ly1.where(ly1 < 1).fillna(ymax_fallback)
+ new_annotations_df.loc[valid, "xmin"] = pd.to_numeric(
+ new_annotations_df.loc[valid, "xmin"], errors="coerce"
+ ).clip(lower=lx0.loc[valid])
+ new_annotations_df.loc[valid, "xmax"] = pd.to_numeric(
+ new_annotations_df.loc[valid, "xmax"], errors="coerce"
+ ).clip(upper=lx1.loc[valid])
+ new_annotations_df.loc[valid, "ymin"] = pd.to_numeric(
+ new_annotations_df.loc[valid, "ymin"], errors="coerce"
+ ).clip(lower=ly0.loc[valid])
+ new_annotations_df.loc[valid, "ymax"] = pd.to_numeric(
+ new_annotations_df.loc[valid, "ymax"], errors="coerce"
+ ).clip(upper=ly1_safe.loc[valid])
+ # Ensure valid box
+ xinv = (
+ new_annotations_df.loc[valid, "xmin"]
+ >= new_annotations_df.loc[valid, "xmax"]
+ )
+ yinv = (
+ new_annotations_df.loc[valid, "ymin"]
+ >= new_annotations_df.loc[valid, "ymax"]
+ )
+ if xinv.any():
+ idx = new_annotations_df.index[valid][xinv]
+ mid = (
+ pd.to_numeric(
+ new_annotations_df.loc[idx, "xmin"], errors="coerce"
+ )
+ + pd.to_numeric(
+ new_annotations_df.loc[idx, "xmax"], errors="coerce"
+ )
+ ) / 2
+ new_annotations_df.loc[idx, "xmin"] = (mid - _eps).clip(0, 1)
+ new_annotations_df.loc[idx, "xmax"] = (mid + _eps).clip(0, 1)
+ if yinv.any():
+ idx = new_annotations_df.index[valid][yinv]
+ mid = (
+ pd.to_numeric(
+ new_annotations_df.loc[idx, "ymin"], errors="coerce"
+ )
+ + pd.to_numeric(
+ new_annotations_df.loc[idx, "ymax"], errors="coerce"
+ )
+ ) / 2
+ new_annotations_df.loc[idx, "ymin"] = (mid - _eps).clip(0, 1)
+ new_annotations_df.loc[idx, "ymax"] = (mid + _eps).clip(0, 1)
+ else:
+ # No line bounds: cap ymax only so no box spans to bottom
+ ymax_vals = pd.to_numeric(new_annotations_df["ymax"], errors="coerce")
+ need_cap = ymax_vals >= 1.0
+ if need_cap.any():
+ new_annotations_df = new_annotations_df.copy()
+ new_annotations_df.loc[need_cap, "ymax"] = ymax_vals.loc[
+ need_cap
+ ].clip(upper=1.0 - _eps)
+ ymin_vals = pd.to_numeric(
+ new_annotations_df.loc[need_cap, "ymin"], errors="coerce"
+ )
+ invalid = (
+ new_annotations_df.loc[need_cap, "ymax"].values
+ <= ymin_vals.values
+ )
+ if invalid.any():
+ idx = new_annotations_df.index[need_cap][invalid]
+ new_annotations_df.loc[idx, "ymax"] = (
+ pd.to_numeric(
+ new_annotations_df.loc[idx, "ymin"], errors="coerce"
+ )
+ + _eps
+ )
+
+ progress(0.3, desc="Checking for adjacent annotations to merge...")
+ new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
+
+ progress(0.4, desc="Creating new redaction IDs...")
+ existing_ids = (
+ set(existing_annotations_df["id"].dropna())
+ if "id" in existing_annotations_df.columns
+ else set()
+ )
+ num_new_ids = len(new_annotations_df)
+ new_id_list = _generate_unique_ids(num_new_ids, existing_ids)
+ new_annotations_df["id"] = new_id_list
+
+ annotation_cols = [
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "text",
+ "id",
+ ]
+ new_annotations_df = new_annotations_df[annotation_cols]
+
+ key_cols = ["page", "label", "xmin", "ymin", "xmax", "ymax", "text"]
+
+ progress(0.5, desc="Checking for duplicate redactions")
+
+ if existing_annotations_df.empty or not all(
+ col in existing_annotations_df.columns for col in key_cols
+ ):
+ unique_new_df = new_annotations_df
+ else:
+ # Ensure that columns of both sides have the same type
+ new_annotations_df.loc[:, key_cols] = new_annotations_df.loc[
+ :, key_cols
+ ].astype(existing_annotations_df.loc[:, key_cols].dtypes)
+
+ # Do not add duplicate redactions
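+ # Anti-join: keep only new annotations whose (page, label, coords, text) combination is not already present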
+ merged = pd.merge(
+ new_annotations_df,
+ existing_annotations_df[key_cols].drop_duplicates(),
+ on=key_cols,
+ how="left",
+ indicator=True,
+ )
+ unique_new_df = merged[merged["_merge"] == "left_only"].drop(
+ columns=["_merge"]
+ )
+
+ print(f"Found {len(unique_new_df)} new unique annotations to add.")
+ gr.Info(f"Found {len(unique_new_df)} new unique annotations to add.")
+ # Filter out empty DataFrames before concatenation to avoid FutureWarning
+ dfs_to_concat = [
+ df for df in [existing_annotations_df, unique_new_df] if not df.empty
+ ]
+ if dfs_to_concat:
+ updated_annotations_df = _concat_frames_without_all_na_warning(
+ dfs_to_concat, ignore_index=True
+ )
+ else:
+ # Return empty DataFrame with expected columns matching existing_annotations_df structure
+ updated_annotations_df = pd.DataFrame(
+ columns=[
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "xmax",
+ "ymin",
+ "ymax",
+ "text",
+ "id",
+ ]
+ )
+
+ # --- Part 4: Convert final DataFrame to list-of-dicts ---
+ updated_recogniser_entity_df = pd.DataFrame()
+ if not updated_annotations_df.empty:
+ updated_recogniser_entity_df = updated_annotations_df[
+ ["page", "label", "text", "id"]
+ ]
+
+ if not page_sizes:
+ print("Warning: page_sizes is empty. No pages to process.")
+ return (
+ [],
+ existing_annotations_list,
+ pd.DataFrame(),
+ existing_annotations_df,
+ pd.DataFrame(),
+ existing_recogniser_entity_df,
+ )
+
+ # Always derive image paths from page using current page_sizes, so that
+ # updated_annotations_df never has None/missing image when page is valid
+ # (e.g. after copy from existing_annotations_df or concat with unique_new_df).
+
+ all_pages_df = pd.DataFrame(page_sizes).rename(columns={"image_path": "image"})
+
+ # Join image paths to updated_annotations_df based on page number
+ # Drop image column from updated_annotations_df
+ updated_annotations_df = updated_annotations_df.drop(columns=["image"])
+
+ # Ensure 'page' is an integer on both sides so the merge keys match
+ updated_annotations_df["page"] = updated_annotations_df["page"].astype(int)
+ all_pages_df["page"] = all_pages_df["page"].astype(int)
+
+ updated_annotations_df = pd.merge(
+ updated_annotations_df, all_pages_df[["page", "image"]], on="page", how="left"
+ )
+
+ if not updated_annotations_df.empty and "page" in updated_annotations_df.columns:
+ missing_image = updated_annotations_df["image"].isna()
+ if missing_image.any():
+ n_missing = missing_image.sum()
+ print(
+ f"Warning: {n_missing} annotation(s) have page not in page_sizes; "
+ "they will not appear in output. Dropping them from updated_annotations_df."
+ )
+ updated_annotations_df = updated_annotations_df.loc[~missing_image].copy()
+ # Keep recogniser entity in sync with possibly trimmed annotations
+ if not updated_annotations_df.empty:
+ updated_recogniser_entity_df = updated_annotations_df[
+ ["page", "label", "text", "id"]
+ ]
+ else:
+ updated_recogniser_entity_df = pd.DataFrame()
+
+ if not updated_annotations_df.empty:
+ merged_df = pd.merge(
+ all_pages_df[["image"]], updated_annotations_df, on="image", how="left"
+ )
+ else:
+ merged_df = all_pages_df[["image"]]
+
+ # 1. Get the list of image paths in the exact order they appear in page_sizes.
+ # all_pages_df was created from page_sizes, so it preserves this order.
+ image_order = all_pages_df["image"].tolist()
+
+ # 2. Convert the 'image' column to a special 'Categorical' type.
+ # This tells pandas that this column has a custom, non-alphabetical order.
+ merged_df["image"] = pd.Categorical(
+ merged_df["image"], categories=image_order, ordered=True
+ )
+
+ # 3. Sort the DataFrame based on this new custom order.
+ merged_df = merged_df.sort_values("image")
+
+ final_annotations_list = list()
+ box_cols = ["label", "color", "xmin", "ymin", "xmax", "ymax", "text", "id"]
+
+ # Process each (image_path, group) in parallel; preserve order via index.
+ group_items = [
+ (i, image_path, group)
+ for i, (image_path, group) in enumerate(
+ merged_df.groupby("image", sort=False, observed=False)
+ )
+ ]
+
+ def _process_one_group(item):
+ _i, _image_path, _group = item
+ if pd.isna(_group.iloc[0].get("id")):
+ _boxes = list()
+ else:
+ _valid_box_cols = [col for col in box_cols if col in _group.columns]
+ _sorted_group = _group.sort_values(by=["ymin", "xmin"]).copy()
+ # Ensure coord columns have no NaN so image_annotator preprocess_boxes doesn't raise TypeError
+ for coord in ["xmin", "ymin", "xmax", "ymax"]:
+ if coord in _sorted_group.columns:
+ _sorted_group[coord] = pd.to_numeric(
+ _sorted_group[coord], errors="coerce"
+ ).fillna(0.0)
+ _boxes = _sorted_group[_valid_box_cols].to_dict("records")
+ return (_i, {"image": _image_path, "boxes": _boxes})
+
+ if group_items:
+ n_groups = len(group_items)
+ max_workers = min(MAX_WORKERS, n_groups)
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ ordered_results = sorted(
+ executor.map(_process_one_group, group_items), key=lambda x: x[0]
+ )
+ final_annotations_list = [r[1] for r in ordered_results]
+
+ progress(1.0, desc="Completed annotation processing")
+
+ return (
+ final_annotations_list,
+ existing_annotations_list,
+ updated_annotations_df,
+ existing_annotations_df,
+ updated_recogniser_entity_df,
+ existing_recogniser_entity_df,
+ )
+
+
+def exclude_selected_items_from_redaction(
+ review_df: pd.DataFrame,
+ selected_rows_df: pd.DataFrame,
+ image_file_paths: List[str],
+ page_sizes: List[dict],
+ image_annotations_state: dict,
+ recogniser_entity_dataframe_base: pd.DataFrame,
+):
+ """
+ Remove the rows selected in selected_rows_df from the review dataframe and the annotation object, returning the updated objects along with backups of the previous state.
+ """
+
+ backup_review_state = review_df
+ backup_image_annotations_state = image_annotations_state
+ backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
+
+ if not selected_rows_df.empty and not review_df.empty:
+ use_id = (
+ "id" in selected_rows_df.columns
+ and "id" in review_df.columns
+ and not selected_rows_df["id"].isnull().all()
+ and not review_df["id"].isnull().all()
+ )
+
+ selected_merge_cols = ["id"] if use_id else ["label", "page", "text"]
+
+ # Subset and drop duplicates from selected_rows_df
+ selected_subset = selected_rows_df[selected_merge_cols].drop_duplicates(
+ subset=selected_merge_cols
+ )
+
+ # Perform anti-join using merge with indicator
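+ # Rows matching a selected row come back with _merge == "both" and are dropped; "left_only" rows are kept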
+ merged_df = review_df.merge(
+ selected_subset, on=selected_merge_cols, how="left", indicator=True
+ )
+ out_review_df = merged_df[merged_df["_merge"] == "left_only"].drop(
+ columns=["_merge"]
+ )
+
+ out_image_annotations_state = convert_review_df_to_annotation_json(
+ out_review_df, image_file_paths, page_sizes
+ )
+
+ out_recogniser_entity_dataframe_base = out_review_df[
+ ["page", "label", "text", "id"]
+ ]
+
+ # Either there is nothing left in the selection dataframe, or the review dataframe
+ else:
+ out_review_df = review_df
+ out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
+ out_image_annotations_state = image_annotations_state
+
+ return (
+ out_review_df,
+ out_image_annotations_state,
+ out_recogniser_entity_dataframe_base,
+ backup_review_state,
+ backup_image_annotations_state,
+ backup_recogniser_entity_dataframe_base,
+ )
+
+
+def replace_annotator_object_img_np_array_with_page_sizes_image_path(
+ all_image_annotations: List[dict],
+ page_image_annotator_object: AnnotatedImageData,
+ page_sizes: List[dict],
+ page: int,
+ page_sizes_df: pd.DataFrame = None,
+):
+ """
+ Check whether the image value in an AnnotatedImageData dict is a placeholder or an np.array. If so, replace it with the file path of the image already loaded into the app for this page.
+ """
+ page_zero_index = page - 1
+
+ if (
+ isinstance(all_image_annotations[page_zero_index]["image"], np.ndarray)
+ or "placeholder_image"
+ in str(all_image_annotations[page_zero_index].get("image", ""))
+ or isinstance(page_image_annotator_object.get("image"), np.ndarray)
+ ):
+ if page_sizes_df is None or page_sizes_df.empty:
+ page_sizes_df = pd.DataFrame(page_sizes)
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(
+ pd.to_numeric, errors="coerce"
+ )
+
+ # Check for matching pages (single .loc)
+ matching_paths = page_sizes_df.loc[
+ page_sizes_df["page"] == page, "image_path"
+ ].unique()
+
+ if matching_paths.size > 0:
+ image_path = matching_paths[0]
+ page_image_annotator_object["image"] = image_path
+ all_image_annotations[page_zero_index]["image"] = image_path
+ else:
+ print(f"No image path found for page {page}.")
+
+ return page_image_annotator_object, all_image_annotations
+
+
+def replace_placeholder_image_with_real_image(
+ doc_full_file_name_textbox: str,
+ current_image_path: str,
+ page_sizes_df: pd.DataFrame,
+ page_num_reported: int,
+ input_folder: str,
+):
+ """If image path is still not valid, load in a new image an overwrite it. Then replace all items in the image annotation object for all pages based on the updated information."""
+ if page_num_reported <= 0:
+ page_num_reported = 1
+
+ page_num_reported_zero_indexed = page_num_reported - 1
+
+ # Compute mask once to avoid repeated boolean indexing over the full DataFrame
+ if "page" not in page_sizes_df.columns:
+ page_mask = pd.Series(False, index=page_sizes_df.index)
+ else:
+ page_col = pd.to_numeric(page_sizes_df["page"], errors="coerce")
+ page_mask = page_col == page_num_reported
+
+ if not os.path.exists(current_image_path):
+
+ page_num, replaced_image_path, width, height = (
+ process_single_page_for_image_conversion(
+ doc_full_file_name_textbox,
+ page_num_reported_zero_indexed,
+ input_folder=input_folder,
+ )
+ )
+
+ page_sizes_df.loc[page_mask, "image_width"] = width
+ page_sizes_df.loc[page_mask, "image_height"] = height
+ page_sizes_df.loc[page_mask, "image_path"] = replaced_image_path
+
+ else:
+ if page_mask.any():
+ width_vals = page_sizes_df.loc[page_mask, "image_width"]
+ if not width_vals.isnull().all():
+ width = width_vals.max()
+ height = page_sizes_df.loc[page_mask, "image_height"].max()
+ else:
+ image = Image.open(current_image_path)
+ width = image.width
+ height = image.height
+ page_sizes_df.loc[page_mask, "image_width"] = width
+ page_sizes_df.loc[page_mask, "image_height"] = height
+ else:
+ width = height = None
+
+ page_sizes_df.loc[page_mask, "image_path"] = current_image_path
+ replaced_image_path = current_image_path
+
+ return replaced_image_path, page_sizes_df
+
+
+def update_annotator_object_and_filter_df(
+ all_image_annotations: List[AnnotatedImageData],
+ gradio_annotator_current_page_number: int,
+ recogniser_entities_dropdown_value: str = "ALL",
+ page_dropdown_value: str = "ALL",
+ page_dropdown_redaction_value: str = "1",
+ text_dropdown_value: str = "ALL",
+ recogniser_dataframe_base: pd.DataFrame = None, # Simplified default
+ zoom: int = 100,
+ review_df: pd.DataFrame = None, # Use None for default empty DataFrame
+ page_sizes: List[dict] = list(),
+ doc_full_file_name_textbox: str = "",
+ input_folder: str = INPUT_FOLDER,
+ ) -> Tuple[
+ AnnotatedImageData,
+ int,
+ int,
+ int,
+ str,
+ pd.DataFrame,
+ pd.DataFrame,
+ List[str],
+ List[str],
+ dict,
+ List[dict],
+ List[AnnotatedImageData],
+ ]:
+ """
+ Update a gradio_image_annotation object with new annotation data for the current page
+ and update filter dataframes, optimizing by processing only the current page's data for display.
+
+ Args:
+ all_image_annotations (List[AnnotatedImageData]): All image annotation objects to process.
+ gradio_annotator_current_page_number (int): The current page number as selected in the annotator.
+ recogniser_entities_dropdown_value (str, optional): Value for the recogniser dropdown filter. Defaults to "ALL".
+ page_dropdown_value (str, optional): Value for the page dropdown filter. Defaults to "ALL".
+ page_dropdown_redaction_value (str, optional): Value for the redaction page dropdown filter. Defaults to "1".
+ text_dropdown_value (str, optional): Value for the text dropdown filter. Defaults to "ALL".
+ recogniser_dataframe_base (pd.DataFrame, optional): The base recogniser dataframe. Defaults to None.
+ zoom (int, optional): Zoom level for display in the annotator. Defaults to 100.
+ review_df (pd.DataFrame, optional): Review DataFrame containing annotation boxes. Defaults to None.
+ page_sizes (List[dict], optional): List of dictionaries containing page size information. Defaults to empty list.
+ doc_full_file_name_textbox (str, optional): Full file name shown in the textbox. Defaults to empty string.
+ input_folder (str, optional): Path to the input folder. Defaults to INPUT_FOLDER.
+
+ Returns:
+ Tuple[
+ image_annotator,
+ int,
+ int,
+ int,
+ str,
+ pd.DataFrame,
+ pd.DataFrame,
+ List[str],
+ List[str],
+ dict,
+ List[dict],
+ List[AnnotatedImageData],
+ ]: Updated Gradio components (including the gr.update for the redaction page dropdown) and relevant page annotations.
+ """
+
+ str(zoom) + "%"
+
+ # Handle default empty review_df and recogniser_dataframe_base
+ if review_df is None or not isinstance(review_df, pd.DataFrame):
+ review_df = pd.DataFrame(
+ columns=[
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "text",
+ "id",
+ ]
+ )
+ if recogniser_dataframe_base is None: # Create a simple default if None
+ recogniser_dataframe_base = pd.DataFrame(
+ columns=["page", "label", "text", "id"]
+ )
+
+ # Handle empty all_image_annotations state early
+ if not all_image_annotations:
+ print("No all_image_annotation object found")
+ # Return blank/default outputs
+
+ blank_annotator = None
+ blank_df_out_gr = pd.DataFrame(columns=["page", "label", "text", "id"])
+ blank_df_modified = pd.DataFrame(columns=["page", "label", "text", "id"])
+
+ return (
+ blank_annotator,
+ 1,
+ 1,
+ 1,
+ recogniser_entities_dropdown_value,
+ blank_df_out_gr,
+ blank_df_modified,
+ [],
+ [],
+ [],
+ [],
+ [],
+ ) # Return empty lists/defaults for other outputs
+
+ # Validate and bound the current page number (1-based logic)
+ page_num_reported = max(
+ 1, gradio_annotator_current_page_number
+ ) # Minimum page is 1
+ page_max_reported = len(all_image_annotations)
+ if page_num_reported > page_max_reported:
+ page_num_reported = page_max_reported
+
+ page_num_reported_zero_indexed = page_num_reported - 1
+
+ # --- Process page sizes DataFrame ---
+ page_sizes_df = pd.DataFrame(page_sizes)
+ if not page_sizes_df.empty:
+ page_sizes_df["page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
+ page_sizes_df.dropna(subset=["page"], inplace=True)
+ if not page_sizes_df.empty:
+ page_sizes_df["page"] = page_sizes_df["page"].astype(int)
+ else:
+ print("Warning: Page sizes DataFrame became empty after processing.")
+
+ # --- Handle Image Path Replacement for the Current Page ---
+ if len(all_image_annotations) > page_num_reported_zero_indexed:
+
+ page_object_to_update = all_image_annotations[page_num_reported_zero_indexed]
+
+ # Use the helper function to replace the image path within the page object
+ updated_page_object, all_image_annotations_after_img_replace = (
+ replace_annotator_object_img_np_array_with_page_sizes_image_path(
+ all_image_annotations,
+ page_object_to_update,
+ page_sizes,
+ page_num_reported,
+ page_sizes_df=page_sizes_df,
+ )
+ )
+
+ all_image_annotations = all_image_annotations_after_img_replace
+
+ # Now handle the actual image file path replacement using replace_placeholder_image_with_real_image
+ current_image_path = updated_page_object.get(
+ "image"
+ ) # Get potentially updated image path
+
+ if current_image_path and not page_sizes_df.empty:
+ try:
+ replaced_image_path, page_sizes_df = (
+ replace_placeholder_image_with_real_image(
+ doc_full_file_name_textbox,
+ current_image_path,
+ page_sizes_df,
+ page_num_reported, # Use 1-based page number
+ input_folder=input_folder,
+ )
+ )
+
+ # Update the image path in the state and review_df for the current page
+ # Find the correct entry in all_image_annotations list again by index
+ if len(all_image_annotations) > page_num_reported_zero_indexed:
+ all_image_annotations[page_num_reported_zero_indexed][
+ "image"
+ ] = replaced_image_path
+
+ # Update review_df's image path for this page
+ if "page" in review_df.columns and "image" in review_df.columns:
+ if not pd.api.types.is_numeric_dtype(review_df["page"]):
+ review_df["page"] = (
+ pd.to_numeric(review_df["page"], errors="coerce")
+ .fillna(-1)
+ .astype(int)
+ )
+ review_df.loc[review_df["page"] == page_num_reported, "image"] = (
+ replaced_image_path
+ )
+
+ except Exception as e:
+ print(
+ f"Error during image path replacement for page {page_num_reported}: {e}"
+ )
+ else:
+ print(
+ f"Warning: Page index {page_num_reported_zero_indexed} out of bounds for all_image_annotations list."
+ )
+
+ # Save back page_sizes_df to page_sizes list format
+ if not page_sizes_df.empty:
+ page_sizes = page_sizes_df.to_dict(orient="records")
+ else:
+ page_sizes = list() # Ensure page_sizes is a list if df is empty
+
+ # --- Prepare data *only* for the current page for display ---
+ current_page_image_annotator_object = None
+ if len(all_image_annotations) > page_num_reported_zero_indexed:
+ page_data_for_display = all_image_annotations[page_num_reported_zero_indexed]
+
+ # Convert current page annotations list to DataFrame for coordinate multiplication IF needed
+ # Assuming coordinate multiplication IS needed for display if state stores relative coords
+ current_page_annotations_df = convert_annotation_data_to_dataframe(
+ [page_data_for_display]
+ )
+
+ if not current_page_annotations_df.empty and not page_sizes_df.empty:
+ # Multiply coordinates *only* for this page's DataFrame (reuse single filter)
+ try:
+ page_size_row = page_sizes_df[
+ page_sizes_df["page"] == page_num_reported
+ ]
+ if not page_size_row.empty:
+ current_page_annotations_df = multiply_coordinates_by_page_sizes(
+ current_page_annotations_df,
+ page_size_row,
+ xmin="xmin",
+ xmax="xmax",
+ ymin="ymin",
+ ymax="ymax",
+ )
+ except Exception as e:
+ print(
+ f"Warning: Error during coordinate multiplication for page {page_num_reported}: {e}. Using original coordinates."
+ )
+ # If error, proceed with original coordinates or handle as needed
+
+ if "color" not in current_page_annotations_df.columns:
+ # Preserve user-defined box colours from review_df (e.g. after Apply redactions)
+ if (
+ review_df is not None
+ and not review_df.empty
+ and "color" in review_df.columns
+ and "page" in review_df.columns
+ ):
+ review_page = pd.to_numeric(review_df["page"], errors="coerce")
+ page_mask = review_page == page_num_reported
+ review_page_df = review_df.loc[page_mask].copy()
+ if (
+ not review_page_df.empty
+ and "id" in review_page_df.columns
+ and "id" in current_page_annotations_df.columns
+ ):
+ # Match by id so colours stay with the right box
+ id_to_color = review_page_df.set_index("id")["color"].to_dict()
+ current_page_annotations_df["color"] = current_page_annotations_df[
+ "id"
+ ].map(lambda x: id_to_color.get(x, CUSTOM_BOX_COLOUR))
+ elif not review_page_df.empty and len(review_page_df) == len(
+ current_page_annotations_df
+ ):
+ current_page_annotations_df["color"] = review_page_df[
+ "color"
+ ].values
+ else:
+ current_page_annotations_df["color"] = CUSTOM_BOX_COLOUR
+ else:
+ current_page_annotations_df["color"] = CUSTOM_BOX_COLOUR
+ # gradio_image_annotation JS expects colour as string (e.g. .startsWith("rgba"))
+ current_page_annotations_df["color"] = current_page_annotations_df[
+ "color"
+ ].apply(_ensure_box_colour_string)
+
+ # Ensure coord columns have no NaN/None so image_annotator preprocess_boxes doesn't raise TypeError
+ coord_cols = ["xmin", "xmax", "ymin", "ymax"]
+ for col in coord_cols:
+ if col in current_page_annotations_df.columns:
+ current_page_annotations_df[col] = pd.to_numeric(
+ current_page_annotations_df[col], errors="coerce"
+ ).fillna(0.0)
+
+ # Convert the processed DataFrame back to the list of dicts format for the annotator
+ processed_current_page_annotations_list = current_page_annotations_df[
+ ["xmin", "xmax", "ymin", "ymax", "label", "color", "text", "id"]
+ ].to_dict(orient="records")
+
+ # Construct the final object expected by the Gradio ImageAnnotator value parameter
+ current_page_image_annotator_object: AnnotatedImageData = {
+ "image": page_data_for_display.get(
+ "image"
+ ), # Use the (potentially updated) image path
+ "boxes": processed_current_page_annotations_list,
+ }
+
+ # --- Update Dropdowns and Review DataFrame ---
+ try:
+ (
+ recogniser_entities_list,
+ recogniser_dataframe_out_gr,
+ recogniser_dataframe_modified,
+ recogniser_entities_dropdown_value,
+ text_entities_drop,
+ page_entities_drop,
+ ) = update_recogniser_dataframes(
+ all_image_annotations, # Pass the updated full state
+ recogniser_dataframe_base,
+ recogniser_entities_dropdown_value,
+ text_dropdown_value,
+ page_dropdown_value,
+ review_df.copy(), # Keep the copy as per original function call
+ page_sizes, # Pass updated page sizes
+ )
+ # Generate default colors for labels (library expects hex string or RGB tuple; tuples are converted to hex)
+ [CUSTOM_BOX_COLOUR for _ in range(len(recogniser_entities_list))]
+
+ except Exception as e:
+ print(
+ f"Error calling update_recogniser_dataframes: {e}. Returning empty/default filter data."
+ )
+ recogniser_entities_list = list()
+ recogniser_dataframe_out_gr = pd.DataFrame(
+ columns=["page", "label", "text", "id"]
+ )
+ recogniser_dataframe_modified = pd.DataFrame(
+ columns=["page", "label", "text", "id"]
+ )
+ text_entities_drop = list()
+ page_entities_drop = list()
+
+ # --- Final Output Components ---
+ page_number_update = (
+ gr.update(value=page_num_reported, maximum=len(page_sizes)) if page_sizes else 0
+ )
+
+ ### Present image_annotator outputs
+ # Handle the case where current_page_image_annotator_object couldn't be prepared
+ if current_page_image_annotator_object is None:
+ # This should ideally be covered by the initial empty check for all_image_annotations,
+ # but as a safeguard:
+ print("Warning: Could not prepare annotator object for the current page.")
+ out_image_annotator = None
+ else:
+ if current_page_image_annotator_object["image"].startswith("placeholder_image"):
+ current_page_image_annotator_object["image"], page_sizes_df = (
+ replace_placeholder_image_with_real_image(
+ doc_full_file_name_textbox,
+ current_page_image_annotator_object["image"],
+ page_sizes_df,
+ gradio_annotator_current_page_number,
+ input_folder,
+ )
+ )
+
+ out_image_annotator = current_page_image_annotator_object
+
+ page_entities_drop_redaction_list = ["ALL"]
+ all_pages_in_doc_list = [str(i) for i in range(1, len(page_sizes) + 1)]
+ page_entities_drop_redaction_list.extend(all_pages_in_doc_list)
+
+ return (
+ out_image_annotator,
+ page_number_update,
+ page_number_update, # Redundant, but matches original return signature
+ page_num_reported, # Plain integer value
+ recogniser_entities_dropdown_value,
+ recogniser_dataframe_out_gr,
+ recogniser_dataframe_modified,
+ text_entities_drop, # List of text entities for dropdown
+ page_entities_drop, # List of page numbers for dropdown
+ gr.update(
+ value=page_dropdown_redaction_value,
+ choices=page_entities_drop_redaction_list,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ page_sizes, # Updated page_sizes list
+ all_image_annotations,
+ ) # Return the updated full state
+
+
+def update_all_page_annotation_object_based_on_previous_page(
+ page_image_annotator_object: AnnotatedImageData,
+ current_page: int,
+ previous_page: int,
+ all_image_annotations: List[AnnotatedImageData],
+ page_sizes: List[dict] = list(),
+ clear_all: bool = False,
+):
+ """
+ Overwrite image annotations on the page we are moving from with modifications.
+
+ Converts annotator output coordinates to relative (0-1) before storing, so that
+ manually added boxes (which the annotator returns in display/canvas pixel space)
+ are stored consistently with existing boxes. Without this, new boxes would be
+ misplaced on the next display (shifted and scaled incorrectly).
+ """
+
+ if current_page > len(page_sizes):
+ raise Warning("Selected page is higher than last page number")
+ elif current_page <= 0:
+ raise Warning("Selected page is lower than first page")
+
+ previous_page_zero_index = previous_page - 1
+
+ if not current_page:
+ current_page = 1
+
+ # Derive image_height, image_width from image shape; orientation from object
+ img = page_image_annotator_object.get("image")
+ image_height = page_image_annotator_object.get("image_height")
+ image_width = page_image_annotator_object.get("image_width")
+ if isinstance(img, np.ndarray) and img.size > 0:
+ # shape is (height, width, channels)
+ image_height = int(img.shape[0])
+ image_width = int(img.shape[1])
+ page_image_annotator_object["image_height"] = image_height
+ page_image_annotator_object["image_width"] = image_width
+ orientation = page_image_annotator_object.get("orientation")
+
+ # Transform box coordinates from current orientation back to orientation 0.
+ # Matches component: 90° CW new_x = H - old_y, new_y = old_x; 90° CCW new_x = old_y, new_y = W - old_x.
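+ # Illustrative example (assumed values): with orientation 1 (90° CW) and H=800, a display box
+ # (xmin_d=50, ymin_d=100, xmax_d=150, ymax_d=200) maps back to xmin=100, xmax=200, ymin=650, ymax=750.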
+ if (
+ orientation in (1, 2, 3)
+ and image_height is not None
+ and image_width is not None
+ ):
+ W, H = image_width, image_height
+ boxes = page_image_annotator_object.get("boxes") or []
+ for box in boxes:
+ xmin_d, ymin_d = box["xmin"], box["ymin"]
+ xmax_d, ymax_d = box["xmax"], box["ymax"]
+ if orientation == 1:
+ # 90° CW reverse: old_x = y_d, old_y = H - x_d
+ box["xmin"] = ymin_d
+ box["xmax"] = ymax_d
+ box["ymin"] = H - xmax_d
+ box["ymax"] = H - xmin_d
+ elif orientation == 2:
+ # 180° CW reverse: old_x = W - x_d, old_y = H - y_d
+ box["xmin"] = W - xmax_d
+ box["xmax"] = W - xmin_d
+ box["ymin"] = H - ymax_d
+ box["ymax"] = H - ymin_d
+ elif orientation == 3:
+ # 270° CW (90° CCW) reverse: old_x = W - y_d, old_y = x_d
+ box["xmin"] = W - ymax_d
+ box["xmax"] = W - ymin_d
+ box["ymin"] = xmin_d
+ box["ymax"] = xmax_d
+ page_image_annotator_object["orientation"] = 0
+
+ # This replaces the numpy array image object with the image file path
+ page_image_annotator_object, all_image_annotations = (
+ replace_annotator_object_img_np_array_with_page_sizes_image_path(
+ all_image_annotations,
+ page_image_annotator_object,
+ page_sizes,
+ previous_page,
+ )
+ )
+
+ if clear_all is False:
+ all_image_annotations[previous_page_zero_index] = page_image_annotator_object
+ else:
+ all_image_annotations[previous_page_zero_index]["boxes"] = list()
+
+ return all_image_annotations, current_page, current_page
+
+
+def _load_one_page_image_for_redact(
+ i: int,
+ all_image_annotations: List[AnnotatedImageData],
+ page_to_image_path: Dict[int, str],
+ input_folder: str,
+ file_name_with_ext: str,
+) -> Tuple[int, object, bool]:
+ """
+ Load (and optionally save) the image for page i. Safe to run in a thread.
+ Returns (page_index, image, should_close). Caller must close image if should_close.
+ """
+ image_loc = all_image_annotations[i]["image"]
+ should_close = False
+ image = None
+ if isinstance(image_loc, np.ndarray):
+ image = Image.fromarray(image_loc.astype("uint8"))
+ should_close = True
+ elif isinstance(image_loc, Image.Image):
+ image = image_loc
+ elif isinstance(image_loc, str):
+ path = image_loc
+ if not os.path.exists(path):
+ path = page_to_image_path.get(i + 1, path)
+ try:
+ image = Image.open(path)
+ should_close = True
+ except Exception:
+ image = None
+ if image is not None and hasattr(image, "save"):
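+ # Save a copy under the expected per-page image name if it doesn't already exist
+ # (e.g. "<input_folder>/document.pdf_3.png" for zero-indexed page 3; illustrative name)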
+ expected_path = os.path.join(input_folder, f"{file_name_with_ext}_{i}.png")
+ if not os.path.exists(expected_path):
+ try:
+ image.save(expected_path)
+ except Exception:
+ pass
+ return (i, image, should_close)
+
+
+def apply_redactions_to_review_df_and_files(
+ page_image_annotator_object: AnnotatedImageData,
+ file_paths: List[str],
+ doc: Document,
+ all_image_annotations: List[AnnotatedImageData],
+ current_page: int,
+ review_file_state: pd.DataFrame,
+ output_folder: str = OUTPUT_FOLDER,
+ save_pdf: bool = True,
+ page_sizes: List[dict] = list(),
+ input_folder: str = INPUT_FOLDER,
+ COMPRESS_REDACTED_PDF: bool = COMPRESS_REDACTED_PDF,
+ progress=gr.Progress(track_tqdm=True),
+):
+ """
+ Applies the modified redaction annotations from the UI to the PyMuPDF document
+ and exports the updated review files, including the redacted PDF and associated logs.
+
+ Args:
+ page_image_annotator_object (AnnotatedImageData): The annotation data for the current page,
+ potentially including user modifications.
+ file_paths (List[str]): A list of file paths associated with the document, typically
+ including the original PDF and any generated image paths.
+ doc (Document): The PyMuPDF Document object representing the PDF file.
+ all_image_annotations (List[AnnotatedImageData]): A list containing annotation data
+ for all pages of the document.
+ current_page (int): The 1-based index of the page currently being processed or viewed.
+ review_file_state (pd.DataFrame): A Pandas DataFrame holding the current state of
+ redaction reviews, reflecting user selections.
+ output_folder (str, optional): The directory where output files (redacted PDFs,
+ log files) will be saved. Defaults to OUTPUT_FOLDER.
+ save_pdf (bool, optional): If True, the redacted PDF will be saved. Defaults to True.
+ page_sizes (List[dict], optional): A list of dictionaries, each containing size
+ information (e.g., width, height) for a page.
+ Defaults to an empty list.
+ input_folder (str, optional): The directory where input files are located and where
+ page images should be saved. Defaults to INPUT_FOLDER.
+ COMPRESS_REDACTED_PDF (bool, optional): If True, the output PDF will be compressed.
+ Defaults to COMPRESS_REDACTED_PDF.
+ progress (gr.Progress, optional): Gradio progress object for tracking task progress.
+ Defaults to gr.Progress(track_tqdm=True).
+
+ Memory notes:
+ - With RETURN_PDF_FOR_REVIEW, two full PyMuPDF documents are held by default while
+ applying redactions; set env TWO_PASS_REVIEW_PDF_LOW_MEMORY=True to process the
+ final and review PDFs in two sequential passes (lower peak RAM, ~2x apply work).
+ - Parallel review-CSV build holds chunk DataFrames until concat; del partial_dfs
+ after concat reduces peak slightly.
+
+ Returns:
+ Tuple[Document, List[AnnotatedImageData], List[str], List[str], pd.DataFrame]:
+ - doc: The updated PyMuPDF Document object (potentially redacted).
+ - all_image_annotations: The updated list of all image annotations.
+ - output_files: A list of paths to the generated output files (e.g., redacted PDF).
+ - output_log_files: A list of paths to any generated log files.
+ - review_df: The final Pandas DataFrame representing the review state.
+ """
+
+ output_files = list()
+ output_log_files = list()
+ review_df = review_file_state
+
+ # Always use the provided input_folder parameter
+ # This ensures images are created in the specified input folder, not in example_data
+
+ page_image_annotator_object = all_image_annotations[current_page - 1]
+
+ # This replaces the numpy array image object with the image file path
+ page_image_annotator_object, all_image_annotations = (
+ replace_annotator_object_img_np_array_with_page_sizes_image_path(
+ all_image_annotations, page_image_annotator_object, page_sizes, current_page
+ )
+ )
+ page_image_annotator_object["image"] = all_image_annotations[current_page - 1][
+ "image"
+ ]
+
+ if not page_image_annotator_object:
+ print("No image annotations object found for page")
+ return doc, all_image_annotations, output_files, output_log_files, review_df
+
+ if isinstance(file_paths, str):
+ file_paths = [file_paths]
+
+ # Remove empty/blank entries that give meaningless file_extension = ""
+ file_paths = [fp for fp in file_paths if fp and fp.strip()]
+
+ # If file_paths is still empty, try to recover the source path from the
+ # PyMuPDF Document that was passed in (pdf_doc_state). This handles the
+ # common case where doc_full_file_name_textbox is blank because the Review
+ # tab was populated programmatically (not via a user upload).
+ if not file_paths and hasattr(doc, "name") and doc.name:
+ recovered = doc.name
+ if os.path.isfile(recovered):
+ print(
+ f"file_paths was empty; recovering source path from doc.name: {recovered}"
+ )
+ file_paths = [recovered]
+
+ if not file_paths:
+ print("No valid file paths found. Cannot apply redactions.")
+ return doc, all_image_annotations, output_files, output_log_files, review_df
+
+ def _run_apply_redactions_loop(file_paths_to_process):
+ _out_files = []
+ _out_log_files = []
+ _review_df = review_file_state
+ for file_path in file_paths_to_process:
+ pdf_doc = None
+ review_pdf_doc = None
+ number_of_pages = 0
+ _tmp_pdf_path = None
+ _profile_page_times = []
+ _profile_image_times = []
+ file_name_without_ext = get_file_name_without_type(file_path)
+ file_name_with_ext = os.path.basename(file_path)
+ use_two_pass_pdf = False
+
+ file_extension = os.path.splitext(file_path)[1].lower()
+
+ # If the UI passed only a review CSV (e.g. after duplicate-pages flow),
+ # resolve the corresponding PDF so we can save the redacted output.
+ if (
+ save_pdf is True
+ and file_extension == ".csv"
+ and "_review_file" in (file_name_without_ext or "")
+ ):
+ pdf_basename = file_name_with_ext.replace("_review_file.csv", "")
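+ # e.g. a CSV named "document.pdf_review_file.csv" resolves to pdf_basename "document.pdf" (illustrative name)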
+ review_dir = os.path.dirname(file_path)
+ if not review_dir:
+ review_dir = output_folder or "."
+ candidates = [
+ os.path.join(review_dir, pdf_basename),
+ ]
+ if output_folder:
+ candidates.append(
+ (output_folder + pdf_basename)
+ if output_folder.endswith(("/", os.sep))
+ else os.path.join(output_folder, pdf_basename)
+ )
+ if input_folder:
+ candidates.append(
+ (input_folder + pdf_basename)
+ if input_folder.endswith(("/", os.sep))
+ else os.path.join(input_folder, pdf_basename)
+ )
+ for candidate in candidates:
+ if candidate and os.path.isfile(candidate):
+ file_path = candidate
+ file_name_without_ext = get_file_name_without_type(file_path)
+ file_name_with_ext = os.path.basename(file_path)
+ file_extension = os.path.splitext(file_path)[1].lower()
+ break
+
+ # Build page_sizes_df and lookups once per file (reused for PDF redaction and review CSV)
+ _t0_page_sizes = time.perf_counter() if PROFILE_REDACTION_APPLY else None
+ page_sizes_df = pd.DataFrame(page_sizes) if page_sizes else pd.DataFrame()
+ page_to_image_path = {}
+ page_to_image_dimensions = {}
+ if not page_sizes_df.empty:
+ if "page" in page_sizes_df.columns:
+ page_sizes_df = page_sizes_df.copy()
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(
+ pd.to_numeric, errors="coerce"
+ )
+ if "image_width" in page_sizes_df.columns:
+ page_sizes_df[["image_width"]] = page_sizes_df[
+ ["image_width"]
+ ].apply(pd.to_numeric, errors="coerce")
+ if "image_height" in page_sizes_df.columns:
+ page_sizes_df[["image_height"]] = page_sizes_df[
+ ["image_height"]
+ ].apply(pd.to_numeric, errors="coerce")
+ if (
+ "image_path" in page_sizes_df.columns
+ and "page" in page_sizes_df.columns
+ ):
+ sub = page_sizes_df[["page", "image_path"]].drop_duplicates("page")
+ for p, path in zip(sub["page"], sub["image_path"]):
+ if pd.notna(p):
+ page_to_image_path[int(p)] = path
+ if (
+ "page" in page_sizes_df.columns
+ and "image_width" in page_sizes_df.columns
+ and "image_height" in page_sizes_df.columns
+ ):
+ sub = page_sizes_df[
+ ["page", "image_width", "image_height"]
+ ].drop_duplicates("page")
+ for _, row in sub.iterrows():
+ p = row["page"]
+ if pd.notna(p):
+ w, h = row["image_width"], row["image_height"]
+ if pd.notna(w) and pd.notna(h):
+ page_to_image_dimensions[int(p)] = {
+ "image_width": float(w),
+ "image_height": float(h),
+ }
+ if PROFILE_REDACTION_APPLY:
+ _t_page_sizes = time.perf_counter() - _t0_page_sizes
+ else:
+ _t_page_sizes = 0.0
+
+ if save_pdf is True:
+ # If working with image docs
+ if (is_pdf(file_path) is False) & (file_extension != ".csv"):
+ image = Image.open(file_path)
+
+ draw = ImageDraw.Draw(image)
+
+ output_image_path = (
+ output_folder + file_name_without_ext + "_redacted.png"
+ )
+ for img_annotation_box in page_image_annotator_object["boxes"]:
+ coords = [
+ img_annotation_box["xmin"],
+ img_annotation_box["ymin"],
+ img_annotation_box["xmax"],
+ img_annotation_box["ymax"],
+ ]
+
+ fill = img_annotation_box["color"]
+
+ # Parse color: may be (r,g,b) tuple/list or string like "(128, 128, 128)" / "[128 128 128]"
+ if not isinstance(fill, tuple):
+ if isinstance(fill, list) and len(fill) == 3:
+ fill = tuple(fill)
+ elif isinstance(fill, str):
+ from tools.secure_regex_utils import (
+ safe_extract_rgb_values,
+ )
+
+ parsed = safe_extract_rgb_values(fill.strip())
+ if parsed is not None:
+ fill = parsed
+ else:
+ # Try bracket+space format e.g. "[128 128 128]"
+ match = re.match(
+ r"\[\s*(\d{1,3})\s+(\d{1,3})\s+(\d{1,3})\s*\]",
+ fill.strip(),
+ )
+ if match:
+ r, g, b = (
+ int(match.group(1)),
+ int(match.group(2)),
+ int(match.group(3)),
+ )
+ if (
+ 0 <= r <= 255
+ and 0 <= g <= 255
+ and 0 <= b <= 255
+ ):
+ fill = (r, g, b)
+ else:
+ fill = CUSTOM_BOX_COLOUR
+ else:
+ fill = CUSTOM_BOX_COLOUR
+ else:
+ try:
+ fill = tuple(fill)
+ except Exception:
+ fill = CUSTOM_BOX_COLOUR
+
+ # Ensure fill is a valid RGB tuple with integer values 0-255
+ # Handle both list and tuple formats, and convert float values to proper RGB
+ if isinstance(fill, (list, tuple)) and len(fill) == 3:
+ # Convert to tuple if it's a list
+ if isinstance(fill, list):
+ fill = tuple(fill)
+
+ # Check if all elements are valid RGB values
+ valid_rgb = True
+ converted_fill = []
+
+ for c in fill:
+ if isinstance(c, (int, float)):
+ # If it's a float between 0-1, convert to 0-255 range
+ if isinstance(c, float) and 0 <= c <= 1:
+ converted_fill.append(int(c * 255))
+ # If it's already an integer 0-255, use as is
+ elif isinstance(c, int) and 0 <= c <= 255:
+ converted_fill.append(c)
+ # If it's a float > 1, assume it's already in 0-255 range
+ elif isinstance(c, float) and c > 1:
+ converted_fill.append(int(c))
+ else:
+ valid_rgb = False
+ break
+ else:
+ valid_rgb = False
+ break
+
+ if valid_rgb:
+ fill = tuple(converted_fill)
+ else:
+ print(
+ f"Invalid color values: {fill}. Defaulting to CUSTOM_BOX_COLOUR."
+ )
+ fill = CUSTOM_BOX_COLOUR
+ else:
+ print(
+ f"Invalid fill format: {fill}. Defaulting to CUSTOM_BOX_COLOUR."
+ )
+ fill = CUSTOM_BOX_COLOUR
+
+ # Ensure the image is in RGB mode
+ if image.mode not in ("RGB", "RGBA"):
+ image = image.convert("RGB")
+
+ draw = ImageDraw.Draw(image)
+
+ draw.rectangle(coords, fill=fill)
+
+ image.save(output_image_path)
+ _out_files.append(output_image_path)
+
+ # For image under review, also produce _redacted.pdf and _redactions_for_review.pdf (same as PDF route)
+ if doc is not None and getattr(doc, "page_count", 0) >= 1:
+ try:
+ _tmp_pdf_path = os.path.join(
+ output_folder,
+ file_name_without_ext + "_temp_apply.pdf",
+ )
+ doc.save(_tmp_pdf_path)
+ pdf_doc = pymupdf.open(_tmp_pdf_path)
+ review_pdf_doc = (
+ pymupdf.open(_tmp_pdf_path)
+ if RETURN_PDF_FOR_REVIEW
+ else None
+ )
+ number_of_pages = pdf_doc.page_count
+ except Exception as e:
+ print(f"Failed to create PDFs from image doc: {e}")
+ pdf_doc = None
+ review_pdf_doc = None
+ _tmp_pdf_path = None
+ else:
+ # Fallback: doc not available (e.g. pdf_doc_state is list() or None after initial redaction).
+ # Create one-page PDF from the image file so we still produce both PDFs.
+ try:
+ _tmp_pdf_path = os.path.join(
+ output_folder,
+ file_name_without_ext + "_temp_apply.pdf",
+ )
+ img_pdf = pymupdf.open()
+ img_page = img_pdf.new_page(
+ width=image.width, height=image.height
+ )
+ img_page.insert_image(img_page.rect, filename=file_path)
+ img_pdf.save(_tmp_pdf_path)
+ img_pdf.close()
+ pdf_doc = pymupdf.open(_tmp_pdf_path)
+ review_pdf_doc = (
+ pymupdf.open(_tmp_pdf_path)
+ if RETURN_PDF_FOR_REVIEW
+ else None
+ )
+ number_of_pages = pdf_doc.page_count
+ except Exception as e:
+ print(f"Failed to create PDFs from image file: {e}")
+ pdf_doc = None
+ review_pdf_doc = None
+ _tmp_pdf_path = None
+
+ elif file_extension == ".csv":
+ pdf_doc = list()
+
+ # If working with pdfs
+ elif is_pdf(file_path) is True:
+ orig_pdf_file_path = file_path
+ _out_files.append(orig_pdf_file_path)
+ if TWO_PASS_REVIEW_PDF_LOW_MEMORY and RETURN_PDF_FOR_REVIEW:
+ use_two_pass_pdf = True
+ pdf_doc = None
+ review_pdf_doc = None
+ number_of_pages = 0
+ else:
+ pdf_doc = pymupdf.open(file_path)
+ number_of_pages = pdf_doc.page_count
+ if RETURN_PDF_FOR_REVIEW:
+ review_pdf_doc = pymupdf.open(file_path)
+ else:
+ review_pdf_doc = None
+
+ else:
+ print("File type not recognised.")
+
+ # Two-pass PDF: one Document in memory at a time (lower peak RAM).
+ if use_two_pass_pdf:
+ for is_final_pass in (True, False):
+ pass_desc = (
+ "Saving final redacted pages"
+ if is_final_pass
+ else "Saving review PDF pages"
+ )
+ pdf_doc = pymupdf.open(file_path)
+ number_of_pages = pdf_doc.page_count
+ _page_iter = (
+ progress.tqdm(
+ range(0, number_of_pages),
+ desc=pass_desc,
+ unit="pages",
+ )
+ if progress is not None
+ else range(0, number_of_pages)
+ )
+ for i in _page_iter:
+ page_annotations = (
+ all_image_annotations[i]
+ if i < len(all_image_annotations)
+ else {}
+ )
+ page_boxes = (
+ page_annotations.get("boxes")
+ if isinstance(page_annotations, dict)
+ else []
+ )
+ has_boxes = bool(page_boxes and len(page_boxes) > 0)
+ image = None
+ image_should_close = False
+ if has_boxes:
+ if PROFILE_REDACTION_APPLY:
+ _t_img0 = time.perf_counter()
+ try:
+ _, image, image_should_close = (
+ _load_one_page_image_for_redact(
+ i,
+ all_image_annotations,
+ page_to_image_path,
+ input_folder,
+ file_name_with_ext,
+ )
+ )
+ except Exception:
+ image, image_should_close = None, False
+ if image is None:
+ image_should_close = False
+ if PROFILE_REDACTION_APPLY:
+ _profile_image_times.append(
+ time.perf_counter() - _t_img0
+ )
+ elif PROFILE_REDACTION_APPLY:
+ _profile_image_times.append(0.0)
+ pymupdf_page = pdf_doc.load_page(i)
+ current_cropbox = pymupdf_page.cropbox
+ pymupdf_page.set_cropbox(pymupdf_page.mediabox)
+ annots_to_remove = [
+ a
+ for a in pymupdf_page.annots()
+ if a.type[0] == pymupdf.PDF_ANNOT_REDACT
+ ]
+ for annot in annots_to_remove:
+ pymupdf_page.delete_annot(annot)
+ dims = page_to_image_dimensions.get(i + 1)
+ if has_boxes:
+ if PROFILE_REDACTION_APPLY:
+ _t_redact0 = time.perf_counter()
+ pymupdf_page = redact_page_with_pymupdf(
+ page=pymupdf_page,
+ page_annotations=all_image_annotations[i],
+ image=image,
+ original_cropbox=current_cropbox,
+ page_sizes_df=page_sizes_df,
+ return_pdf_for_review=not is_final_pass,
+ return_pdf_end_of_redaction=False,
+ input_folder=input_folder,
+ image_dimensions_override=dims,
+ review_page=None,
+ )
+ if PROFILE_REDACTION_APPLY:
+ _profile_page_times.append(
+ time.perf_counter() - _t_redact0
+ )
+ else:
+ set_cropbox_safely(pymupdf_page, current_cropbox)
+ pymupdf_page.clean_contents()
+ if PROFILE_REDACTION_APPLY:
+ _profile_page_times.append(0.0)
+ if image_should_close and image is not None:
+ try:
+ image.close()
+ except Exception:
+ pass
+ image = None
+ out_pdf = (
+ output_folder
+ + file_name_without_ext
+ + (
+ "_redacted.pdf"
+ if is_final_pass
+ else "_redactions_for_review.pdf"
+ )
+ )
+ save_pdf_with_or_without_compression(
+ pdf_doc, out_pdf, COMPRESS_REDACTED_PDF
+ )
+ _out_files.append(out_pdf)
+ pdf_doc.close()
+ pdf_doc = None
+ if number_of_pages >= 30:
+ gc.collect()
+ review_pdf_doc = None
+ progress(0.9, "Saving output files")
+
+ # Run page loop for both PDF and image (when doc was converted to temp PDF)
+ elif (
+ pdf_doc is not None
+ and hasattr(pdf_doc, "page_count")
+ and not isinstance(pdf_doc, list)
+ and number_of_pages > 0
+ ):
+ # page_sizes_df and page_to_image_path / page_to_image_dimensions
+ # already built once per file above
+
+ # Load images on demand per page (avoids holding all N images in memory).
+ # PyMuPDF is not thread-safe for document modification, so redaction stays sequential.
+ _page_iter = (
+ progress.tqdm(
+ range(0, number_of_pages),
+ desc="Saving redacted pages to file",
+ unit="pages",
+ )
+ if progress is not None
+ else range(0, number_of_pages)
+ )
+ for i in _page_iter:
+ page_annotations = (
+ all_image_annotations[i]
+ if i < len(all_image_annotations)
+ else {}
+ )
+ page_boxes = (
+ page_annotations.get("boxes")
+ if isinstance(page_annotations, dict)
+ else []
+ )
+ has_boxes = bool(page_boxes and len(page_boxes) > 0)
+
+ # Load image only when page has redaction boxes (avoids I/O for blank pages).
+ image = None
+ image_should_close = False
+ if has_boxes:
+ if PROFILE_REDACTION_APPLY:
+ _t_img0 = time.perf_counter()
+ try:
+ _, image, image_should_close = (
+ _load_one_page_image_for_redact(
+ i,
+ all_image_annotations,
+ page_to_image_path,
+ input_folder,
+ file_name_with_ext,
+ )
+ )
+ except Exception:
+ image, image_should_close = None, False
+ if image is None:
+ image_should_close = False
+ if PROFILE_REDACTION_APPLY:
+ _profile_image_times.append(
+ time.perf_counter() - _t_img0
+ )
+ elif PROFILE_REDACTION_APPLY:
+ _profile_image_times.append(0.0)
+
+ pymupdf_page = pdf_doc.load_page(i)
+ current_cropbox = pymupdf_page.cropbox
+ pymupdf_page.set_cropbox(pymupdf_page.mediabox)
+
+ # Remove existing redaction annotations (collect first to avoid iterator issues)
+ annots_to_remove = [
+ a
+ for a in pymupdf_page.annots()
+ if a.type[0] == pymupdf.PDF_ANNOT_REDACT
+ ]
+ for annot in annots_to_remove:
+ pymupdf_page.delete_annot(annot)
+
+ # Precomputed dimensions for this page (avoids .loc in redact_page_with_pymupdf)
+ dims = page_to_image_dimensions.get(i + 1)
+
+ review_pymupdf_page = None
+ if RETURN_PDF_FOR_REVIEW and review_pdf_doc:
+ review_pymupdf_page = review_pdf_doc.load_page(i)
+ review_pymupdf_page.set_cropbox(
+ review_pymupdf_page.mediabox
+ )
+ review_annots_to_remove = [
+ a
+ for a in review_pymupdf_page.annots()
+ if a.type[0] == pymupdf.PDF_ANNOT_REDACT
+ ]
+ for annot in review_annots_to_remove:
+ review_pymupdf_page.delete_annot(annot)
+
+ # Single pass: apply redactions to both final and (if requested) review page.
+ if has_boxes:
+ if PROFILE_REDACTION_APPLY:
+ _t_redact0 = time.perf_counter()
+ pymupdf_page = redact_page_with_pymupdf(
+ page=pymupdf_page,
+ page_annotations=all_image_annotations[i],
+ image=image,
+ original_cropbox=current_cropbox,
+ page_sizes_df=page_sizes_df,
+ return_pdf_for_review=bool(review_pymupdf_page is None),
+ return_pdf_end_of_redaction=False,
+ input_folder=input_folder,
+ image_dimensions_override=dims,
+ review_page=review_pymupdf_page,
+ )
+ if PROFILE_REDACTION_APPLY:
+ _profile_page_times.append(
+ time.perf_counter() - _t_redact0
+ )
+ else:
+ set_cropbox_safely(pymupdf_page, current_cropbox)
+ pymupdf_page.clean_contents()
+ if review_pymupdf_page is not None:
+ set_cropbox_safely(review_pymupdf_page, current_cropbox)
+ review_pymupdf_page.clean_contents()
+ if PROFILE_REDACTION_APPLY:
+ _profile_page_times.append(0.0)
+
+ # Close image immediately to free memory before next page
+ if image_should_close and image is not None:
+ try:
+ image.close()
+ except Exception:
+ pass
+ image = None
+
+ if not use_two_pass_pdf:
+ progress(0.9, "Saving output files")
+
+ if pdf_doc:
+ # Save final redacted PDF
+ out_pdf_file_path = (
+ output_folder + file_name_without_ext + "_redacted.pdf"
+ )
+ save_pdf_with_or_without_compression(
+ pdf_doc, out_pdf_file_path, COMPRESS_REDACTED_PDF
+ )
+ _out_files.append(out_pdf_file_path)
+ pdf_doc.close()
+ pdf_doc = None
+ if number_of_pages >= 30:
+ gc.collect()
+
+ # Save review PDF if RETURN_PDF_FOR_REVIEW is True
+
+ if RETURN_PDF_FOR_REVIEW and review_pdf_doc:
+ output_file_name = (
+ file_name_without_ext + "_redactions_for_review.pdf"
+ )
+ out_review_pdf_file_path = output_folder + output_file_name
+ print("Saving PDF file for review:", output_file_name)
+ save_pdf_with_or_without_compression(
+ review_pdf_doc,
+ out_review_pdf_file_path,
+ COMPRESS_REDACTED_PDF,
+ )
+ _out_files.append(out_review_pdf_file_path)
+ review_pdf_doc.close()
+ review_pdf_doc = None
+ if number_of_pages >= 30:
+ gc.collect()
+
+ # Remove temp PDF used for image->PDF route
+ if _tmp_pdf_path and os.path.isfile(_tmp_pdf_path):
+ try:
+ os.remove(_tmp_pdf_path)
+ except Exception:
+ pass
+
+ else:
+ print("PDF input not found. Outputs not saved to PDF.")
+
+ # If save_pdf is not true, then add the original pdf to the output files
+ else:
+ if is_pdf(file_path) is True:
+ orig_pdf_file_path = file_path
+ _out_files.append(orig_pdf_file_path)
+
+ _t_review_csv = 0.0
+ try:
+ if PROFILE_REDACTION_APPLY:
+ _t_review0 = time.perf_counter()
+ if (
+ ENABLE_REVIEW_CSV_PARALLELISM
+ and len(all_image_annotations) >= REVIEW_CSV_PARALLEL_MIN_PAGES
+ ):
+ chunk_size = REVIEW_CSV_PAGES_PER_CHUNK
+ chunks = [
+ all_image_annotations[i : i + chunk_size]
+ for i in range(0, len(all_image_annotations), chunk_size)
+ ]
+ with ThreadPoolExecutor(
+ max_workers=min(MAX_WORKERS, len(chunks))
+ ) as executor:
+ partial_dfs = list(
+ executor.map(convert_annotation_data_to_dataframe, chunks)
+ )
+ combined = _concat_frames_without_all_na_warning(
+ partial_dfs, ignore_index=True
+ )
+ del partial_dfs
+ _review_df = convert_annotation_json_to_review_df(
+ all_image_annotations,
+ review_file_state.copy(),
+ page_sizes=page_sizes,
+ prebuilt_df=combined,
+ )
+ else:
+ _review_df = convert_annotation_json_to_review_df(
+ all_image_annotations,
+ review_file_state.copy(),
+ page_sizes=page_sizes,
+ )
+
+ out_review_file_file_path = (
+ output_folder + file_name_with_ext + "_review_file.csv"
+ )
+ review_cols = [
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "text",
+ "id",
+ ]
+
+ if USE_POLARS_FOR_REVIEW and not _review_df.empty:
+ coord_cols = ["xmin", "xmax", "ymin", "ymax"]
+ cols_to_convert = coord_cols + ["page"]
+ temp_pd = _review_df.copy()
+ for col in cols_to_convert:
+ if col in temp_pd.columns:
+ temp_pd[col] = pd.to_numeric(temp_pd[col], errors="coerce")
+ for col in temp_pd.columns:
+ if col not in cols_to_convert and temp_pd[col].dtype == object:
+ temp_pd[col] = temp_pd[col].astype(str)
+ pl_df = pl.from_pandas(temp_pd)
+ pl_df = divide_coordinates_by_page_sizes_pl(pl_df, page_sizes_df)
+ pl_df = pl_df.select([c for c in review_cols if c in pl_df.columns])
+ pl_df.write_csv(out_review_file_file_path)
+ _review_df = pl_df.to_pandas()
+ if "page" in _review_df.columns and not _review_df.empty:
+ _review_df["page"] = pd.to_numeric(
+ _review_df["page"], errors="coerce"
+ )
+ _review_df["page"] = _review_df["page"].astype("Int64")
+ for c in coord_cols:
+ if c in _review_df.columns:
+ _review_df[c] = _review_df[c].astype(float)
+ else:
+ _review_df = divide_coordinates_by_page_sizes(
+ _review_df, page_sizes_df
+ )
+ _review_df = _review_df[review_cols]
+ _review_df.to_csv(out_review_file_file_path, index=None)
+
+ _out_files.append(out_review_file_file_path)
+ if PROFILE_REDACTION_APPLY:
+ _t_review_csv = time.perf_counter() - _t_review0
+
+ except Exception as e:
+ print(
+ "In apply redactions function, could not save annotations to csv file:",
+ e,
+ )
+ if PROFILE_REDACTION_APPLY:
+ _total_page = sum(_profile_page_times)
+ _total_img = sum(_profile_image_times)
+ print(
+ "[PROFILE_REDACTION_APPLY] file=%s | page_sizes=%.3fs | image_load_total=%.3fs | redact_pages_total=%.3fs | review_csv=%.3fs"
+ % (
+ file_name_with_ext or file_path,
+ _t_page_sizes,
+ _total_img,
+ _total_page,
+ _t_review_csv,
+ )
+ )
+
+ return (_out_files, _out_log_files, _review_df)
+
+ if ENABLE_PARALLEL_FILES_APPLY_REDACTIONS and len(file_paths) > 1:
+ with ThreadPoolExecutor(
+ max_workers=min(MAX_WORKERS, len(file_paths))
+ ) as executor:
+ futures = [
+ executor.submit(_run_apply_redactions_loop, [fp]) for fp in file_paths
+ ]
+ for fut in as_completed(futures):
+ o_f, o_l, rev_df = fut.result()
+ output_files.extend(o_f)
+ output_log_files.extend(o_l)
+ review_df = rev_df
+ else:
+ o_f, o_l, review_df = _run_apply_redactions_loop(file_paths)
+ output_files.extend(o_f)
+ output_log_files.extend(o_l)
+
+ return doc, all_image_annotations, output_files, output_log_files, review_df
+
+
+def get_boxes_json(annotations: AnnotatedImageData):
+ return annotations["boxes"]
+
+
+def update_all_entity_df_dropdowns(
+ df: pd.DataFrame,
+ label_dropdown_value: str,
+ page_dropdown_value: str,
+ text_dropdown_value: str,
+):
+ """
+ Update all dropdowns based on rows that exist in a dataframe
+ """
+
+ if isinstance(label_dropdown_value, str):
+ label_dropdown_value = [label_dropdown_value]
+ if isinstance(page_dropdown_value, str):
+ page_dropdown_value = [page_dropdown_value]
+ if isinstance(text_dropdown_value, str):
+ text_dropdown_value = [text_dropdown_value]
+
+ # Guard against empty lists (e.g. from Gradio when nothing is selected)
+ if not label_dropdown_value:
+ label_dropdown_value = ["ALL"]
+ if not text_dropdown_value:
+ text_dropdown_value = ["ALL"]
+ if not page_dropdown_value:
+ page_dropdown_value = ["1"]
+
+ filtered_df = df.copy()
+
+ if not label_dropdown_value[0]:
+ label_dropdown_value[0] = "ALL"
+ if not text_dropdown_value[0]:
+ text_dropdown_value[0] = "ALL"
+ if not page_dropdown_value[0]:
+ page_dropdown_value[0] = "1"
+
+ recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ filtered_df, "label"
+ )
+ text_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ filtered_df, "text"
+ )
+ page_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ filtered_df, "page"
+ )
+
+ return (
+ gr.update(
+ value=label_dropdown_value[0],
+ choices=recogniser_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ gr.update(
+ value=text_dropdown_value[0],
+ choices=text_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ gr.update(
+ value=page_dropdown_value[0],
+ choices=page_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ )
+
+
+def update_entities_df_recogniser_entities(
+ choice: str, df: pd.DataFrame, page_dropdown_value: str, text_dropdown_value: str
+):
+ """
+ Update the rows in a dataframe depending on the user choice from a dropdown
+ """
+
+ if isinstance(choice, str):
+ choice = [choice]
+ if isinstance(page_dropdown_value, str):
+ page_dropdown_value = [page_dropdown_value]
+ if isinstance(text_dropdown_value, str):
+ text_dropdown_value = [text_dropdown_value]
+
+ filtered_df = df.copy()
+
+ # Apply filtering based on dropdown selections
+ if "ALL" not in page_dropdown_value:
+ filtered_df = filtered_df[
+ filtered_df["page"].astype(str).isin(page_dropdown_value)
+ ]
+
+ if "ALL" not in text_dropdown_value:
+ filtered_df = filtered_df[
+ filtered_df["text"].astype(str).isin(text_dropdown_value)
+ ]
+
+ if "ALL" not in choice:
+ filtered_df = filtered_df[filtered_df["label"].astype(str).isin(choice)]
+
+ if not choice[0]:
+ choice[0] = "ALL"
+ if not text_dropdown_value[0]:
+ text_dropdown_value[0] = "ALL"
+ if not page_dropdown_value[0]:
+ page_dropdown_value[0] = "1"
+
+ # recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ # filtered_df, "label"
+ # )
+ # gr.Dropdown(
+ # value=choice[0],
+ # choices=recogniser_entities_for_drop,
+ # allow_custom_value=True,
+ # interactive=True,
+ # )
+
+ text_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ filtered_df, "text"
+ )
+ page_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ filtered_df, "page"
+ )
+
+ return (
+ filtered_df,
+ gr.update(
+ value=text_dropdown_value[0],
+ choices=text_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ gr.update(
+ value=page_dropdown_value[0],
+ choices=page_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ )
+
+
+def update_entities_df_page(
+ choice: str, df: pd.DataFrame, label_dropdown_value: str, text_dropdown_value: str
+):
+ """
+ Update the rows in a dataframe depending on the user choice from a dropdown
+ """
+ if isinstance(choice, str):
+ choice = [choice]
+ elif not isinstance(choice, list):
+ choice = [str(choice)]
+ if isinstance(label_dropdown_value, str):
+ label_dropdown_value = [label_dropdown_value]
+ elif not isinstance(label_dropdown_value, list):
+ label_dropdown_value = [str(label_dropdown_value)]
+ if isinstance(text_dropdown_value, str):
+ text_dropdown_value = [text_dropdown_value]
+ elif not isinstance(text_dropdown_value, list):
+ text_dropdown_value = [str(text_dropdown_value)]
+
+ filtered_df = df.copy()
+
+ # Apply filtering based on dropdown selections
+ if "ALL" not in text_dropdown_value:
+ filtered_df = filtered_df[
+ filtered_df["text"].astype(str).isin(text_dropdown_value)
+ ]
+
+ if "ALL" not in label_dropdown_value:
+ filtered_df = filtered_df[
+ filtered_df["label"].astype(str).isin(label_dropdown_value)
+ ]
+
+ if "ALL" not in choice:
+ filtered_df = filtered_df[filtered_df["page"].astype(str).isin(choice)]
+
+ recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ filtered_df, "label"
+ )
+ text_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ filtered_df, "text"
+ )
+
+ return (
+ filtered_df,
+ gr.update(
+ value=label_dropdown_value[0],
+ choices=recogniser_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ gr.update(
+ value=text_dropdown_value[0],
+ choices=text_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ )
+
+
+def update_redact_choice_df_from_page_dropdown(choice: str, df: pd.DataFrame):
+ """
+ Update the rows in a dataframe depending on the user choice from a dropdown
+ """
+ if isinstance(choice, str):
+ choice = [choice]
+ elif not isinstance(choice, list):
+ choice = [str(choice)]
+
+ if "index" not in df.columns:
+ df["index"] = df.index
+
+ filtered_df = df[
+ [
+ "page",
+ "line",
+ "word_text",
+ "index",
+ ]
+ ].copy()
+
+ # Apply filtering based on dropdown selections
+ if "ALL" not in choice:
+ filtered_df = filtered_df.loc[filtered_df["page"].astype(str).isin(choice)]
+
+ # page_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ # filtered_df, "page"
+ # )
+ # gr.Dropdown(
+ # value=choice[0],
+ # choices=page_entities_for_drop,
+ # allow_custom_value=True,
+ # interactive=True,
+ # )
+
+ return filtered_df
+
+
+def update_entities_df_text(
+ choice: str, df: pd.DataFrame, label_dropdown_value: str, page_dropdown_value: str
+):
+ """
+ Update the rows in a dataframe depending on the user choice from a dropdown
+ """
+ if isinstance(choice, str):
+ choice = [choice]
+ if isinstance(label_dropdown_value, str):
+ label_dropdown_value = [label_dropdown_value]
+ if isinstance(page_dropdown_value, str):
+ page_dropdown_value = [page_dropdown_value]
+
+ filtered_df = df.copy()
+
+ # Apply filtering based on dropdown selections
+ if "ALL" not in page_dropdown_value:
+ filtered_df = filtered_df[
+ filtered_df["page"].astype(str).isin(page_dropdown_value)
+ ]
+
+ if "ALL" not in label_dropdown_value:
+ filtered_df = filtered_df[
+ filtered_df["label"].astype(str).isin(label_dropdown_value)
+ ]
+
+ if "ALL" not in choice:
+ filtered_df = filtered_df[filtered_df["text"].astype(str).isin(choice)]
+
+ recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ filtered_df, "label"
+ )
+ page_entities_for_drop = update_dropdown_list_based_on_dataframe(
+ filtered_df, "page"
+ )
+
+ return (
+ filtered_df,
+ gr.update(
+ value=label_dropdown_value[0],
+ choices=recogniser_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ gr.update(
+ value=page_dropdown_value[0],
+ choices=page_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ )
+
+
+def reset_dropdowns(df: pd.DataFrame):
+ """
+ Return Gradio dropdown objects with value 'ALL'.
+ """
+ recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(df, "label")
+ text_entities_for_drop = update_dropdown_list_based_on_dataframe(df, "text")
+ page_entities_for_drop = update_dropdown_list_based_on_dataframe(df, "page")
+
+ return (
+ gr.update(
+ value="ALL",
+ choices=recogniser_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ gr.update(
+ value="ALL",
+ choices=text_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ gr.update(
+ value="ALL",
+ choices=page_entities_for_drop,
+ allow_custom_value=True,
+ interactive=True,
+ ),
+ )
+
+
+def increase_bottom_page_count_based_on_top(page_number: int):
+ return int(page_number)
+
+
+def df_select_callback_dataframe_row_ocr_with_words(
+ df: pd.DataFrame, evt: gr.SelectData
+):
+
+    row_value_page = int(evt.row_value[0])  # This is the page number value
+    row_value_line = int(evt.row_value[1])  # This is the line number value
+    row_value_text = evt.row_value[2]  # This is the word text value
+
+    row_value_index = evt.row_value[3]  # This is the index value
+
+ row_value_df = pd.DataFrame(
+ data={
+ "page": [row_value_page],
+ "line": [row_value_line],
+ "word_text": [row_value_text],
+ "index": row_value_index,
+ }
+ )
+
+ return row_value_df, row_value_text
+
+
+def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
+
+    row_value_page = int(evt.row_value[0])  # This is the page number value
+    row_value_label = evt.row_value[1]  # This is the label value
+    row_value_text = evt.row_value[2]  # This is the text value
+    row_value_id = evt.row_value[3]  # This is the id value
+
+ row_value_df = pd.DataFrame(
+ data={
+ "page": [row_value_page],
+ "label": [row_value_label],
+ "text": [row_value_text],
+ "id": [row_value_id],
+ }
+ )
+
+ return row_value_df, row_value_text
+
+
+def df_select_callback_textract_api(df: pd.DataFrame, evt: gr.SelectData):
+
+    row_value_job_id = evt.row_value[0]  # This is the job ID value
+    # row_value_label = evt.row_value[1]  # This is the label value
+    row_value_job_type = evt.row_value[2]  # This is the job type value
+
+ row_value_df = pd.DataFrame(
+ data={"job_id": [row_value_job_id], "label": [row_value_job_type]}
+ )
+
+ return row_value_job_id, row_value_job_type, row_value_df
+
+
+def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
+
+ row_value_code = evt.row_value[0] # This is the value for cost code
+ # row_value_label = evt.row_value[1] # This is the label number value
+
+ # row_value_df = pd.DataFrame(data={"page":[row_value_code], "label":[row_value_label]})
+
+ return row_value_code
+
+
+def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
+
+ row_value_page = int(evt.row_value[0]) # This is the page_number value
+ row_value_text = evt.row_value[1] # This is the text contents
+
+ row_value_df = pd.DataFrame(
+ data={"page": [row_value_page], "text": [row_value_text]}
+ )
+
+ return row_value_page, row_value_df
+
+
+# When a user selects a row in the duplicate results table
+def store_duplicate_selection(evt: gr.SelectData):
+ if not evt.empty:
+ selected_index = evt.index[0]
+ else:
+ selected_index = None
+
+ return selected_index
+
+
+def get_all_rows_with_same_text(df: pd.DataFrame, text: str):
+ """
+ Get all rows with the same text as the selected row
+ """
+ if text:
+ # Get all rows with the same text as the selected row
+ return df.loc[df["text"] == text]
+ else:
+ return pd.DataFrame(columns=["page", "label", "text", "id"])
+
+
+def get_all_rows_with_same_text_redact(df: pd.DataFrame, text: str):
+ """
+ Get all rows with the same text as the selected row for redaction tasks
+ """
+ if "index" not in df.columns:
+ df["index"] = df.index
+
+ if text and not df.empty:
+ # Get all rows with the same text as the selected row
+ return df.loc[df["word_text"] == text]
+ else:
+ return pd.DataFrame(
+ columns=[
+ "page",
+ "line",
+ "label",
+ "word_text",
+ "word_x0",
+ "word_y0",
+ "word_x1",
+ "word_y1",
+ "index",
+ ]
+ )
+
+
+def update_selected_review_df_row_colour(
+ redaction_row_selection: pd.DataFrame,
+ review_df: pd.DataFrame,
+ previous_id: str = "",
+ previous_colour: str = "(0, 0, 0)",
+ colour: str = "(1, 0, 255)",
+) -> tuple[pd.DataFrame, str, str]:
+ """
+ Update the colour of a single redaction box based on the values in a selection row
+ (Optimized Version)
+ """
+
+ # Ensure 'color' column exists, default to previous_colour if previous_id is provided
+ if "color" not in review_df.columns:
+ review_df["color"] = previous_colour if previous_id else "(0, 0, 0)"
+
+ # Ensure 'id' column exists
+ if "id" not in review_df.columns:
+        print("Warning: 'id' column not found. Calling fill_missing_ids.")
+        review_df = fill_missing_ids(review_df)  # can be slow on large review files
+
+ # --- Optimization 1 & 2: Reset existing highlight colours using vectorized assignment ---
+ # Reset the color of the previously highlighted row
+ if previous_id and previous_id in review_df["id"].values:
+ review_df.loc[review_df["id"] == previous_id, "color"] = previous_colour
+
+ # Reset the color of any row that currently has the highlight colour (handle cases where previous_id might not have been tracked correctly)
+ # Convert to string for comparison only if the dtype might be mixed or not purely string
+ # If 'color' is consistently string, the .astype(str) might be avoidable.
+ # Assuming color is consistently string format like '(R, G, B)'
+ review_df.loc[review_df["color"] == colour, "color"] = "(0, 0, 0)"
+
+ if not redaction_row_selection.empty and not review_df.empty:
+ use_id = (
+ "id" in redaction_row_selection.columns
+ and "id" in review_df.columns
+ and not redaction_row_selection["id"].isnull().all()
+ and not review_df["id"].isnull().all()
+ )
+
+ selected_merge_cols = ["id"] if use_id else ["label", "page", "text"]
+
+ # --- Optimization 3: Use inner merge directly ---
+ # Merge to find rows in review_df that match redaction_row_selection
+ merged_reviews = review_df.merge(
+ redaction_row_selection[selected_merge_cols],
+ on=selected_merge_cols,
+ how="inner", # Use inner join as we only care about matches
+ )
+
+ if not merged_reviews.empty:
+ # Assuming we only expect one match for highlighting a single row
+ # If multiple matches are possible and you want to highlight all,
+ # the logic for previous_id and previous_colour needs adjustment.
+ new_previous_colour = str(merged_reviews["color"].iloc[0])
+ new_previous_id = merged_reviews["id"].iloc[0]
+
+ # --- Optimization 1 & 2: Update color of the matched row using vectorized assignment ---
+
+ if use_id:
+ # Faster update if using unique 'id' as merge key
+ review_df.loc[review_df["id"].isin(merged_reviews["id"]), "color"] = (
+ colour
+ )
+ else:
+ # More general case using multiple columns - might be slower
+ # Create a temporary key for comparison
+ def create_merge_key(df, cols):
+ return df[cols].astype(str).agg("_".join, axis=1)
+
+ review_df_key = create_merge_key(review_df, selected_merge_cols)
+ merged_reviews_key = create_merge_key(
+ merged_reviews, selected_merge_cols
+ )
+
+ review_df.loc[review_df_key.isin(merged_reviews_key), "color"] = colour
+
+ previous_colour = new_previous_colour
+ previous_id = new_previous_id
+ else:
+ # No rows matched the selection
+ print("No reviews found matching selection criteria")
+
+ previous_colour = (
+ "(0, 0, 0)" # Reset previous_colour as no row was highlighted
+ )
+ previous_id = "" # Reset previous_id
+
+ else:
+ # If selection is empty, reset any existing highlights
+ review_df.loc[review_df["color"] == colour, "color"] = "(0, 0, 0)"
+ previous_colour = "(0, 0, 0)"
+ previous_id = ""
+
+ # Ensure column order is maintained if necessary, though pandas generally preserves order
+ # Creating a new DataFrame here might involve copying data, consider if this is strictly needed.
+ if set(
+ [
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "text",
+ "id",
+ ]
+ ).issubset(review_df.columns):
+ review_df = review_df[
+ [
+ "image",
+ "page",
+ "label",
+ "color",
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ "text",
+ "id",
+ ]
+ ]
+ else:
+ print(
+ "Warning: Not all expected columns are present in review_df for reordering."
+ )
+
+ return review_df, previous_id, previous_colour
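+
+# Minimal usage sketch (variable names and values here are illustrative, not taken from the
+# app's call flow): the previously highlighted row's id and colour are passed back in so the
+# old highlight can be undone before the newly selected row is recoloured, e.g.
+#   review_df, prev_id, prev_colour = update_selected_review_df_row_colour(
+#       selected_rows_df, review_df, previous_id=prev_id, previous_colour=prev_colour
+#   )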
+
+
+def _update_one_page_boxes_color(
+ page_idx: int,
+ image_obj: dict,
+ selection_set: set,
+ colour: tuple,
+) -> Tuple[int, dict]:
+ """Process one page's boxes for color update; safe to run in a thread."""
+ out = {
+ "image": image_obj.get("image"),
+ "boxes": [
+ {
+ **box,
+ "color": (
+ colour
+ if (page_idx, box["label"]) in selection_set
+ else box["color"]
+ ),
+ }
+ for box in image_obj.get("boxes", [])
+ ],
+ }
+ return (page_idx, out)
+
+
+def update_boxes_color(
+ images: list, redaction_row_selection: pd.DataFrame, colour: tuple = (0, 255, 0)
+):
+ """
+ Update the color of bounding boxes in the images list based on redaction_row_selection.
+
+ Parameters:
+ - images (list): List of dictionaries containing image paths and box metadata.
+ - redaction_row_selection (pd.DataFrame): DataFrame with 'page', 'label', and optionally 'text' columns.
+ - colour (tuple): RGB tuple for the new color.
+
+ Returns:
+ - Updated list with modified colors.
+ """
+ selection_set = set(
+ zip(redaction_row_selection["page"], redaction_row_selection["label"])
+ )
+ if not images:
+ return images
+
+ max_workers = min(MAX_WORKERS, len(images))
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ results = list(
+ executor.map(
+ lambda i_obj: _update_one_page_boxes_color(
+ i_obj[0], i_obj[1], selection_set, colour
+ ),
+ [(idx, img) for idx, img in enumerate(images)],
+ )
+ )
+ ordered = sorted(results, key=lambda x: x[0])
+ return [out for _, out in ordered]
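+
+# Minimal sketch of the expected input shape (values are illustrative; whether "page" holds
+# 0-based list positions or 1-based page numbers depends on how the selection dataframe was
+# built upstream):
+#   images = [{"image": "page_0.png", "boxes": [{"label": "NAME", "color": (0, 0, 0)}]}]
+#   selection = pd.DataFrame({"page": [0], "label": ["NAME"]})
+#   images = update_boxes_color(images, selection, colour=(0, 255, 0))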
+
+
+def update_other_annotator_number_from_current(page_number_first_counter: int):
+ return page_number_first_counter
+
+
+def convert_image_coords_to_adobe(
+ pdf_page_width: float,
+ pdf_page_height: float,
+ image_width: float,
+ image_height: float,
+ x1: float,
+ y1: float,
+ x2: float,
+ y2: float,
+):
+ """
+ Converts coordinates from image space to Adobe PDF space.
+
+ Parameters:
+ - pdf_page_width: Width of the PDF page
+ - pdf_page_height: Height of the PDF page
+ - image_width: Width of the source image
+ - image_height: Height of the source image
+ - x1, y1, x2, y2: Coordinates in image space
+
+ Returns:
+ - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
+ """
+
+ # Calculate scaling factors
+ scale_width = pdf_page_width / image_width
+ scale_height = pdf_page_height / image_height
+
+ # Convert coordinates
+ pdf_x1 = x1 * scale_width
+ pdf_x2 = x2 * scale_width
+
+ # Convert Y coordinates (flip vertical axis)
+ # Adobe coordinates start from bottom-left
+ pdf_y1 = pdf_page_height - (y1 * scale_height)
+ pdf_y2 = pdf_page_height - (y2 * scale_height)
+
+ # Make sure y1 is always less than y2 for Adobe's coordinate system
+ if pdf_y1 > pdf_y2:
+ pdf_y1, pdf_y2 = pdf_y2, pdf_y1
+
+ return pdf_x1, pdf_y1, pdf_x2, pdf_y2
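+
+# Illustrative worked example (page and image sizes are assumed for demonstration only): a
+# 612 x 792 pt page rendered as a 1224 x 1584 px image gives scale factors of 0.5, and the
+# Y axis is flipped so the origin sits at the bottom-left:
+#   convert_image_coords_to_adobe(612, 792, 1224, 1584, 100, 200, 300, 400)
+#   -> (50.0, 592.0, 150.0, 692.0)  # y values swapped so the smaller one comes first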
+
+
+def convert_pymupdf_coords_to_adobe(
+ x1: float, y1: float, x2: float, y2: float, pdf_page_height: float
+):
+ """
+ Converts coordinates from PyMuPDF (fitz) space to Adobe PDF space.
+
+ Parameters:
+ - x1, y1, x2, y2: Coordinates in PyMuPDF space
+ - pdf_page_height: Total height of the PDF page
+
+ Returns:
+ - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
+ """
+
+    # PyMuPDF puts (0,0) at the top-left; Adobe PDF puts it at the bottom-left
+    adobe_y1 = pdf_page_height - y2  # box bottom edge becomes the lower Adobe y value
+    adobe_y2 = pdf_page_height - y1  # box top edge becomes the upper Adobe y value
+
+ return x1, adobe_y1, x2, adobe_y2
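+
+# Illustrative example (values assumed): on a 792 pt tall page, a PyMuPDF box spanning
+# y1=100 (top edge) to y2=200 (bottom edge) becomes Adobe coordinates measured from the
+# bottom of the page:
+#   convert_pymupdf_coords_to_adobe(50, 100, 150, 200, 792) -> (50, 592, 150, 692)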
+
+
+def _build_one_redact_element(
+ row_dict: dict, pdf_page_height: float, date_str: str
+) -> Element:
+ """Build a single redact XML element from a row; safe to run in a thread."""
+ redact_annot = Element("redact")
+ redact_annot.set("opacity", "0.500000")
+ redact_annot.set("interior-color", "#000000")
+ redact_annot.set("date", date_str)
+ redact_annot.set("name", str(uuid.uuid4()))
+ page_python_format = int(row_dict["page"]) - 1
+ redact_annot.set("page", str(page_python_format))
+ redact_annot.set("mimetype", "Form")
+
+ x1_pdf = row_dict["xmin"]
+ y1_pdf = row_dict["ymin"]
+ x2_pdf = row_dict["xmax"]
+ y2_pdf = row_dict["ymax"]
+ adobe_x1, adobe_y1, adobe_x2, adobe_y2 = convert_pymupdf_coords_to_adobe(
+ x1_pdf, y1_pdf, x2_pdf, y2_pdf, pdf_page_height
+ )
+ redact_annot.set(
+ "rect", f"{adobe_x1:.6f},{adobe_y1:.6f},{adobe_x2:.6f},{adobe_y2:.6f}"
+ )
+ redact_annot.set("subject", str(row_dict["label"]))
+ redact_annot.set("title", str(row_dict.get("label", "Unknown")))
+
+ contents_richtext = SubElement(redact_annot, "contents-richtext")
+ body_attrs = {
+ "xmlns": "http://www.w3.org/1999/xhtml",
+ "{http://www.xfa.org/schema/xfa-data/1.0/}APIVersion": "Acrobat:25.1.0",
+ "{http://www.xfa.org/schema/xfa-data/1.0/}spec": "2.0.2",
+ }
+ body = SubElement(contents_richtext, "body", attrib=body_attrs)
+ p_element = SubElement(body, "p", dir="ltr")
+ span_attrs = {
+ "dir": "ltr",
+ "style": "font-size:10.0pt;text-align:left;color:#000000;font-weight:normal;font-style:normal",
+ }
+ span_element = SubElement(p_element, "span", attrib=span_attrs)
+ span_element.text = str(row_dict.get("text", "")).strip()
+
+ pdf_ops_for_black_fill_and_outline = [
+ "1 w",
+ "0 g",
+ "0 G",
+ "1 0 0 1 0 0 cm",
+ f"{adobe_x1:.2f} {adobe_y1:.2f} m",
+ f"{adobe_x2:.2f} {adobe_y1:.2f} l",
+ f"{adobe_x2:.2f} {adobe_y2:.2f} l",
+ f"{adobe_x1:.2f} {adobe_y2:.2f} l",
+ "h",
+ "B",
+ ]
+ data_content_string = "\n".join(pdf_ops_for_black_fill_and_outline) + "\n"
+ data_element = SubElement(redact_annot, "data")
+ data_element.set("MODE", "filtered")
+ data_element.set("encoding", "ascii")
+ data_element.set("length", str(len(data_content_string.encode("ascii"))))
+ data_element.text = data_content_string
+ return redact_annot
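+
+# For orientation, the element built above serialises to roughly the following shape
+# (attribute values shown are placeholders, not real output):
+#   <redact opacity="0.500000" interior-color="#000000" date="..." name="<uuid>" page="0"
+#           mimetype="Form" rect="x1,y1,x2,y2" subject="LABEL" title="LABEL">
+#     <contents-richtext><body><p><span>redacted text</span></p></body></contents-richtext>
+#     <data MODE="filtered" encoding="ascii" length="...">...PDF drawing operators...</data>
+#   </redact>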
+
+
+def create_xfdf(
+ review_file_df: pd.DataFrame,
+ pdf_path: str,
+ pymupdf_doc: object,
+ image_paths: List[str] = list(),
+ document_cropboxes: List = list(),
+ page_sizes: List[dict] = list(),
+):
+    """
+    Create XFDF (Adobe annotation) XML from a review file DataFrame and a PDF document,
+    returning the XML as a string.
+    """
+ xfdf_root = Element(
+ "xfdf", xmlns="http://ns.adobe.com/xfdf/", **{"xml:space": "preserve"}
+ )
+ annots = SubElement(xfdf_root, "annots")
+
+ if page_sizes:
+ page_sizes_df = pd.DataFrame(page_sizes)
+ if not page_sizes_df.empty and "mediabox_width" not in review_file_df.columns:
+ review_file_df = review_file_df.merge(page_sizes_df, how="left", on="page")
+ if "xmin" in review_file_df.columns and review_file_df["xmin"].max() <= 1:
+ if (
+ "mediabox_width" in review_file_df.columns
+ and "mediabox_height" in review_file_df.columns
+ ):
+ review_file_df["xmin"] = (
+ review_file_df["xmin"] * review_file_df["mediabox_width"]
+ )
+ review_file_df["xmax"] = (
+ review_file_df["xmax"] * review_file_df["mediabox_width"]
+ )
+ review_file_df["ymin"] = (
+ review_file_df["ymin"] * review_file_df["mediabox_height"]
+ )
+ review_file_df["ymax"] = (
+ review_file_df["ymax"] * review_file_df["mediabox_height"]
+ )
+ elif "image_width" in review_file_df.columns and not page_sizes_df.empty:
+ review_file_df = multiply_coordinates_by_page_sizes(
+ review_file_df,
+ page_sizes_df,
+ xmin="xmin",
+ xmax="xmax",
+ ymin="ymin",
+ ymax="ymax",
+ )
+
+ # Sequential pass: load each unique page once, set cropbox, store height (PyMuPDF is not thread-safe).
+ page_heights = {}
+ for page_num_reported in review_file_df["page"].astype(int).unique():
+ page_python_format = int(page_num_reported) - 1 # to 0-based
+ pymupdf_page = pymupdf_doc.load_page(page_python_format)
+ if document_cropboxes and page_python_format < len(document_cropboxes):
+ from tools.secure_regex_utils import safe_extract_numbers
+
+ match = safe_extract_numbers(document_cropboxes[page_python_format])
+ if match and len(match) == 4:
+ rect_values = list(map(float, match))
+ pymupdf_page.set_cropbox(Rect(*rect_values))
+ page_heights[page_python_format] = pymupdf_page.mediabox.height
+
+ now = datetime.now(timezone(timedelta(hours=1)))
+ date_str = (
+ now.strftime("D:%Y%m%d%H%M%S")
+ + now.strftime("%z")[:3]
+ + "'"
+ + now.strftime("%z")[3:]
+ + "'"
+ )
+
+ # Build redact elements in parallel (no PyMuPDF in workers).
+ rows_with_heights = []
+ for idx, row in review_file_df.iterrows():
+ page_python_format = int(row["page"]) - 1
+ rows_with_heights.append(
+ (idx, row.to_dict(), page_heights.get(page_python_format, 0.0))
+ )
+
+ if rows_with_heights:
+ max_workers = min(MAX_WORKERS, len(rows_with_heights))
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ results = list(
+ executor.map(
+ lambda item: (
+ item[0],
+ _build_one_redact_element(item[1], item[2], date_str),
+ ),
+ rows_with_heights,
+ )
+ )
+ for _, elem in sorted(results, key=lambda x: x[0]):
+ annots.append(elem)
+
+ rough_string = tostring(xfdf_root, encoding="unicode", method="xml")
+ reparsed = defused_minidom.parseString(rough_string)
+ return reparsed.toxml() # .toprettyxml(indent=" ")
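+
+# Minimal usage sketch (file names are placeholders and page_sizes is the list of page-size
+# dicts produced earlier in the redaction flow; convert_df_to_xfdf below is the entry point
+# the app actually uses):
+#   review_df = pd.read_csv("example_review_file.csv")
+#   doc = pymupdf.open("example.pdf")
+#   xfdf_xml = create_xfdf(review_df, "example.pdf", doc, page_sizes=page_sizes)
+#   with open("example_adobe.xfdf", "w", encoding="utf-8") as f:
+#       f.write(xfdf_xml)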
+
+
+def convert_df_to_xfdf(
+ input_files: List[str],
+ pdf_doc: Document,
+ image_paths: List[str],
+ output_folder: str = OUTPUT_FOLDER,
+ document_cropboxes: List = list(),
+ page_sizes: List[dict] = list(),
+):
+ """
+ Load in files to convert a review file into an Adobe comment file format
+ """
+ output_paths = list()
+ pdf_name = ""
+ file_path_name = ""
+
+ if isinstance(input_files, str):
+ file_paths_list = [input_files]
+ else:
+ file_paths_list = input_files
+
+ # Sort the file paths so that the pdfs come first
+ file_paths_list = sorted(
+ file_paths_list,
+ key=lambda x: (
+ os.path.splitext(x)[1] != ".pdf",
+ os.path.splitext(x)[1] != ".json",
+ ),
+ )
+
+ for file in file_paths_list:
+
+ if isinstance(file, str):
+ file_path = file
+ else:
+ file_path = file.name
+
+ file_path_name = get_file_name_without_type(file_path)
+ file_path_end = detect_file_type(file_path)
+
+ if file_path_end == "pdf":
+ pdf_name = os.path.basename(file_path)
+
+ if file_path_end == "csv" and "review_file" in file_path_name:
+ # If no pdf name, just get the name of the file path
+ if not pdf_name:
+ pdf_name = file_path_name
+ # Read CSV file
+ review_file_df = pd.read_csv(file_path)
+
+ # Replace NaN in review file with an empty string
+ if "text" in review_file_df.columns:
+ review_file_df["text"] = review_file_df["text"].fillna("")
+ if "label" in review_file_df.columns:
+ review_file_df["label"] = review_file_df["label"].fillna("")
+
+ xfdf_content = create_xfdf(
+ review_file_df,
+ pdf_name,
+ pdf_doc,
+ image_paths,
+ document_cropboxes,
+ page_sizes,
+ )
+
+ # Split output_folder (trusted base) from filename (untrusted)
+ secure_file_write(
+ output_folder,
+ file_path_name + "_adobe.xfdf",
+ xfdf_content,
+ encoding="utf-8",
+ )
+
+ # Reconstruct the full path for logging purposes
+ output_path = output_folder + file_path_name + "_adobe.xfdf"
+
+ output_paths.append(output_path)
+
+ return output_paths
+
+
+### Convert xfdf coordinates back to image for app
+
+
+def convert_adobe_coords_to_image(
+ pdf_page_width: float,
+ pdf_page_height: float,
+ image_width: float,
+ image_height: float,
+ x1: float,
+ y1: float,
+ x2: float,
+ y2: float,
+):
+ """
+ Converts coordinates from Adobe PDF space to image space.
+
+ Parameters:
+ - pdf_page_width: Width of the PDF page
+ - pdf_page_height: Height of the PDF page
+ - image_width: Width of the source image
+ - image_height: Height of the source image
+ - x1, y1, x2, y2: Coordinates in Adobe PDF space
+
+ Returns:
+ - Tuple of converted coordinates (x1, y1, x2, y2) in image space
+ """
+
+ # Calculate scaling factors
+ scale_width = image_width / pdf_page_width
+ scale_height = image_height / pdf_page_height
+
+ # Convert coordinates
+ image_x1 = x1 * scale_width
+ image_x2 = x2 * scale_width
+
+ # Convert Y coordinates (flip vertical axis)
+ # Adobe coordinates start from bottom-left
+ image_y1 = (pdf_page_height - y1) * scale_height
+ image_y2 = (pdf_page_height - y2) * scale_height
+
+ # Make sure y1 is always less than y2 for image's coordinate system
+ if image_y1 > image_y2:
+ image_y1, image_y2 = image_y2, image_y1
+
+ return image_x1, image_y1, image_x2, image_y2
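+
+# Illustrative example (sizes assumed): this is the inverse of convert_image_coords_to_adobe
+# above, so the Adobe-space box from that example maps back to the original image-space box:
+#   convert_adobe_coords_to_image(612, 792, 1224, 1584, 50, 592, 150, 692)
+#   -> (100.0, 200.0, 300.0, 400.0)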
+
+
+def parse_xfdf(xfdf_path: str):
+ """
+ Parse the XFDF file and extract redaction annotations.
+
+ Parameters:
+ - xfdf_path: Path to the XFDF file
+
+ Returns:
+ - List of dictionaries containing redaction information
+ """
+ # Assuming xfdf_path is a file path. If you are passing the XML string,
+ # you would use defused_etree.fromstring(xfdf_string) instead of .parse()
+ tree = defused_etree.parse(xfdf_path)
+ root = tree.getroot()
+
+ # Define the namespace
+ namespace = {"xfdf": "http://ns.adobe.com/xfdf/"}
+
+ redactions = list()
+
+ # Find all redact elements using the namespace
+ for redact in root.findall(".//xfdf:redact", namespaces=namespace):
+
+ # Extract text from contents-richtext if it exists
+ text_content = ""
+
+        # Use the xfdf namespace when searching for the contents-richtext element
+ contents_richtext = redact.find(
+ ".//xfdf:contents-richtext", namespaces=namespace
+ )
+
+ if contents_richtext is not None:
+ # Get all text content from the HTML structure
+ # The children of contents-richtext (body, p, span) have a different namespace
+ # but itertext() cleverly handles that for us.
+ text_content = "".join(contents_richtext.itertext()).strip()
+
+ # Fallback to contents attribute if no richtext content
+ if not text_content:
+ text_content = redact.get("contents", "")
+
+ redaction_info = {
+ "image": "", # Image will be filled in later
+ "page": int(redact.get("page")) + 1, # Convert to 1-based index
+ "xmin": float(redact.get("rect").split(",")[0]),
+ "ymin": float(redact.get("rect").split(",")[1]),
+ "xmax": float(redact.get("rect").split(",")[2]),
+ "ymax": float(redact.get("rect").split(",")[3]),
+ "label": redact.get("title"),
+ "text": text_content, # Use the extracted text content
+ "color": redact.get(
+ "border-color", "(0, 0, 0)"
+ ), # Default to black if not specified
+ }
+ redactions.append(redaction_info)
+
+ return redactions
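+
+# Minimal usage sketch (path is a placeholder): each redaction is returned as a plain dict, so
+# the result drops straight into a DataFrame, as convert_xfdf_to_dataframe below does:
+#   redactions = parse_xfdf("example_adobe.xfdf")
+#   df = pd.DataFrame(redactions)  # columns: image, page, xmin, ymin, xmax, ymax, label, text, color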
+
+
+def convert_xfdf_to_dataframe(
+ file_paths_list: List[str],
+ pymupdf_doc: Document,
+ image_paths: List[str],
+ output_folder: str = OUTPUT_FOLDER,
+ input_folder: str = INPUT_FOLDER,
+):
+ """
+ Convert redaction annotations from XFDF and associated images into a DataFrame.
+
+ Parameters:
+    - file_paths_list: List of input file paths (the original PDF and the .xfdf file)
+    - pymupdf_doc: PyMuPDF document object
+    - image_paths: List of image file paths corresponding to PDF pages
+    - output_folder: Output folder for file save
+    - input_folder: Input folder for image creation
+
+    Returns:
+    - List of output file paths, including the generated review file CSV
+ """
+ output_paths = list()
+ df = pd.DataFrame()
+ pdf_name = ""
+ pdf_path = ""
+
+ # Sort the file paths so that the pdfs come first
+ file_paths_list = sorted(
+ file_paths_list,
+ key=lambda x: (
+ os.path.splitext(x)[1] != ".pdf",
+ os.path.splitext(x)[1] != ".json",
+ ),
+ )
+
+ for file in file_paths_list:
+
+ if isinstance(file, str):
+ file_path = file
+ else:
+ file_path = file.name
+
+ file_path_name = get_file_name_without_type(file_path)
+ file_path_end = detect_file_type(file_path)
+
+ if file_path_end == "pdf":
+ pdf_name = os.path.basename(file_path)
+ pdf_path = file_path
+
+ # Add pdf to outputs
+ output_paths.append(file_path)
+
+ if file_path_end == "xfdf":
+
+ if not pdf_name:
+ message = "Original PDF needed to convert from .xfdf format"
+ print(message)
+ raise ValueError(message)
+ xfdf_path = file
+
+ file_path_name = get_file_name_without_type(xfdf_path)
+
+ # Parse the XFDF file
+ redactions = parse_xfdf(xfdf_path)
+
+ # Create a DataFrame from the redaction information
+ df = pd.DataFrame(redactions)
+
+ df.fillna("", inplace=True) # Replace NaN with an empty string
+
+            for idx, row in df.iterrows():
+ page_python_format = int(row["page"]) - 1
+
+ pymupdf_page = pymupdf_doc.load_page(page_python_format)
+
+ pdf_page_height = pymupdf_page.rect.height
+ pdf_page_width = pymupdf_page.rect.width
+
+ image_path = image_paths[page_python_format]
+
+ if isinstance(image_path, str):
+ try:
+ image = Image.open(image_path)
+ except Exception:
+ page_num, out_path, width, height = (
+ process_single_page_for_image_conversion(
+ pdf_path, page_python_format, input_folder=input_folder
+ )
+ )
+
+ image = Image.open(out_path)
+
+ image_page_width, image_page_height = image.size
+
+ # Convert to image coordinates
+ image_x1, image_y1, image_x2, image_y2 = convert_adobe_coords_to_image(
+ pdf_page_width,
+ pdf_page_height,
+ image_page_width,
+ image_page_height,
+ row["xmin"],
+ row["ymin"],
+ row["xmax"],
+ row["ymax"],
+ )
+
+                df.loc[idx, ["xmin", "ymin", "xmax", "ymax"]] = [
+ image_x1,
+ image_y1,
+ image_x2,
+ image_y2,
+ ]
+
+ # Optionally, you can add the image path or other relevant information
+                df.loc[idx, "image"] = image_path
+
+ out_file_path = output_folder + file_path_name + "_review_file.csv"
+ df.to_csv(out_file_path, index=None)
+
+ output_paths.append(out_file_path)
+
+ gr.Info(
+ f"Review file saved to {out_file_path}. Now click on '1. Upload original pdf' to view the pdf with the annotations."
+ )
+
+ return output_paths
diff --git a/tools/run_vlm.py b/tools/run_vlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a9fe95bdf21de4cd4080a589b143a132738a07a
--- /dev/null
+++ b/tools/run_vlm.py
@@ -0,0 +1,1431 @@
+import os
+import sys
+import time
+from threading import Lock, Thread
+
+import gradio as gr
+import spaces
+from PIL import Image
+
+from tools.config import (
+ ADD_VLM_BOUNDING_BOX_RULES,
+ CLOUD_VLM_MODEL_CHOICE,
+ DEFAULT_INFERENCE_SERVER_VLM_MODEL,
+ LOAD_PADDLE_AT_STARTUP,
+ LOAD_TRANSFORMERS_VLM_MODEL_AT_START,
+ MAX_INPUT_TOKEN_LENGTH,
+ MAX_NEW_TOKENS,
+ MAX_SPACES_GPU_RUN_TIME,
+ MAX_WORKERS,
+ PADDLE_DET_DB_UNCLIP_RATIO,
+ PADDLE_FONT_PATH,
+ PADDLE_MODEL_PATH,
+ PADDLE_USE_TEXTLINE_ORIENTATION,
+ QUANTISE_VLM_MODELS,
+ REPORT_VLM_OUTPUTS_TO_GUI,
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL,
+ SHOW_BEDROCK_VLM_MODELS,
+ SHOW_INFERENCE_SERVER_VLM_OPTIONS,
+ SHOW_VLM_MODEL_OPTIONS,
+ USE_FLASH_ATTENTION,
+ VLM_DEFAULT_DO_SAMPLE,
+ VLM_DEFAULT_MIN_P,
+ VLM_DEFAULT_PRESENCE_PENALTY,
+ VLM_DEFAULT_REPETITION_PENALTY,
+ VLM_DEFAULT_STREAM,
+ VLM_DEFAULT_TEMPERATURE,
+ VLM_DEFAULT_TOP_K,
+ VLM_DEFAULT_TOP_P,
+ VLM_DISABLE_QWEN3_5_THINKING,
+ VLM_MAX_IMAGE_SIZE,
+ VLM_MIN_IMAGE_SIZE,
+ VLM_QWEN3_5_NOTHINK_SUFFIX,
+ VLM_SEED,
+)
+from tools.helper_functions import get_system_font_path
+
+text_read_default_prompt = """Read the main line of text in the image, and return JSON with keys "text" (string) and "conf" (number 0–1) for confidence in your identification, e.g. {"text": "read text", "conf": 0.95}. Do not include any other keys in the JSON. Ignore any words that are not part of the main line of text closest to the center of the image. Ensure that spaces between words and upper/lower cases are preserved. If you can't read the text, return an empty string ""."""
+
+if LOAD_PADDLE_AT_STARTUP:
+ # Set PaddleOCR environment variables BEFORE importing PaddleOCR
+ # This ensures fonts are configured before the package loads
+
+ # Set PaddleOCR model directory environment variable (only if specified).
+ if PADDLE_MODEL_PATH and PADDLE_MODEL_PATH.strip():
+ os.environ["PADDLEOCR_MODEL_DIR"] = PADDLE_MODEL_PATH
+ print(f"Setting PaddleOCR model path to: {PADDLE_MODEL_PATH}")
+ else:
+ print("Using default PaddleOCR model storage location")
+
+ # Set PaddleOCR font path to use system fonts instead of downloading simfang.ttf/PingFang-SC-Regular.ttf
+ # This MUST be set before importing PaddleOCR to prevent font downloads
+ if (
+ PADDLE_FONT_PATH
+ and PADDLE_FONT_PATH.strip()
+ and os.path.exists(PADDLE_FONT_PATH)
+ ):
+ os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = PADDLE_FONT_PATH
+ print(f"Setting PaddleOCR font path to configured font: {PADDLE_FONT_PATH}")
+ else:
+ system_font_path = get_system_font_path()
+ if system_font_path:
+ os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = system_font_path
+ print(f"Setting PaddleOCR font path to system font: {system_font_path}")
+ else:
+ print(
+ "Warning: No suitable system font found. PaddleOCR may download default fonts."
+ )
+
+ try:
+ from paddleocr import PaddleOCR
+
+ print("PaddleOCR imported successfully")
+
+ paddle_kwargs = None
+
+ # Default paddle configuration if none provided
+ if paddle_kwargs is None:
+ paddle_kwargs = {
+ "det_db_unclip_ratio": PADDLE_DET_DB_UNCLIP_RATIO,
+ "use_textline_orientation": PADDLE_USE_TEXTLINE_ORIENTATION,
+ "use_doc_orientation_classify": False,
+ "use_doc_unwarping": False,
+ "lang": "en",
+ }
+ else:
+ # Enforce language if not explicitly provided
+ paddle_kwargs.setdefault("lang", "en")
+
+ try:
+ PaddleOCR(**paddle_kwargs)
+ except Exception as e:
+ # Handle DLL loading errors (common on Windows with GPU version)
+ if (
+ "WinError 127" in str(e)
+ or "could not be found" in str(e).lower()
+ or "dll" in str(e).lower()
+ ):
+ print(
+ f"Warning: GPU initialization failed (likely missing CUDA/cuDNN dependencies): {e}"
+ )
+ print("PaddleOCR will not be available. To fix GPU issues:")
+ print("1. Install Visual C++ Redistributables (latest version)")
+ print("2. Ensure CUDA runtime libraries are in your PATH")
+ print(
+ "3. Or reinstall paddlepaddle CPU version: pip install paddlepaddle"
+ )
+ raise ImportError(
+ f"Error initializing PaddleOCR: {e}. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
+ )
+ else:
+ raise e
+
+ except ImportError:
+ PaddleOCR = None
+ print(
+ "PaddleOCR not found. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
+ )
+
+
+# Module-level refs to loaded VLM model/processor (set when SHOW_VLM_MODEL_OPTIONS and model is loaded). Used by LLM entity detection when USE_TRANSFORMERS_VLM_MODEL_AS_LLM.
+_loaded_vlm_model = None
+_loaded_vlm_processor = None
+
+# Define module-level defaults for model parameters (always available for import)
+# These will be overridden inside the SHOW_VLM_MODEL_OPTIONS block if enabled
+model_default_prompt = text_read_default_prompt
+model_default_do_sample = (
+ VLM_DEFAULT_DO_SAMPLE if VLM_DEFAULT_DO_SAMPLE is not None else True
+)
+model_default_top_p = VLM_DEFAULT_TOP_P if VLM_DEFAULT_TOP_P is not None else None
+model_default_min_p = VLM_DEFAULT_MIN_P if VLM_DEFAULT_MIN_P is not None else None
+model_default_top_k = VLM_DEFAULT_TOP_K if VLM_DEFAULT_TOP_K is not None else None
+model_default_temperature = (
+ VLM_DEFAULT_TEMPERATURE if VLM_DEFAULT_TEMPERATURE is not None else None
+)
+model_default_repetition_penalty = (
+ VLM_DEFAULT_REPETITION_PENALTY
+ if VLM_DEFAULT_REPETITION_PENALTY is not None
+ else None
+)
+model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
+model_default_max_new_tokens = int(MAX_NEW_TOKENS)
+model_default_seed = VLM_SEED if VLM_SEED is not None else None
+
+_load_vlm_weights_fn = None
+_vlm_load_lock = Lock()
+_transformers_vlm_weights_loaded = False
+
+
+def ensure_transformers_vlm_loaded():
+ """Load local transformers VLM weights once (thread-safe). No-op if VLM options disabled."""
+ global _transformers_vlm_weights_loaded
+ if not SHOW_VLM_MODEL_OPTIONS:
+ return
+ if _transformers_vlm_weights_loaded:
+ return
+ fn = _load_vlm_weights_fn
+ if fn is None:
+ return
+ with _vlm_load_lock:
+ if _transformers_vlm_weights_loaded:
+ return
+ fn()
+ _transformers_vlm_weights_loaded = True
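+
+# Note: callers can invoke ensure_transformers_vlm_loaded() lazily before the first VLM
+# inference; repeated calls are cheap because the loaded flag is checked before the lock is
+# taken (and again inside it), a standard double-checked locking pattern.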
+
+
+if SHOW_VLM_MODEL_OPTIONS is True:
+ import torch
+ from huggingface_hub import snapshot_download
+ from transformers import (
+ AutoConfig,
+ BitsAndBytesConfig,
+ TextIteratorStreamer,
+ )
+
+ from tools.config import (
+ MAX_INPUT_TOKEN_LENGTH,
+ MAX_NEW_TOKENS,
+ MODEL_CACHE_PATH,
+ OVERRIDE_VLM_REPO_ID,
+ QUANTISE_VLM_MODELS,
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL,
+ USE_FLASH_ATTENTION,
+ VLM_DEFAULT_DO_SAMPLE,
+ VLM_DEFAULT_MIN_P,
+ VLM_DEFAULT_PRESENCE_PENALTY,
+ VLM_DEFAULT_REPETITION_PENALTY,
+ VLM_DEFAULT_TEMPERATURE,
+ VLM_DEFAULT_TOP_K,
+ VLM_DEFAULT_TOP_P,
+ VLM_SEED,
+ )
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ print("torch.__version__ =", torch.__version__)
+ # print("torch.version.cuda =", torch.version.cuda)
+ print("cuda available:", torch.cuda.is_available())
+ # print("cuda device count:", torch.cuda.device_count())
+ if torch.cuda.is_available():
+ # print("current device:", torch.cuda.current_device())
+ print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
+
+ # print("Using device:", device)
+
+ CACHE_PATH = MODEL_CACHE_PATH
+ if not os.path.exists(CACHE_PATH):
+ os.makedirs(CACHE_PATH)
+
+ # Initialize model and processor variables
+ processor = None
+ model = None
+
+ # Initialize model-specific generation parameters (will be set by specific models if needed)
+ # If config values are provided, use them; otherwise leave as None to use model defaults
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = (
+ VLM_DEFAULT_DO_SAMPLE if VLM_DEFAULT_DO_SAMPLE is not None else None
+ )
+ model_default_top_p = VLM_DEFAULT_TOP_P if VLM_DEFAULT_TOP_P is not None else None
+ model_default_min_p = VLM_DEFAULT_MIN_P if VLM_DEFAULT_MIN_P is not None else None
+ model_default_top_k = VLM_DEFAULT_TOP_K if VLM_DEFAULT_TOP_K is not None else None
+ model_default_temperature = (
+ VLM_DEFAULT_TEMPERATURE if VLM_DEFAULT_TEMPERATURE is not None else None
+ )
+ model_default_repetition_penalty = (
+ VLM_DEFAULT_REPETITION_PENALTY
+ if VLM_DEFAULT_REPETITION_PENALTY is not None
+ else None
+ )
+ model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
+ model_default_max_new_tokens = int(MAX_NEW_TOKENS)
+ # Track which models support presence_penalty (only Qwen3-VL models currently)
+ model_supports_presence_penalty = False
+ model_default_seed = VLM_SEED if VLM_SEED is not None else None
+
+ if USE_FLASH_ATTENTION is True:
+ attn_implementation = "flash_attention_2"
+ else:
+ attn_implementation = "eager"
+
+ # Setup quantisation config if enabled
+ quantization_config = None
+ if QUANTISE_VLM_MODELS is True:
+ if not torch.cuda.is_available():
+ print(
+ "Warning: 4-bit quantisation requires CUDA, but CUDA is not available."
+ )
+ print("Falling back to loading models without quantisation")
+ quantization_config = None
+ else:
+ try:
+ quantization_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=torch.float16,
+ bnb_4bit_use_double_quant=True,
+ bnb_4bit_quant_type="nf4",
+ )
+ print("Created quantization config for bitsandbytes")
+ except Exception as e:
+ print(f"Warning: Could not setup bitsandbytes quantization: {e}")
+ print("Falling back to loading models without quantization")
+ quantization_config = None
+
+ def _get_vlm_config_capped_length(model_id):
+ """Load model config with max_position_embeddings capped to MAX_INPUT_TOKEN_LENGTH to reduce VRAM (KV cache)."""
+ config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+ cap = MAX_INPUT_TOKEN_LENGTH
+ if getattr(config, "max_position_embeddings", None) is not None:
+ if config.max_position_embeddings > cap:
+ config.max_position_embeddings = cap
+ if getattr(config, "text_config", None) is not None:
+ tc = config.text_config
+ if (
+ getattr(tc, "max_position_embeddings", None) is not None
+ and tc.max_position_embeddings > cap
+ ):
+ tc.max_position_embeddings = cap
+ return config
+
+ def _load_vlm_weights_and_finalize():
+ global model, processor, _loaded_vlm_model, _loaded_vlm_processor
+ global model_default_prompt, model_default_do_sample, model_default_top_p, model_default_min_p, model_default_top_k
+ global model_default_temperature, model_default_repetition_penalty, model_default_presence_penalty
+ global model_default_max_new_tokens, model_default_seed, model_supports_presence_penalty
+
+ # print(f"Loading vision model: {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}")
+
+ # Load only the selected model based on configuration
+ if SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Nanonets-OCR2-3B":
+ MODEL_ID = "nanonets/Nanonets-OCR2-3B"
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ load_kwargs["device_map"] = "auto"
+ else:
+ load_kwargs["torch_dtype"] = torch.float16
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ ).eval()
+ if quantization_config is None:
+ model = model.to(device)
+
+ model_default_prompt = text_read_default_prompt
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Dots.OCR":
+ # Download and patch Dots.OCR model
+ model_path_d_local = snapshot_download(
+ repo_id="rednote-hilab/dots.ocr",
+ local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
+ max_workers=MAX_WORKERS,
+ local_dir_use_symlinks=False,
+ )
+
+ config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
+
+ if os.path.exists(config_file_path):
+ with open(config_file_path, "r") as f:
+ input_code = f.read()
+
+ lines = input_code.splitlines()
+ if "class DotsVLProcessor" in input_code and not any(
+ "attributes = " in line for line in lines
+ ):
+ output_lines = []
+ for line in lines:
+ output_lines.append(line)
+ if line.strip().startswith("class DotsVLProcessor"):
+ output_lines.append(
+ ' attributes = ["image_processor", "tokenizer"]'
+ )
+
+ with open(config_file_path, "w") as f:
+ f.write("\n".join(output_lines))
+ print("Patched configuration_dots.py successfully.")
+
+ sys.path.append(model_path_d_local)
+
+ MODEL_ID = model_path_d_local
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ from transformers import AutoModelForCausalLM, AutoProcessor
+
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["torch_dtype"] = torch.bfloat16
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
+
+ model_default_prompt = text_read_default_prompt
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "PaddleOCR-VL":
+ MODEL_ID = "PaddlePaddle/PaddleOCR-VL"
+ from transformers import AutoModelForCausalLM, AutoProcessor
+
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ load_kwargs = {
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ load_kwargs["device_map"] = "auto"
+ else:
+ load_kwargs["torch_dtype"] = torch.bfloat16
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
+ if quantization_config is None:
+ model = model.to(device)
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+ model_default_prompt = """OCR:"""
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+
+ ###
+ # QWEN 3-VL MODELS
+ ###
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-2B-Instruct":
+ MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"
+ from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3VLForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ ).eval()
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-4B-Instruct":
+ MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
+ from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3VLForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ ).eval()
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-8B-Instruct":
+ MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
+ from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3VLForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ ).eval()
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-32B-Instruct":
+ MODEL_ID = "Qwen/Qwen3-VL-32B-Instruct"
+ from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3VLForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ ).eval()
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-30B-A3B-Instruct":
+ MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
+
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ ).eval()
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-235B-A22B-Instruct-FP8":
+ MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct-FP8"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
+
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ ).eval()
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ ###
+ # QWEN 3.5 MODELS
+ ###
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-0.8B":
+ from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
+
+ MODEL_ID = "Qwen/Qwen3.5-0.8B"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+
+ model = Qwen3_5ForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ )
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-2B":
+ from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
+
+ MODEL_ID = "Qwen/Qwen3.5-2B"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+
+ model = Qwen3_5ForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ )
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-4B":
+ from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
+
+ MODEL_ID = "Qwen/Qwen3.5-4B"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3_5ForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ )
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-9B":
+ from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
+
+ MODEL_ID = "Qwen/Qwen3.5-9B"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3_5ForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ )
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-27B":
+ from transformers import (
+ AutoProcessor,
+ Qwen3_5ForConditionalGeneration,
+ )
+
+ MODEL_ID = "Qwen/Qwen3.5-27B"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3_5ForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ )
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-27B-bnb-4bit":
+ from transformers import (
+ AutoProcessor,
+ Qwen3_5ForConditionalGeneration,
+ )
+
+ MODEL_ID = "bertbobson/Qwen3.5-27B-bnb-4bit"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3_5ForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ )
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-35B-A3B":
+ from transformers import (
+ AutoProcessor,
+ Qwen3_5MoeForConditionalGeneration,
+ )
+
+ MODEL_ID = "Qwen/Qwen3.5-35B-A3B"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3_5MoeForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ )
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-122B-A10B":
+ from transformers import (
+ AutoProcessor,
+ Qwen3_5MoeForConditionalGeneration,
+ )
+
+ MODEL_ID = "Qwen/Qwen3.5-122B-A10B"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3_5MoeForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ )
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-397B-A17B":
+ from transformers import (
+ AutoProcessor,
+ Qwen3_5MoeForConditionalGeneration,
+ )
+
+ MODEL_ID = "Qwen/Qwen3.5-397B-A17B"
+ if OVERRIDE_VLM_REPO_ID:
+ MODEL_ID = OVERRIDE_VLM_REPO_ID
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ load_kwargs = {
+ "attn_implementation": attn_implementation,
+ "device_map": "auto",
+ "trust_remote_code": True,
+ "config": _get_vlm_config_capped_length(MODEL_ID),
+ }
+ if quantization_config is not None:
+ load_kwargs["quantization_config"] = quantization_config
+ else:
+ load_kwargs["dtype"] = "auto"
+ model = Qwen3_5MoeForConditionalGeneration.from_pretrained(
+ MODEL_ID, **load_kwargs
+ )
+
+ model_default_prompt = text_read_default_prompt
+ model_default_do_sample = model_default_do_sample
+ model_default_top_p = 0.8
+ model_default_min_p = 0.0
+ model_default_top_k = 20
+ model_default_temperature = 0.7
+ model_default_repetition_penalty = 1.0
+ model_default_presence_penalty = 1.5
+ model_default_max_new_tokens = MAX_NEW_TOKENS
+ model_supports_presence_penalty = (
+ False # I found that this doesn't work when using transformers
+ )
+
+ elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "None":
+ model = None
+ processor = None
+
+ else:
+ raise ValueError(
+ f"Invalid model selected: {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}. Valid options are shown in tools/run_vlm.py, or None"
+ )
+
+ # Override model defaults with user-provided config values if they are set
+ # Priority: user config value > model default
+ if VLM_DEFAULT_DO_SAMPLE is not None:
+ model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
+ if VLM_DEFAULT_TOP_P is not None:
+ model_default_top_p = VLM_DEFAULT_TOP_P
+ if VLM_DEFAULT_MIN_P is not None:
+ model_default_min_p = VLM_DEFAULT_MIN_P
+ if VLM_DEFAULT_TOP_K is not None:
+ model_default_top_k = VLM_DEFAULT_TOP_K
+ if VLM_DEFAULT_TEMPERATURE is not None:
+ model_default_temperature = VLM_DEFAULT_TEMPERATURE
+ if VLM_DEFAULT_REPETITION_PENALTY is not None:
+ model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
+ if VLM_DEFAULT_PRESENCE_PENALTY is not None:
+ model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
+ if VLM_SEED is not None:
+ model_default_seed = VLM_SEED
+
+ # Cap processor tokenizer to config max context length so all tokenization respects MAX_INPUT_TOKEN_LENGTH
+ if processor is not None:
+ tokenizer = getattr(processor, "tokenizer", None)
+ if tokenizer is not None and hasattr(tokenizer, "model_max_length"):
+ current_max = tokenizer.model_max_length
+ if current_max is None or current_max == float("inf"):
+ tokenizer.model_max_length = MAX_INPUT_TOKEN_LENGTH
+ elif current_max > MAX_INPUT_TOKEN_LENGTH:
+ tokenizer.model_max_length = MAX_INPUT_TOKEN_LENGTH
+ # Log effective VLM context cap so env (e.g. MAX_INPUT_TOKEN_LENGTH=4096) can be verified
+ _ref_ctx = 32768
+ _reserve = 1024
+ _eff_max = min(
+ VLM_MAX_IMAGE_SIZE,
+ (VLM_MAX_IMAGE_SIZE * max(0, MAX_INPUT_TOKEN_LENGTH - _reserve) // _ref_ctx)
+ // 1024
+ * 1024,
+ )
+ _abs_min = 65536
+ effective_max_pixels_at_load = max(_abs_min, _eff_max)
+ effective_min_pixels_at_load = min(
+ VLM_MIN_IMAGE_SIZE, effective_max_pixels_at_load
+ )
+
+ if SHOW_VLM_MODEL_OPTIONS:
+ print(
+ f"VLM context cap: MAX_INPUT_TOKEN_LENGTH={MAX_INPUT_TOKEN_LENGTH}, "
+ f"effective max_pixels={effective_max_pixels_at_load}, min_pixels={effective_min_pixels_at_load} "
+ f"(VLM_MAX_IMAGE_SIZE={VLM_MAX_IMAGE_SIZE}, VLM_MIN_IMAGE_SIZE={VLM_MIN_IMAGE_SIZE})"
+ )
+
+        # Store on module-level variables (declared global above) for USE_TRANSFORMERS_VLM_MODEL_AS_LLM
+ _loaded_vlm_model = model
+ _loaded_vlm_processor = processor
+
+ _load_vlm_weights_fn = _load_vlm_weights_and_finalize
+
+ # print(f"Successfully loaded {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}")
+
+
+if SHOW_VLM_MODEL_OPTIONS and LOAD_TRANSFORMERS_VLM_MODEL_AT_START:
+ try:
+ ensure_transformers_vlm_loaded()
+ except Exception as e:
+ print(f"Warning: Could not load transformers VLM model at startup: {e}")
+ print("The VLM will be loaded on first use when a VLM OCR task runs.")
+
+
+def get_loaded_vlm_model_and_tokenizer():
+ """
+ Return the currently loaded VLM model and its tokenizer for use by LLM tasks (e.g. entity detection) when USE_TRANSFORMERS_VLM_MODEL_AS_LLM is True.
+ Returns (model, tokenizer) or (None, None) if the VLM has not been loaded yet.
+ """
+ global _loaded_vlm_model, _loaded_vlm_processor
+ ensure_transformers_vlm_loaded()
+ if _loaded_vlm_model is None or _loaded_vlm_processor is None:
+ return None, None
+ tokenizer = getattr(_loaded_vlm_processor, "tokenizer", _loaded_vlm_processor)
+ return _loaded_vlm_model, tokenizer
+
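+# Example usage (illustrative): reuse the already-loaded VLM for text-only LLM tasks
+# (e.g. entity detection) instead of loading a second model:
+#   vlm_model, vlm_tokenizer = get_loaded_vlm_model_and_tokenizer()
+#   if vlm_model is not None:
+#       pass  # run the LLM task with vlm_model / vlm_tokenizer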
+
+@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
+def extract_text_from_image_vlm(
+ text: str,
+ image: Image.Image,
+ max_new_tokens: int = None,
+ temperature: float = None,
+ top_p: float = None,
+ min_p: float = None,
+ top_k: int = None,
+ repetition_penalty: float = None,
+ do_sample: bool = None,
+ presence_penalty: float = None,
+ seed: int = None,
+ model_default_prompt: str = None,
+):
+ """
+ Generates responses using the configured vision model for image input.
+
+ When ``VLM_DEFAULT_STREAM`` is True (default), streams text to the console and
+    returns the full result when generation finishes. When ``VLM_DEFAULT_STREAM``
+    is False, runs a single batched ``generate`` call (no console streaming). In both
+    cases the return value is a ``(text, input_tokens, output_tokens)`` tuple.
+
+    Parameter priority is: explicit function arguments first, then model-specific
+    defaults set during model initialization, and finally sensible general fallbacks
+    if neither is available.
+
+ Args:
+ text (str): The text prompt to send to the vision model. If empty and model
+ has a default prompt, the model default will be used.
+ image (Image.Image): The PIL Image to process. Must not be None.
+        max_new_tokens (int, optional): Maximum number of new tokens to generate.
+            Defaults to the model-specific value, falling back to MAX_NEW_TOKENS from config.
+        temperature (float, optional): Sampling temperature for generation.
+            Defaults to the model-specific value (0.7 for Qwen3-VL models), falling back to 0.1.
+        top_p (float, optional): Nucleus sampling parameter (top-p).
+            Defaults to the model-specific value (0.8 for Qwen3-VL models), falling back to 0.8.
+        min_p (float, optional): Minimum probability threshold for token sampling.
+            Defaults to the model-specific value, falling back to 0.0.
+        top_k (int, optional): Top-k sampling parameter.
+            Defaults to the model-specific value (20 for Qwen3-VL models), falling back to 20.
+        repetition_penalty (float, optional): Penalty for token repetition.
+            Defaults to the model-specific value (1.0 for Qwen3-VL models), falling back to 1.0.
+        do_sample (bool, optional): If True, use sampling; if False, use greedy decoding.
+            If None, falls back to the model default (or config override), and finally
+            to True (sampling).
+ presence_penalty (float, optional): Penalty for token presence.
+ Defaults to model-specific value (1.5 for Qwen3-VL models) or None.
+ Note: Not all models support this parameter.
+ seed (int, optional): Random seed for generation. If None, uses VLM_SEED
+ from config if set, otherwise no seed is set (non-deterministic).
+ model_default_prompt (str, optional): The default prompt to use if no text is provided.
+            Defaults to the model-specific default prompt, falling back to "Read all the text in the image."
+
+ Returns:
+ Tuple[str, int, int]: The complete generated text response, input tokens (estimated), output tokens (estimated).
+ """
+ if image is None:
+ return "Please upload an image.", 0, 0
+
+ if not SHOW_VLM_MODEL_OPTIONS:
+ return (
+ "Local transformers VLM is not enabled (SHOW_VLM_MODEL_OPTIONS=False).",
+ 0,
+ 0,
+ )
+
+ ensure_transformers_vlm_loaded()
+ if model is None or processor is None:
+ return (
+ "No local transformers VLM is loaded. Check SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL "
+ "or prior load errors (e.g. set LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True to load at startup).",
+ 0,
+ 0,
+ )
+
+    # Determine parameter values with priority: function argument (if not None) > model default > general default
+
+ # Text/prompt handling
+ if text and text.strip():
+ actual_text = text
+ elif model_default_prompt is not None:
+ actual_text = model_default_prompt
+ else:
+ actual_text = "Read all the text in the image." # General default
+
+ # max_new_tokens: function arg > model default > general default
+ if max_new_tokens is not None:
+ actual_max_new_tokens = max_new_tokens
+ elif model_default_max_new_tokens is not None:
+ actual_max_new_tokens = model_default_max_new_tokens
+ else:
+ actual_max_new_tokens = MAX_NEW_TOKENS # General default (from config)
+
+ # temperature: function arg > model default (which may include config override)
+ if temperature is not None:
+ actual_temperature = temperature
+ elif model_default_temperature is not None:
+ actual_temperature = model_default_temperature
+ else:
+ # Fallback to a sensible default if neither function arg nor model default is set
+ actual_temperature = 0.1
+
+ # top_p: function arg > model default (which may include config override)
+ if top_p is not None:
+ actual_top_p = top_p
+ elif model_default_top_p is not None:
+ actual_top_p = model_default_top_p
+ else:
+ # Fallback to a sensible default if neither function arg nor model default is set
+ actual_top_p = 0.8
+
+ # min_p: function arg > model default (which may include config override)
+ if min_p is not None:
+ actual_min_p = min_p
+ elif model_default_min_p is not None:
+ actual_min_p = model_default_min_p
+ else:
+ # Fallback to a sensible default if neither function arg nor model default is set
+ actual_min_p = 0.0
+
+ # top_k: function arg > model default (which may include config override)
+ if top_k is not None:
+ actual_top_k = top_k
+ elif model_default_top_k is not None:
+ actual_top_k = model_default_top_k
+ else:
+ # Fallback to a sensible default if neither function arg nor model default is set
+ actual_top_k = 20
+
+ # repetition_penalty: function arg > model default (which may include config override)
+ if repetition_penalty is not None:
+ actual_repetition_penalty = repetition_penalty
+ elif model_default_repetition_penalty is not None:
+ actual_repetition_penalty = model_default_repetition_penalty
+ else:
+ # Fallback to a sensible default if neither function arg nor model default is set
+ actual_repetition_penalty = 1.0
+
+ # do_sample: function arg > model default (which may include config override)
+ if do_sample is not None:
+ actual_do_sample = do_sample
+ elif model_default_do_sample is not None:
+ actual_do_sample = model_default_do_sample
+ else:
+ # Fallback to a sensible default if neither function arg nor model default is set
+ actual_do_sample = True
+
+ # presence_penalty: function arg > model default (which may include config override) > None
+ actual_presence_penalty = None
+ if presence_penalty is not None:
+ actual_presence_penalty = presence_penalty
+ elif model_default_presence_penalty is not None:
+ actual_presence_penalty = model_default_presence_penalty
+
+ # seed: function arg > model default (which may include config override)
+ actual_seed = None
+ if seed is not None:
+ actual_seed = seed
+ elif model_default_seed is not None:
+ actual_seed = model_default_seed
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": actual_text},
+ ],
+ }
+ ]
+ # Build prompt: when disabling Qwen3.5 thinking we append after the generation
+ # prompt so the model sees it and continues with the answer (avoids continue_final_message
+ # which can fail when the chat template does not include the final assistant message in the
+ # rendered string).
+ prompt_full = processor.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+ if VLM_DISABLE_QWEN3_5_THINKING:
+ prompt_full = prompt_full + VLM_QWEN3_5_NOTHINK_SUFFIX
+
+ # Cap max_pixels so image tokens + text fit within MAX_INPUT_TOKEN_LENGTH (image token count scales with resolution).
+ # Reserve ~1k tokens for prompt; allow max_pixels below VLM_MIN_IMAGE_SIZE when context is small to avoid VRAM spike.
+ _ref_context = 32768
+ _reserve_text = 1024
+ _effective_max_pixels = min(
+ VLM_MAX_IMAGE_SIZE,
+ (
+ VLM_MAX_IMAGE_SIZE
+ * max(0, MAX_INPUT_TOKEN_LENGTH - _reserve_text)
+ // _ref_context
+ )
+ // 1024
+ * 1024,
+ )
+ _absolute_min_pixels = 65536 # 256*256 so image remains usable
+ effective_max_pixels = max(_absolute_min_pixels, _effective_max_pixels)
+ # Don't force upscaling above our cap: min_pixels must not exceed max_pixels
+ effective_min_pixels = min(VLM_MIN_IMAGE_SIZE, effective_max_pixels)
+
+ inputs = processor(
+ text=[prompt_full],
+ images=[image],
+ return_tensors="pt",
+ padding=True,
+ min_pixels=effective_min_pixels,
+ max_pixels=effective_max_pixels,
+ truncation=True,
+ max_length=MAX_INPUT_TOKEN_LENGTH,
+ ).to(device)
+
+ use_stream = VLM_DEFAULT_STREAM if VLM_DEFAULT_STREAM is not None else True
+
+ # Set random seed if specified
+ if actual_seed is not None:
+ torch.manual_seed(actual_seed)
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed_all(actual_seed)
+
+ # Build generation kwargs with resolved parameters
+ generation_kwargs = {
+ **inputs,
+ "max_new_tokens": actual_max_new_tokens,
+ "do_sample": actual_do_sample,
+ "temperature": actual_temperature,
+ "top_p": actual_top_p,
+ "min_p": actual_min_p,
+ "top_k": actual_top_k,
+ "repetition_penalty": actual_repetition_penalty,
+ }
+
+ # Add presence_penalty if it's set and the model supports it
+ # Only Qwen3-VL models currently support presence_penalty
+ if actual_presence_penalty is not None and model_supports_presence_penalty:
+ generation_kwargs["presence_penalty"] = actual_presence_penalty
+
+ start_time = time.time()
+ buffer = ""
+
+ if use_stream:
+ streamer = TextIteratorStreamer(
+ processor, skip_prompt=True, skip_special_tokens=True
+ )
+ generation_kwargs["streamer"] = streamer
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
+ thread.start()
+
+ line_buffer = "" # Accumulate text for the current line
+ for new_text in streamer:
+ buffer += new_text
+ buffer = buffer.replace("<|im_end|>", "")
+ line_buffer += new_text
+
+ # Print to console as it streams
+ print(new_text, end="", flush=True)
+
+ # If we hit a newline, report the entire accumulated line to GUI
+ if REPORT_VLM_OUTPUTS_TO_GUI and "\n" in new_text:
+ # Split by newline to handle the line(s) we just completed
+ parts = line_buffer.split("\n")
+ # Report all complete lines (everything except the last part which may be incomplete)
+ for line in parts[:-1]:
+ if line.strip(): # Only report non-empty lines
+ gr.Info(line, duration=2)
+ # Keep the last part (after the last newline) for the next line
+ line_buffer = parts[-1] if parts else ""
+
+ # Print final newline after streaming is complete
+ print() # Add newline at the end
+ else:
+ with torch.inference_mode():
+ output_ids = model.generate(**generation_kwargs)
+ prompt_len = inputs["input_ids"].shape[1]
+ new_token_ids = output_ids[:, prompt_len:]
+ buffer = processor.batch_decode(new_token_ids, skip_special_tokens=True)[0]
+ buffer = buffer.replace("<|im_end|>", "")
+ if REPORT_VLM_OUTPUTS_TO_GUI and buffer.strip():
+ for line in buffer.split("\n"):
+ if line.strip():
+ gr.Info(line, duration=2)
+
+ end_time = time.time()
+
+ # Estimate token usage for local models
+ # For local transformers models, we can estimate using the tokenizer if available
+ input_tokens = 0
+ output_tokens = 0
+ try:
+ if (
+ processor
+ and hasattr(processor, "tokenizer")
+ and processor.tokenizer is not None
+ ):
+ # Estimate input tokens from prompt and image
+ # Note: Vision models encode images differently, so this is an approximation
+ prompt_tokens = len(
+ processor.tokenizer.encode(actual_text, add_special_tokens=False)
+ )
+ # Rough estimate: assume image tokens are proportional to image size
+ # This is a rough approximation - actual vision tokenization is more complex
+ image_tokens_estimate = (
+ image.size[0] * image.size[1]
+ ) // 1000 # Rough estimate
+ input_tokens = prompt_tokens + image_tokens_estimate
+
+ # Estimate output tokens from generated text
+ output_tokens = len(
+ processor.tokenizer.encode(buffer, add_special_tokens=False)
+ )
+ except Exception:
+ # If token counting fails, use rough word-based estimates
+ input_tokens = len(actual_text.split()) * 2 # Rough estimate
+ output_tokens = len(buffer.split()) * 2 # Rough estimate
+
+ duration = end_time - start_time
+ tokens_per_second = output_tokens / duration if duration > 0 else 0
+
+ print("\n--- Performance ---")
+ print(f"Time taken: {duration:.2f} seconds")
+ print(f"Generated tokens: {output_tokens}")
+ print(f"Tokens per second: {tokens_per_second:.2f}")
+
+ # Return the complete text and token estimates
+ return buffer, input_tokens, output_tokens
+
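+# Example usage (illustrative; `page_image` is assumed to be a PIL.Image loaded elsewhere):
+#   ocr_text, input_tokens, output_tokens = extract_text_from_image_vlm(
+#       text="Read all the text in the image.",
+#       image=page_image,
+#       max_new_tokens=512,
+#   )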
+
+# Optionally, give some more guidance on bounding box coordinates
+if ADD_VLM_BOUNDING_BOX_RULES:
+    # Qwen models don't need the additional bounding box guidance, as they were already trained with this 0-999 coordinate convention
+ if (
+ (
+ "qwen" in str(SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL).lower()
+ and SHOW_VLM_MODEL_OPTIONS
+ )
+ or (
+ "qwen" in str(DEFAULT_INFERENCE_SERVER_VLM_MODEL).lower()
+ and SHOW_INFERENCE_SERVER_VLM_OPTIONS
+ )
+ or ("qwen" in str(CLOUD_VLM_MODEL_CHOICE).lower() and SHOW_BEDROCK_VLM_MODELS)
+ ):
+ additional_bounding_box_rules = ""
+ else:
+ additional_bounding_box_rules = "\n- Bounding boxes should fit within the coordinate extents of the image: 0, 0 is the top left corner of the image, and 999, 999 is the bottom right corner of the image"
+else:
+ additional_bounding_box_rules = ""
+
+full_page_ocr_vlm_prompt = f"""Spot all the text in the image at line-level, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': 'identified text', 'conf': 'confidence score 0-1.0'}}, ...].
+
+IMPORTANT: Extract each horizontal line of text separately. Do NOT combine multiple lines into paragraphs. Each line that appears on a separate horizontal row in the image should be a separate entry.
+
+Rules:
+- Each line must be on a separate horizontal row in the image
+- Even if a sentence is split over multiple horizontal lines, it should be split into separate entries (one per line)
+- If text spans multiple horizontal lines, split it into separate entries (one per line)
+- The text should not contain any formatting tags unless they are explicitly written in the text (e.g. the text is html or markdown)
+- Do NOT combine lines that appear on different horizontal rows
+- Each bounding box should tightly fit around a single horizontal line of text{additional_bounding_box_rules}
+- Empty lines should be skipped
+- Use keys bbox, text, and conf; 'conf' must be a numeric confidence from 0-1
+
+
+# Only return valid JSON, no additional text or explanation."""
+
+full_page_ocr_people_vlm_prompt = f"""Spot all photos of people's faces in the image, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': '[FACE]', 'conf': 'confidence score 0-1.0'}}, ...].
+
+Rules:
+- If there are no photos of people's faces in the image, return an empty JSON array []
+- If you are not confident that the detected object is a photo of a person's face, do not include it in the results. Only return results for objects that are clearly photos of people's faces. If in doubt, do not include it in the results.
+- For successful results, only return bbox, text, and conf keys. Do not include any other keys in the JSON.
+- Each identified photo of a person's face with high confidence should be a separate JSON entry
+- Only include photos of people's faces in the results, not a drawing or sketch
+- Bounding boxes around an identified person's face should completely cover the person's face{additional_bounding_box_rules}
+- 'text' must be exactly the string '[FACE]' (no other wording)
+- 'conf' should be a numeric confidence from 0-1
+- Do NOT include any other text or information in the JSON
+
+
+# Only return valid JSON, no additional text or explanation."""
+
+full_page_ocr_signature_vlm_prompt = f"""Spot all handwritten signatures in the image, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': '[SIGNATURE]', 'conf': 'confidence score 0-1.0'}}, ...].
+
+Rules:
+- If there are no handwritten signatures in the image, return an empty JSON array []
+- If you are not confident that the detected object is a handwritten signature, do not include it in the results. Only return results for objects that are clearly handwritten signatures. If in doubt, do not include it in the results.
+- For successful results, only return bbox, text, and conf keys. Do not include any other keys in the JSON.
+- Each identified handwritten signature with high confidence should be a separate JSON entry
+- Bounding boxes around an identified handwritten signature should completely cover the signature{additional_bounding_box_rules}
+- 'text' must be exactly the string '[SIGNATURE]' (no other wording)
+- 'conf' should be a numeric confidence from 0-1
+- Do NOT include any other text or information in the JSON.
+
+# Only return valid JSON, no additional text or explanation."""
+
+# Experimental word-level OCR prompt for VLMs - makes some mistakes but is generally usable
+full_page_ocr_vlm_words_prompt = f"""Spot all the text in the image at word-level, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': 'identified word', 'conf': 'confidence score 0-1.0'}}, ...].
+
+IMPORTANT: Extract each word in the image separately. Do NOT combine words into longer fragments, sentences, or paragraphs. Each entry must correspond to a single, individual word as visually separated in the image.
+
+Rules:
+- Each entry should correspond to a single distinct word (not groups of words, not whole lines)
+- For each word, provide a tight bounding box [x1, y1, x2, y2] around just that word{additional_bounding_box_rules}
+- For successful results, only return bbox, text, and conf keys. Do not include any other keys in the JSON.
+- Do not merge words. Do not split words into letters. Only return one entry per word
+- Maintain the order of words as they appear spatially from top to bottom, left to right
+- Skip any empty or whitespace-only entries
+- Do not include extraneous text, explanations, or formatting beyond the required JSON
+
+Only return valid JSON, no additional text or explanation."""
diff --git a/tools/secure_path_utils.py b/tools/secure_path_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b7f60ed68934c0fd023a4b9c99184041b8e3db4
--- /dev/null
+++ b/tools/secure_path_utils.py
@@ -0,0 +1,418 @@
+"""
+Secure path utilities to prevent path injection attacks.
+
+This module provides secure alternatives to os.path operations that validate
+and sanitize file paths to prevent directory traversal and other path-based attacks.
+"""
+
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Optional, Union
+
+logger = logging.getLogger(__name__)
+
+
+def sanitize_filename(filename: str, max_length: int = 255) -> str:
+ """
+ Sanitize a filename to prevent path injection attacks.
+
+ Args:
+ filename: The filename to sanitize
+ max_length: Maximum length of the sanitized filename
+
+ Returns:
+ A sanitized filename safe for use in file operations
+
+ Raises:
+ ValueError: If the filename cannot be sanitized safely
+ """
+ if not filename or not isinstance(filename, str):
+ raise ValueError("Filename must be a non-empty string")
+
+ # Remove any path separators and normalize
+ filename = os.path.basename(filename)
+
+ # Remove or replace dangerous characters
+ # Keep alphanumeric, dots, hyphens, underscores, spaces, parentheses, brackets, and other safe chars
+ # Only remove truly dangerous characters like path separators and control chars
+ sanitized = re.sub(r'[<>:"|?*\x00-\x1f]', "_", filename)
+
+ # Remove multiple consecutive dots (except for file extensions)
+ sanitized = re.sub(r"\.{2,}", ".", sanitized)
+
+ # Remove leading/trailing dots and spaces
+ sanitized = sanitized.strip(". ")
+
+ # Ensure it's not empty after sanitization
+ if not sanitized:
+ sanitized = "sanitized_file"
+
+ # Truncate if too long, preserving extension
+ if len(sanitized) > max_length:
+ name, ext = os.path.splitext(sanitized)
+ max_name_length = max_length - len(ext)
+ sanitized = name[:max_name_length] + ext
+
+ return sanitized
+
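+# Illustrative examples of sanitize_filename behaviour:
+#   sanitize_filename("../../etc/passwd")   -> "passwd" (path components stripped)
+#   sanitize_filename("my report?.pdf")     -> "my report_.pdf" (unsafe chars replaced)
+#   sanitize_filename("archive....tar.gz")  -> "archive.tar.gz" (dot runs collapsed)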
+
+def secure_path_join(base_path: Union[str, Path], *path_parts: str) -> Path:
+ """
+ Safely join paths while preventing directory traversal attacks.
+
+ Args:
+ base_path: The base directory path
+ *path_parts: Additional path components to join
+
+ Returns:
+ A Path object representing the safe joined path
+
+ Raises:
+ ValueError: If any path component contains dangerous characters
+ PermissionError: If the resulting path would escape the base directory
+ """
+ base_path = Path(base_path).resolve()
+
+ # Sanitize each path part - only sanitize if it contains dangerous patterns
+ sanitized_parts = []
+ for part in path_parts:
+ if not part:
+ continue
+ # Only sanitize if the part contains dangerous patterns
+ if re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', part):
+ sanitized_part = sanitize_filename(part)
+ else:
+ sanitized_part = part
+ sanitized_parts.append(sanitized_part)
+
+ # Join the paths
+ result_path = base_path
+ for part in sanitized_parts:
+ result_path = result_path / part
+
+ # Resolve the final path
+ result_path = result_path.resolve()
+
+ # Security check: ensure the result is within the base directory
+ try:
+ result_path.relative_to(base_path)
+ except ValueError:
+ raise PermissionError(f"Path would escape base directory: {result_path}")
+
+ return result_path
+
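+# Illustrative examples of secure_path_join behaviour (POSIX paths, no symlinks assumed):
+#   secure_path_join("/data/output", "reports", "2024.csv")
+#       -> Path("/data/output/reports/2024.csv")
+#   secure_path_join("/data/output", "../../etc/passwd")
+#       -> Path("/data/output/passwd") (traversal component sanitised away)
+#   secure_path_join("/data/output", "/etc/passwd")
+#       -> raises PermissionError (absolute path would escape the base directory)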
+
+def secure_file_write(
+ base_path: Union[str, Path],
+ filename: str,
+ content: str,
+ mode: str = "w",
+ encoding: Optional[str] = None,
+ **kwargs,
+) -> None:
+ """
+ Safely write content to a file within a base directory with path validation.
+
+ Args:
+ base_path: The base directory under which to write the file
+ filename: The target file name or relative path (untrusted)
+ content: The content to write
+ mode: File open mode (default: 'w')
+ encoding: Text encoding (default: None for binary mode)
+ **kwargs: Additional arguments for open()
+ """
+ # Use secure_path_join to ensure the final path is within base_path and to sanitize filename
+ file_path = secure_path_join(base_path, filename)
+
+ # Ensure the parent directory exists AFTER joining and securing the final path
+ file_path.parent.mkdir(parents=True, exist_ok=True)
+
+ # Write the file
+ open_kwargs = {"mode": mode}
+ if encoding:
+ open_kwargs["encoding"] = encoding
+ open_kwargs.update(kwargs)
+
+ with open(file_path, **open_kwargs) as f:
+ f.write(content)
+
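+# Example usage (illustrative; `csv_text` stands in for any string content): write a
+# report into an output directory, with the untrusted filename confined to that directory:
+#   secure_file_write("output", "user_upload_report.csv", csv_text, encoding="utf-8")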
+
+def secure_file_read(
+ base_path: Union[str, Path],
+ filename: str,
+ mode: str = "r",
+ encoding: Optional[str] = None,
+ **kwargs,
+) -> str:
+ """
+ Safely read content from a file within a base directory with path validation.
+
+ Args:
+ base_path: The base directory under which to read the file
+ filename: The target file name or relative path (untrusted)
+ mode: File open mode (default: 'r')
+ encoding: Text encoding (default: None for binary mode)
+ **kwargs: Additional arguments for open()
+
+ Returns:
+ The file content
+ """
+ # Use secure_path_join to ensure the final path is within base_path and to sanitize filename
+ file_path = secure_path_join(base_path, filename)
+
+ # Validate the path exists and is a file
+ if not file_path.exists():
+ raise FileNotFoundError(f"File not found: {file_path}")
+
+ if not file_path.is_file():
+ raise ValueError(f"Path is not a file: {file_path}")
+
+ # Read the file
+ open_kwargs = {"mode": mode}
+ if encoding:
+ open_kwargs["encoding"] = encoding
+ open_kwargs.update(kwargs)
+
+ with open(file_path, **open_kwargs) as f:
+ return f.read()
+
+
+def validate_path_safety(
+ path: Union[str, Path], base_path: Optional[Union[str, Path]] = None
+) -> bool:
+ """
+ Validate that a path is safe and doesn't contain dangerous patterns.
+
+ Args:
+ path: The path to validate
+ base_path: Optional base path to check against
+
+ Returns:
+ True if the path is safe, False otherwise
+ """
+ try:
+ path = Path(path)
+
+ # Check for dangerous patterns
+ path_str = str(path)
+
+ # Check for directory traversal patterns
+ dangerous_patterns = [
+ "..", # Parent directory
+ "//", # Double slashes
+ ]
+
+ # Only check for backslashes on non-Windows systems
+ if os.name != "nt": # 'nt' is Windows
+ dangerous_patterns.append("\\") # Backslashes (on Unix systems)
+
+ for pattern in dangerous_patterns:
+ if pattern in path_str:
+ return False
+
+ # If base path is provided, ensure the path is within it.
+ # Do not call Path.resolve() (or join Path objects) on untrusted input — CodeQL
+ # py/path-injection; use normpath + commonpath containment instead.
+ if base_path:
+ base_norm = os.path.normpath(os.path.abspath(str(base_path)))
+ user_norm = os.path.normpath(path_str)
+ if os.path.isabs(user_norm):
+ candidate = os.path.normpath(os.path.abspath(user_norm))
+ else:
+ candidate = os.path.normpath(os.path.join(base_norm, user_norm))
+ try:
+ common = os.path.commonpath([candidate, base_norm])
+ except ValueError:
+ return False
+ if common != base_norm:
+ return False
+
+ return True
+
+ except Exception:
+ return False
+
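+# Illustrative examples of validate_path_safety (POSIX paths):
+#   validate_path_safety("output/logs/run.txt")                   -> True
+#   validate_path_safety("../secrets.txt")                        -> False (contains "..")
+#   validate_path_safety("run.txt", base_path="/srv/app/output")  -> True (contained in base)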
+
+def validate_path_containment(
+ path: Union[str, Path], base_path: Union[str, Path]
+) -> bool:
+ """
+ Robustly validate that a path is strictly contained within a base directory.
+ Uses os.path.commonpath for more reliable containment checking.
+ Also allows test directories and example files for testing scenarios.
+
+ Args:
+ path: The path to validate
+ base_path: The trusted base directory
+
+ Returns:
+ True if the path is strictly contained within base_path, False otherwise
+ """
+ try:
+ # Normalize both paths to absolute paths
+ normalized_path = os.path.normpath(os.path.abspath(str(path)))
+ normalized_base = os.path.normpath(os.path.abspath(str(base_path)))
+
+ # Allow test directories and example files - check if path is a test/example directory
+ path_str = str(normalized_path).lower()
+ if any(
+ test_pattern in path_str
+ for test_pattern in [
+ "test_output_",
+ "temp",
+ "tmp",
+ "test_",
+ "_test",
+ "example_data",
+ "examples",
+ ]
+ ):
+ # For test directories and example files, allow them if they're in system temp directories
+ # or if they contain test/example-related patterns
+ import tempfile
+
+ temp_dir = tempfile.gettempdir().lower()
+ if temp_dir in path_str or "test" in path_str or "example" in path_str:
+ return True
+
+ # Ensure the base path exists and is a directory
+ if not os.path.exists(normalized_base) or not os.path.isdir(normalized_base):
+ return False
+
+ # Check if the path exists and is a file (not a directory)
+ if not os.path.exists(normalized_path) or not os.path.isfile(normalized_path):
+ return False
+
+ # Use commonpath to check containment
+ try:
+ common_path = os.path.commonpath([normalized_path, normalized_base])
+ # The common path must be exactly the base path for strict containment
+ return common_path == normalized_base
+ except ValueError:
+ # commonpath raises ValueError if paths are on different drives (Windows)
+ return False
+
+ except Exception:
+ return False
+
+
+def validate_folder_containment(
+ path: Union[str, Path], base_path: Union[str, Path]
+) -> bool:
+ """
+ Robustly validate that a folder path is strictly contained within a base directory.
+ Uses os.path.commonpath for more reliable containment checking.
+ Also allows test directories for testing scenarios.
+
+ Args:
+ path: The folder path to validate
+ base_path: The trusted base directory
+
+ Returns:
+ True if the folder path is strictly contained within base_path, False otherwise
+ """
+ try:
+ # Normalize both paths to absolute paths
+ normalized_path = os.path.normpath(os.path.abspath(str(path)))
+ normalized_base = os.path.normpath(os.path.abspath(str(base_path)))
+
+ # Allow test directories and example files - check if path is a test/example directory
+ path_str = str(normalized_path).lower()
+ base_str = str(normalized_base).lower()
+
+ # Check if this is a test scenario
+ is_test_path = any(
+ test_pattern in path_str
+ for test_pattern in [
+ "test_output_",
+ "temp",
+ "tmp",
+ "test_",
+ "_test",
+ "example_data",
+ "examples",
+ ]
+ )
+
+ # Check if this is a test base path
+ is_test_base = any(
+ test_pattern in base_str
+ for test_pattern in [
+ "test_output_",
+ "temp",
+ "tmp",
+ "test_",
+ "_test",
+ "example_data",
+ "examples",
+ ]
+ )
+
+ # For test scenarios, be more permissive
+ if is_test_path or is_test_base:
+ return True
+
+ # Ensure the base path exists and is a directory
+ if not os.path.exists(normalized_base) or not os.path.isdir(normalized_base):
+ return False
+
+ # Use commonpath to check containment
+ try:
+ common_path = os.path.commonpath([normalized_path, normalized_base])
+ # The common path must be exactly the base path for strict containment
+ result = common_path == normalized_base
+ return result
+ except ValueError:
+ # commonpath raises ValueError if paths are on different drives (Windows)
+ return False
+
+ except Exception as e:
+ print(f"Error validating folder containment: {e}")
+ return False
+
+
+# Backward compatibility functions that maintain the same interface as os.path
+def secure_join(*paths: str) -> str:
+ """
+ Secure alternative to os.path.join that prevents path injection.
+
+ Args:
+ *paths: Path components to join
+
+ Returns:
+ A safe joined path string
+ """
+ if not paths:
+ return ""
+
+ # Use the first path as base, others as components
+ base_path = Path(paths[0])
+ path_parts = paths[1:]
+
+ # Only use secure_path_join if there are potentially dangerous patterns
+ if any(re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', part) for part in path_parts):
+ result_path = secure_path_join(base_path, *path_parts)
+ return str(result_path)
+ else:
+ # Use normal path joining for safe paths
+ return str(Path(*paths))
+
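+# Example usage (illustrative, POSIX paths):
+#   secure_join("output", "redactions", "page_1.json")  -> "output/redactions/page_1.json"
+# Components containing traversal or unsafe characters are routed through
+# secure_path_join / sanitize_filename instead of being joined verbatim.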
+
+def secure_basename(path: str) -> str:
+ """
+ Secure alternative to os.path.basename that sanitizes the result.
+
+ Args:
+ path: The path to get the basename from
+
+ Returns:
+ A sanitized basename
+ """
+ basename = os.path.basename(path)
+ # Only sanitize if the basename contains dangerous patterns
+ if re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', basename):
+ return sanitize_filename(basename)
+ else:
+ return basename
diff --git a/tools/secure_regex_utils.py b/tools/secure_regex_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..db5832e00c91672b65c796f44fe02e3dfaced346
--- /dev/null
+++ b/tools/secure_regex_utils.py
@@ -0,0 +1,297 @@
+"""
+Secure regex utilities to prevent ReDoS (Regular Expression Denial of Service) attacks.
+
+This module provides safe alternatives to common regex patterns that can cause
+catastrophic backtracking and performance issues.
+"""
+
+import re
+from typing import List, Optional
+
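+# Background note (illustrative): ReDoS arises when a pattern can backtrack
+# explosively on crafted input, e.g. nested quantifiers such as r"(\d+\.?)+seconds"
+# applied to a long run of digits that is never followed by "seconds". The patterns
+# below avoid this by using bounded, non-nested quantifiers and anchors (\b, $).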
+
+def safe_extract_numbers_with_seconds(text: str) -> List[float]:
+ """
+ Safely extract numbers before 'seconds' from text without ReDoS vulnerability.
+
+ Args:
+ text: The text to search for numbers followed by 'seconds'
+
+ Returns:
+ List of float numbers found before 'seconds'
+ """
+ if not text or not isinstance(text, str):
+ return []
+
+ # Use a more specific pattern that avoids catastrophic backtracking
+ # Look for digits, optional decimal part, optional whitespace, then 'seconds'
+ pattern = r"\b(\d+(?:\.\d+)?)\s*seconds\b"
+
+ matches = re.findall(pattern, text)
+ try:
+ return [float(match) for match in matches]
+ except (ValueError, TypeError):
+ return []
+
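+# Example (illustrative): safe_extract_numbers_with_seconds("OCR took 2.5 seconds, export 3 seconds")
+# returns [2.5, 3.0].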
+
+def safe_extract_numbers(text: str) -> List[float]:
+ """
+ Safely extract all numbers from text without ReDoS vulnerability.
+
+ Args:
+ text: The text to extract numbers from
+
+ Returns:
+ List of float numbers found in the text
+ """
+ if not text or not isinstance(text, str):
+ return []
+
+ # Use a simple, safe pattern that doesn't cause backtracking
+ # Match digits, optional decimal point and more digits
+ pattern = r"\b\d+(?:\.\d+)?\b"
+
+ matches = re.findall(pattern, text)
+ try:
+ return [float(match) for match in matches]
+ except (ValueError, TypeError):
+ return []
+
+
+def safe_extract_page_number_from_filename(filename: str) -> Optional[int]:
+ """
+ Safely extract page number from filename ending with .png.
+
+ Args:
+ filename: The filename to extract page number from
+
+ Returns:
+ Page number if found, None otherwise
+ """
+ if not filename or not isinstance(filename, str):
+ return None
+
+ # Use a more specific, secure pattern that avoids potential ReDoS
+ # Match 1-10 digits followed by .png at the end of string
+ pattern = r"(\d{1,10})\.png$"
+ match = re.search(pattern, filename)
+
+ if match:
+ try:
+ return int(match.group(1))
+ except (ValueError, TypeError):
+ return None
+
+ return None
+
+
+def safe_extract_page_number_from_path(path: str) -> Optional[int]:
+ """
+ Safely extract page number from path containing _(\\d+).png pattern.
+
+ Args:
+ path: The path to extract page number from
+
+ Returns:
+ Page number if found, None otherwise
+ """
+ if not path or not isinstance(path, str):
+ return None
+
+ # Use a more specific, secure pattern that avoids potential ReDoS
+ # Match underscore followed by 1-10 digits and .png at the end
+ pattern = r"_(\d{1,10})\.png$"
+ match = re.search(pattern, path)
+
+ if match:
+ try:
+ return int(match.group(1))
+ except (ValueError, TypeError):
+ return None
+
+ return None
+
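+# Examples (illustrative): safe_extract_page_number_from_filename("scan_12.png") -> 12;
+# safe_extract_page_number_from_path("output/report_page_0003.png") -> 3.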
+
+def safe_clean_text(text: str, remove_html: bool = True) -> str:
+ """
+ Safely clean text without ReDoS vulnerability.
+
+ Args:
+ text: The text to clean
+ remove_html: Whether to remove HTML tags
+
+ Returns:
+ Cleaned text
+ """
+ if not text or not isinstance(text, str):
+ return ""
+
+ cleaned = text
+
+ if remove_html:
+ # Use a simple pattern that doesn't cause backtracking
+ cleaned = re.sub(r"<[^>]*>", "", cleaned)
+
+ # Clean up whitespace
+ cleaned = re.sub(r"\s+", " ", cleaned).strip()
+
+ return cleaned
+
+
+def safe_extract_rgb_values(text: str) -> Optional[tuple]:
+ """
+ Safely extract RGB values from text like "(255, 255, 255)".
+
+ Args:
+ text: The text to extract RGB values from
+
+ Returns:
+ Tuple of (r, g, b) values if found, None otherwise
+ """
+ if not text or not isinstance(text, str):
+ return None
+
+ # Use a simple, safe pattern
+ pattern = r"\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)"
+ match = re.match(pattern, text.strip())
+
+ if match:
+ try:
+ r = int(match.group(1))
+ g = int(match.group(2))
+ b = int(match.group(3))
+
+ # Validate RGB values
+ if 0 <= r <= 255 and 0 <= g <= 255 and 0 <= b <= 255:
+ return (r, g, b)
+ except (ValueError, TypeError):
+ pass
+
+ return None
+
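+# Example (illustrative): safe_extract_rgb_values("(255, 128, 0)") -> (255, 128, 0);
+# out-of-range values such as "(300, 0, 0)" return None.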
+
+def safe_split_filename(filename: str, delimiters: List[str]) -> List[str]:
+ """
+ Safely split filename by delimiters without ReDoS vulnerability.
+
+ Args:
+ filename: The filename to split
+ delimiters: List of delimiter patterns to split on
+
+ Returns:
+ List of filename parts
+ """
+ if not filename or not isinstance(filename, str):
+ return []
+
+ if not delimiters:
+ return [filename]
+
+ # Escape special regex characters in delimiters
+ escaped_delimiters = [re.escape(delim) for delim in delimiters]
+
+ # Create a safe pattern
+ pattern = "|".join(escaped_delimiters)
+
+ try:
+ return re.split(pattern, filename)
+ except re.error:
+ # Fallback to simple string operations if regex fails
+ result = [filename]
+ for delim in delimiters:
+ new_result = []
+ for part in result:
+ new_result.extend(part.split(delim))
+ result = new_result
+ return result
+
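+# Example (illustrative): safe_split_filename("report_v2-final.pdf", ["_", "-", "."])
+# returns ["report", "v2", "final", "pdf"].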
+
+def safe_remove_leading_newlines(text: str) -> str:
+ """
+ Safely remove leading newlines without ReDoS vulnerability.
+
+ Args:
+ text: The text to clean
+
+ Returns:
+ Text with leading newlines removed and surrounding whitespace stripped
+ """
+ if not text or not isinstance(text, str):
+ return ""
+
+ # Use a simple pattern
+ return re.sub(r"^\n+", "", text).strip()
+
+
+def safe_remove_non_ascii(text: str) -> str:
+ """
+ Safely remove non-ASCII characters without ReDoS vulnerability.
+
+ Args:
+ text: The text to clean
+
+ Returns:
+ Text with non-ASCII characters removed
+ """
+ if not text or not isinstance(text, str):
+ return ""
+
+ # Use a simple pattern
+ return re.sub(r"[^\x00-\x7F]", "", text)
+
+
+def safe_extract_latest_number_from_filename(filename: str) -> Optional[int]:
+ """
+ Safely extract the latest/largest number from filename without ReDoS vulnerability.
+
+ Args:
+ filename: The filename to extract number from
+
+ Returns:
+ The largest number found, or None if no numbers found
+ """
+ if not filename or not isinstance(filename, str):
+ return None
+
+ # Use a safe pattern to find all numbers (limit to reasonable length)
+ pattern = r"\d{1,10}"
+ matches = re.findall(pattern, filename)
+
+ if not matches:
+ return None
+
+ try:
+ # Convert to integers and return the maximum
+ numbers = [int(match) for match in matches]
+ return max(numbers)
+ except (ValueError, TypeError):
+ return None
+
+
+def safe_sanitize_text(text: str, replacement: str = "_", max_length: int = 255) -> str:
+ """
+ Safely sanitize text by removing dangerous characters without ReDoS vulnerability.
+
+ Args:
+ text: The text to sanitize
+ replacement: Character to replace dangerous characters with
+ max_length: Maximum length of the text
+
+ Returns:
+ Sanitized text
+ """
+ if not text or not isinstance(text, str):
+ return ""
+
+ # Use a simple pattern for dangerous characters
+ dangerous_chars = r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]'
+ sanitized = re.sub(dangerous_chars, replacement, text)
+
+ # Remove multiple consecutive replacements
+ sanitized = re.sub(f"{re.escape(replacement)}+", replacement, sanitized)
+
+ # Remove leading/trailing replacements
+ sanitized = sanitized.strip(replacement)
+
+ # Truncate to maximum length
+ sanitized = sanitized[:max_length]
+
+ return sanitized
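+
+
+# Example (illustrative): safe_sanitize_text('a<b>:c.txt') -> 'a_b_c.txt'
+# (dangerous characters replaced, consecutive replacements collapsed, edges stripped).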
diff --git a/tools/summaries.py b/tools/summaries.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7e83059a3caa4e387b75387fc7588507b509d11
--- /dev/null
+++ b/tools/summaries.py
@@ -0,0 +1,2434 @@
+import os
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Optional, Tuple
+
+import boto3
+import gradio as gr
+import markdown
+import pandas as pd
+import spaces
+from gradio import Progress as progress
+from tqdm import tqdm
+
+from tools.config import (
+ AWS_ACCESS_KEY,
+ AWS_LLM_PII_OPTION,
+ AWS_REGION,
+ AWS_SECRET_KEY,
+ CLOUD_LLM_PII_MODEL_CHOICE,
+ CLOUD_SUMMARISATION_MODEL_CHOICE,
+ DEFAULT_INFERENCE_SERVER_PII_MODEL,
+ INFERENCE_SERVER_API_URL,
+ INFERENCE_SERVER_PII_OPTION,
+ LLM_CONTEXT_LENGTH,
+ LLM_MAX_NEW_TOKENS,
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE,
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+ MAX_SPACES_GPU_RUN_TIME,
+ OUTPUT_FOLDER,
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
+ REASONING_SUFFIX,
+ RUN_AWS_FUNCTIONS,
+ SUMMARY_PAGE_GROUP_MAX_WORKERS,
+ model_name_map,
+)
+from tools.file_conversion import is_pdf, word_level_ocr_df_to_line_level_ocr_df
+from tools.helper_functions import (
+ clean_column_name,
+ create_batch_file_path_details,
+ get_file_name_no_ext,
+)
+from tools.llm_funcs import (
+ calculate_tokens_from_metadata,
+ construct_azure_client,
+ construct_gemini_generative_model,
+ load_model,
+ process_requests,
+)
+
+max_tokens = LLM_MAX_NEW_TOKENS
+reasoning_suffix = REASONING_SUFFIX
+max_text_length = 500
+
+###
+# System prompt
+###
+
+generic_system_prompt = """You are a researcher analysing a document. Use British English spelling and grammar."""
+
+system_prompt = """You are a researcher analysing a document. Use British English spelling and grammar."""
+
+markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
+
+###
+# SUMMARISE TOPICS PROMPT
+###
+
+summary_assistant_prefill = ""
+
+summarise_topic_descriptions_system_prompt = system_prompt
+
+summarise_topic_descriptions_prompt = """Your task is to make a consolidated summary of the text below. {summary_format}
+Return only the summary and no other text. Do not mention specific response numbers in the summary.{additional_summary_instructions}
+
+Text to summarise:
+{summaries}
+
+Summary:"""
+
+concise_summary_format_prompt = "Return a concise summary that summarises only the most important themes from the original text"
+
+detailed_summary_format_prompt = (
+ "Return a summary that includes as much detail as possible from the original text"
+)
+
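+# How the template above is filled (a minimal sketch; the real call happens in
+# summarise_text_chunk below, and the page text shown here is hypothetical):
+#   summarise_topic_descriptions_prompt.format(
+#       summaries="=== Page 1 ===\nMinutes of the March meeting...",
+#       summary_format=concise_summary_format_prompt,
+#       additional_summary_instructions="",
+#   )
+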
+###
+# OVERALL SUMMARY PROMPTS
+###
+
+summarise_everything_system_prompt = system_prompt
+
+summarise_everything_prompt = """Below is a table that gives an overview of the main issues related to a document.
+Your task is to summarise the text in the table below. {summary_format}. Return only the summary and no other text. Use headers and paragraphs to structure the summary where appropriate. Format the output for Excel display using: **bold text** for main headings, • bullet points for sub-items, and line breaks between sections. Avoid markdown symbols like # or ##. {additional_summary_instructions}
+
+Table to summarise:
+{topic_summary_table}
+
+Summary:"""
+
+
+def _summarisation_upload_to_paths(file_upload):
+ """Normalise Gradio file input to a list of file paths (str, list, or dict with 'name')."""
+ if not file_upload:
+ return []
+ paths = []
+ if isinstance(file_upload, str):
+ paths.append(file_upload)
+ elif isinstance(file_upload, list):
+ for item in file_upload:
+ if isinstance(item, str):
+ paths.append(item)
+ elif isinstance(item, dict):
+ paths.append(item.get("name") or item.get("path") or "")
+ elif hasattr(item, "name"):
+ paths.append(item.name)
+ elif hasattr(item, "path"):
+ paths.append(item.path)
+ elif isinstance(file_upload, dict):
+ paths.append(file_upload.get("name") or file_upload.get("path") or "")
+ elif hasattr(file_upload, "name"):
+ paths.append(file_upload.name)
+ elif hasattr(file_upload, "path"):
+ paths.append(file_upload.path)
+ return [p for p in paths if p and str(p).strip()]
+
+
+def _upload_contains_pdf(file_upload):
+ """Return True if the summarisation upload contains any PDF file."""
+ paths = _summarisation_upload_to_paths(file_upload)
+ return any(is_pdf(p) for p in paths)
+
+
+###
+# Document Summarisation Functions
+###
+def get_model_choice_from_inference_method(inference_method: str) -> str:
+ """
+ Get the default model choice for a given inference method (for summarisation).
+ Uses the default values defined in config.py (CLOUD_SUMMARISATION_MODEL_CHOICE for cloud).
+
+ Args:
+ inference_method: One of "aws-bedrock", "local", "inference-server"
+
+ Returns:
+ str: The model choice string to use
+ """
+ # Map inference method to model choice using defaults from config.py
+ if inference_method == "aws-bedrock":
+ return CLOUD_SUMMARISATION_MODEL_CHOICE
+ elif inference_method == "local":
+ return LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+ elif inference_method == "inference-server":
+ return DEFAULT_INFERENCE_SERVER_PII_MODEL
+ else:
+ raise ValueError(
+ f"Unknown inference method: {inference_method}. "
+ f"Expected one of: 'aws-bedrock', 'local', 'inference-server'"
+ )
+
+
+def get_model_source_from_model_choice(model_choice: str) -> str:
+ """
+ Determine model source from model_choice by comparing to defaults from config.py.
+ Does not check model_name_map - uses the defined defaults.
+
+ Args:
+ model_choice: The model choice string
+
+ Returns:
+ str: The model source ("AWS", "Local", or "inference-server")
+ """
+ # Compare model_choice to the default config values to determine source
+ if model_choice == LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE:
+ return "Local"
+ elif model_choice == DEFAULT_INFERENCE_SERVER_PII_MODEL:
+ return "inference-server"
+ elif (
+ model_choice == CLOUD_LLM_PII_MODEL_CHOICE
+ or model_choice == CLOUD_SUMMARISATION_MODEL_CHOICE
+ ):
+ return "AWS"
+ else:
+ # If it doesn't match any default, infer from common patterns
+ # AWS Bedrock models typically have "amazon." or "anthropic." prefix
+ if model_choice.startswith("amazon.") or model_choice.startswith("anthropic."):
+ return "AWS"
+ # Inference server models are often custom names
+ # Default to AWS for backward compatibility, but could be inference-server
+ # Since we're using defaults, assume AWS if it's not clearly local
+ return "AWS"
+
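+# Examples (illustrative): a model_choice equal to LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
+# maps to "Local"; a hypothetical ID such as "anthropic.claude-example" falls through to
+# the prefix check and maps to "AWS".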
+
+def load_csv_files_to_dataframe(file_input):
+ """
+ Load CSV files from Gradio file input and combine them into a single DataFrame.
+ Similar to how duplicate pages function handles file input.
+
+ Args:
+ file_input: Gradio file input (can be a single file, list of files, or file objects)
+
+ Returns:
+ pd.DataFrame: Combined DataFrame with columns page, line, and text
+ """
+ if not file_input:
+ return pd.DataFrame(columns=["page", "line", "text"])
+
+ # Handle different input types (similar to run_tabular_duplicate_detection)
+ file_paths = []
+ if isinstance(file_input, str):
+ file_paths.append(file_input)
+ elif isinstance(file_input, list):
+ for f_item in file_input:
+ if isinstance(f_item, str):
+ file_paths.append(f_item)
+ elif hasattr(f_item, "name"):
+ file_paths.append(f_item.name)
+ elif hasattr(file_input, "name"):
+ file_paths.append(file_input.name)
+
+ # Load and combine all CSV files
+ all_dfs = []
+ for file_path in file_paths:
+ try:
+ df = pd.read_csv(file_path)
+ # Convert word-level OCR to line-level if user uploaded word-level file
+ if "ocr_results_with_words" in os.path.basename(file_path) and (
+ "word_text" in df.columns and "text" not in df.columns
+ ):
+ df = word_level_ocr_df_to_line_level_ocr_df(df)
+ # Ensure required columns exist
+ if "page" in df.columns and "line" in df.columns and "text" in df.columns:
+ all_dfs.append(df[["page", "line", "text"]])
+ else:
+ print(
+ f"Warning: {file_path} does not have required columns (page, line, text)"
+ )
+ except Exception as e:
+ print(f"Error loading {file_path}: {e}")
+
+ if not all_dfs:
+ return pd.DataFrame(columns=["page", "line", "text"])
+
+ # Combine all DataFrames
+ combined_df = pd.concat(all_dfs, ignore_index=True)
+ return combined_df
+
+
+# Wrapper function to convert inference method to model choice
+@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
+def summarise_document_wrapper(
+ all_page_line_level_ocr_results_df,
+ output_folder,
+ summarisation_inference_method,
+ summarisation_api_key,
+ summarisation_temperature,
+ file_name,
+ summarisation_context,
+ summarisation_aws_access_key,
+ summarisation_aws_secret_key,
+ summarisation_hf_api_key,
+ summarisation_azure_endpoint,
+ summarisation_format,
+ summarisation_additional_instructions,
+ summarisation_max_pages_per_group,
+ in_summarisation_ocr_files=None,
+):
+ """
+ Wrapper to select the correct model and format for document summarization, and optionally
+ load input OCR CSV files if they are provided.
+
+ Args:
+ all_page_line_level_ocr_results_df (pd.DataFrame): Pre-loaded DataFrame containing the line-level OCR results.
+ output_folder (str): Path to folder where outputs should be saved.
+ summarisation_inference_method (str): String specifying which inference/LLM method to use ('aws-bedrock', etc).
+ summarisation_api_key (str): API key for the selected inference method, if required.
+ summarisation_temperature (float): The temperature parameter for the model (controls randomness).
+ file_name (str): Name to use as a base for output files.
+ summarisation_context (str): Additional context string to include in the summarization.
+ summarisation_aws_access_key (str): AWS access key if using AWS inference.
+ summarisation_aws_secret_key (str): AWS secret key if using AWS inference.
+ summarisation_hf_api_key (str): HuggingFace API key if required.
+ summarisation_azure_endpoint (str): Endpoint string if using Azure inference.
+ summarisation_format (str): Format for the summary output (e.g., "bullets", "structured").
+ summarisation_additional_instructions (str): Extra instructions to pass to the summarization LLM.
+ summarisation_max_pages_per_group (int): Maximum number of pages to group per LLM summarization pass.
+ in_summarisation_ocr_files (str | list | object, optional): One or more file paths or file-like objects to OCR results in CSV format.
+
+ Returns:
+ Tuple of (output_files, status_message, llm_model_name, llm_total_input_tokens, llm_total_output_tokens, summary_display_text, elapsed_seconds).
+ """
+ """Wrapper to convert inference method selection to model choice and load CSV files."""
+ # Map inference method option to inference method string
+ inference_method_map = {
+ AWS_LLM_PII_OPTION: "aws-bedrock",
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION: "local",
+ INFERENCE_SERVER_PII_OPTION: "inference-server",
+ }
+
+ inference_method = inference_method_map.get(
+ summarisation_inference_method, "aws-bedrock"
+ )
+
+ # Use config default for region
+ summarisation_aws_region = AWS_REGION
+ summarisation_api_url = INFERENCE_SERVER_API_URL
+
+ # Get model choice from inference method
+ model_choice = get_model_choice_from_inference_method(inference_method)
+
+ # Load CSV files if provided, otherwise use the dataframe
+ if in_summarisation_ocr_files:
+ ocr_df = load_csv_files_to_dataframe(in_summarisation_ocr_files)
+ else:
+ ocr_df = all_page_line_level_ocr_results_df
+
+ # If file_name is None or empty, derive it from in_summarisation_ocr_files
+ if not file_name or file_name.strip() == "":
+ if in_summarisation_ocr_files:
+ # Extract file path from in_summarisation_ocr_files (similar to load_csv_files_to_dataframe)
+ file_paths = []
+ if isinstance(in_summarisation_ocr_files, str):
+ file_paths.append(in_summarisation_ocr_files)
+ elif isinstance(in_summarisation_ocr_files, list):
+ for f_item in in_summarisation_ocr_files:
+ if isinstance(f_item, str):
+ file_paths.append(f_item)
+ elif hasattr(f_item, "name"):
+ file_paths.append(f_item.name)
+ elif hasattr(in_summarisation_ocr_files, "name"):
+ file_paths.append(in_summarisation_ocr_files.name)
+
+ # Get the first file path and extract filename prefix
+ if file_paths:
+ first_file_path = file_paths[0]
+ # Get basename without extension
+ basename = os.path.basename(first_file_path)
+ filename_without_ext, _ = os.path.splitext(basename)
+ # Take first 20 characters, removing any invalid filename characters
+ filename_prefix = filename_without_ext[:20]
+ # Remove any invalid characters for filenames
+ invalid_chars = '<>:"/\\|?*'
+ for char in invalid_chars:
+ filename_prefix = filename_prefix.replace(char, "_")
+ file_name = filename_prefix if filename_prefix else "document"
+ else:
+ file_name = "document"
+ else:
+ file_name = "document"
+
+ # Call the actual summarise_document function (timed for usage logs)
+ start_time = time.perf_counter()
+ (
+ output_files,
+ status_message,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ summary_display_text,
+ ) = summarise_document(
+ ocr_df,
+ output_folder,
+ model_choice,
+ summarisation_api_key,
+ summarisation_temperature,
+ file_name,
+ summarisation_context,
+ summarisation_aws_access_key,
+ summarisation_aws_secret_key,
+ summarisation_aws_region,
+ summarisation_hf_api_key,
+ summarisation_azure_endpoint,
+ summarisation_api_url,
+ summarisation_format,
+ summarisation_additional_instructions,
+ max_pages_per_group=summarisation_max_pages_per_group,
+ )
+ elapsed_seconds = round(time.perf_counter() - start_time, 1)
+
+ return (
+ output_files,
+ status_message,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ summary_display_text,
+ elapsed_seconds,
+ )
+
+
+def group_pages_by_context_length(
+ all_page_line_level_ocr_results_df: pd.DataFrame,
+ context_length: int = LLM_CONTEXT_LENGTH,
+ tokenizer=None,
+ model_source: str = "Local",
+ max_pages_per_group: int = 30,
+) -> List[Tuple[List[int], str]]:
+ """
+ Group pages into chunks that fit within the LLM context length.
+ Splits pages into roughly equal-sized groups (e.g. 56 pages with room for 50
+ per context -> two groups of 28, not 50 and 6). Each page is prefixed with
+ '=== Page x ==='.
+
+ Args:
+ all_page_line_level_ocr_results_df: DataFrame with columns 'page', 'line', 'text'
+ context_length: Maximum context length in tokens
+ tokenizer: Tokenizer for accurate token counting
+ model_source: Source of the model for token counting
+ max_pages_per_group: Maximum number of pages allowed in a single group
+
+ Returns:
+ List of tuples: (list of page numbers, formatted text for that group)
+ """
+ if (
+ all_page_line_level_ocr_results_df is None
+ or all_page_line_level_ocr_results_df.empty
+ ):
+ return []
+
+ # Group by page and concatenate text
+ page_texts = {}
+ for _, row in all_page_line_level_ocr_results_df.iterrows():
+ page = int(row["page"])
+ text = str(row.get("text", ""))
+ if page not in page_texts:
+ page_texts[page] = []
+ page_texts[page].append(text)
+
+ # Format each page with header and get token count per page
+ page_list = [] # (page_num, formatted_page, page_tokens)
+ for page_num in sorted(page_texts.keys()):
+ page_text = " ".join(page_texts[page_num])
+ formatted_page = f"=== Page {page_num} ===\n{page_text}"
+ page_tokens = count_tokens_in_text(formatted_page, tokenizer, model_source)
+ page_list.append((page_num, formatted_page, page_tokens))
+
+ # Reserve some tokens for the prompt template
+ reserved_tokens = 500
+ available_tokens = context_length - reserved_tokens
+
+ if not page_list:
+ return []
+
+ # Sanitise max_pages_per_group
+ try:
+ max_pages_per_group_int = int(max_pages_per_group)
+ except Exception:
+ max_pages_per_group_int = 30
+ if max_pages_per_group_int < 1:
+ max_pages_per_group_int = 1
+
+ # Step 1: Greedy pass to determine minimum number of groups by tokens
+ k_token = 0
+ cur_tokens = 0
+ for _, _, pt in page_list:
+ if cur_tokens + pt > available_tokens and cur_tokens > 0:
+ k_token += 1
+ cur_tokens = 0
+ cur_tokens += pt
+ k_token += 1 # last group
+ n = len(page_list)
+
+ # Also enforce a maximum pages-per-group cap
+ k_pages = (n + max_pages_per_group_int - 1) // max_pages_per_group_int
+
+ # Final number of groups must satisfy both token limit and max-pages limit
+ k = max(k_token, k_pages)
+
+ # Step 2: Target pages per group for roughly equal split (e.g. 56 pages, 2 groups -> 28, 28)
+ q, r = n // k, n % k
+ target_per_group = [q + 1] * r + [q] * (k - r)
+
+ # Step 3: Assign pages to groups with target sizes, respecting token limit
+ groups = []
+ page_idx = 0
+ for group_idx in range(k):
+ target = min(target_per_group[group_idx], max_pages_per_group_int)
+ current_group_pages = []
+ current_group_text = ""
+ current_tokens = 0
+ while page_idx < n and len(current_group_pages) < target:
+ page_num, formatted_page, page_tokens = page_list[page_idx]
+ if current_tokens + page_tokens > available_tokens and current_group_pages:
+ break # full by token limit; start next group
+ current_group_pages.append(page_num)
+ if current_group_text:
+ current_group_text += "\n\n" + formatted_page
+ else:
+ current_group_text = formatted_page
+ current_tokens += page_tokens
+ page_idx += 1
+ if current_group_pages:
+ groups.append((current_group_pages, current_group_text))
+
+ # Any remaining pages (e.g. group hit token limit before target) go into final group(s)
+ while page_idx < n:
+ current_group_pages = []
+ current_group_text = ""
+ current_tokens = 0
+ while page_idx < n and len(current_group_pages) < max_pages_per_group_int:
+ page_num, formatted_page, page_tokens = page_list[page_idx]
+ if current_tokens + page_tokens > available_tokens and current_group_pages:
+ break
+ # If even a single page exceeds limit, add it anyway to avoid infinite loop
+ current_group_pages.append(page_num)
+ if current_group_text:
+ current_group_text += "\n\n" + formatted_page
+ else:
+ current_group_text = formatted_page
+ current_tokens += page_tokens
+ page_idx += 1
+ if current_group_pages:
+ groups.append((current_group_pages, current_group_text))
+
+ return groups
+
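+# Worked example (illustrative): with 56 pages whose token counts allow roughly 50 pages
+# per context window and max_pages_per_group=30, the greedy pass gives k_token=2 and the
+# page cap gives k_pages=ceil(56/30)=2, so k=2 and the target split is [28, 28] rather
+# than [50, 6].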
+
+def summarise_text_chunk(
+ text_chunk: str,
+ model_choice: str,
+ in_api_key: str,
+ temperature: float,
+ context_textbox: str = "",
+ aws_access_key_textbox: str = "",
+ aws_secret_key_textbox: str = "",
+ aws_region_textbox: str = "",
+ model_name_map: dict = None,
+ hf_api_key_textbox: str = "",
+ azure_endpoint_textbox: str = "",
+ api_url: str = None,
+ reasoning_suffix: str = "",
+ local_model=None,
+ tokenizer=None,
+ assistant_model=None,
+ summarise_format_radio: str = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
+ additional_summary_instructions: str = "",
+) -> Tuple[str, str, dict]:
+ """
+ Summarise a single text chunk using the summarise_output_topics_query function.
+
+ Returns:
+ Tuple of (summary_text, full_prompt, metadata)
+ """
+ from tools.config import (
+ model_name_map as default_model_name_map,
+ )
+
+ # Note: load_model is already imported at the top of the file
+
+ if model_name_map is None:
+ model_name_map = default_model_name_map
+
+ if additional_summary_instructions:
+ additional_summary_instructions = (
+ "Important additional instructions to follow closely: "
+ + additional_summary_instructions
+ )
+
+ formatted_summary_prompt = [
+ summarise_topic_descriptions_prompt.format(
+ summaries=text_chunk,
+ summary_format=summarise_format_radio,
+ additional_summary_instructions=additional_summary_instructions,
+ )
+ ]
+
+ # Format the system prompt. Note that the current system prompt defines no
+ # placeholders, so these keyword arguments are ignored by str.format and the
+ # string is returned unchanged; they are harmless if a template later adds them.
+ formatted_system_prompt = summarise_topic_descriptions_system_prompt.format(
+ column_name="document text",
+ consultation_context=context_textbox if context_textbox else "",
+ )
+
+ # Determine model source from model_choice using defaults from config.py
+ # Does not check model_name_map - uses the defined defaults
+ model_source = get_model_source_from_model_choice(model_choice)
+
+ # Setup model based on model source
+ # Load model and tokenizer together to ensure they're from the same source
+ # This prevents mismatches that could occur if they're loaded separately
+ # Similar to llm_funcs.py pattern (lines 830-839) and llm_entity_detection.py (lines 519-533)
+ if (model_source == "Local") & (local_model is None or tokenizer is None):
+ progress(0.1, f"Using model: {LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE}")
+ # Use load_model() to ensure both are loaded atomically
+ # This is safer than calling get_pii_model() and get_pii_tokenizer() separately
+ loaded_model, loaded_tokenizer, loaded_assistant_model = load_model()
+ if local_model is None:
+ local_model = loaded_model
+ if tokenizer is None:
+ tokenizer = loaded_tokenizer
+ if assistant_model is None:
+ assistant_model = loaded_assistant_model
+
+ # Setup bedrock for AWS models
+ # Use the same approach as file_redaction.py (lines 939-969) for consistency
+ bedrock_runtime = None
+ if model_source == "AWS":
+ # Use aws_region_textbox if provided, otherwise fall back to AWS_REGION from config
+ region = aws_region_textbox if aws_region_textbox else AWS_REGION
+
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Bedrock using AWS access key and secret keys from user input."
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=region,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print("Getting Bedrock credentials from environment variables")
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=region,
+ )
+ else:
+ bedrock_runtime = None
+ out_message = "Cannot connect to AWS Bedrock service. Please provide access keys under LLM settings, or choose another model type."
+ print(out_message)
+ raise Exception(out_message)
+
+ # Note: Gemini and Azure/OpenAI clients are handled within summarise_output_topics_query
+ # via the process_requests function, so we don't need to set them up here
+ # Similar to how llm_entity_detection.py handles them (lines 554-584)
+
+ # Apply reasoning suffix if needed
+ if reasoning_suffix:
+ is_gpt_oss_model = (
+ "gpt-oss" in model_choice.lower() or "gpt_oss" in model_choice.lower()
+ )
+ if is_gpt_oss_model or ("Local" in model_source and reasoning_suffix):
+ formatted_system_prompt = formatted_system_prompt + "\n" + reasoning_suffix
+
+ # Call the summarisation function
+ try:
+ response, conversation_history, metadata, response_text = (
+ summarise_output_topics_query(
+ model_choice,
+ in_api_key,
+ temperature,
+ formatted_summary_prompt,
+ formatted_system_prompt,
+ model_source,
+ bedrock_runtime,
+ local_model if local_model else [],
+ tokenizer if tokenizer else [],
+ assistant_model if assistant_model else [],
+ azure_endpoint_textbox,
+ api_url,
+ )
+ )
+
+ full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt[0]
+ return response_text, full_prompt, metadata
+ except Exception as e:
+ print(f"Error summarising text chunk: {e}")
+ full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt[0]
+ return "", full_prompt, {}
+
+
+def recursively_summarise(
+ summaries: List[str],
+ model_choice: str,
+ in_api_key: str,
+ temperature: float,
+ context_length: int = LLM_CONTEXT_LENGTH,
+ tokenizer=None,
+ model_source: str = "Local",
+ token_accumulator=None,
+ **kwargs,
+) -> List[str]:
+ """
+ Recursively summarise summaries until they fit within context length.
+
+ Args:
+ token_accumulator: Optional list to accumulate [input_tokens, output_tokens] from metadata
+ """
+ # Check total length
+ combined_summaries = "\n\n".join(summaries)
+ total_tokens = count_tokens_in_text(combined_summaries, tokenizer, model_source)
+
+ # Reserve tokens for prompt
+ reserved_tokens = 500
+ available_tokens = context_length - reserved_tokens
+
+ if total_tokens <= available_tokens:
+ return summaries
+
+ # Need to summarise further - group summaries into chunks
+ groups = []
+ current_group = []
+ current_tokens = 0
+
+ for summary in summaries:
+ summary_tokens = count_tokens_in_text(summary, tokenizer, model_source)
+ if current_tokens + summary_tokens > available_tokens and current_group:
+ groups.append("\n\n".join(current_group))
+ current_group = [summary]
+ current_tokens = summary_tokens
+ else:
+ current_group.append(summary)
+ current_tokens += summary_tokens
+
+ if current_group:
+ groups.append("\n\n".join(current_group))
+
+ # Summarise each group
+ new_summaries = []
+ for group_text in groups:
+ # summarise_text_chunk derives its own model_source from model_choice,
+ # so model_source is not passed here (it is not an accepted parameter)
+ summary_text, _, metadata = summarise_text_chunk(
+ group_text,
+ model_choice,
+ in_api_key,
+ temperature,
+ tokenizer=tokenizer,
+ **kwargs,
+ )
+ if summary_text:
+ new_summaries.append(summary_text)
+ # Accumulate tokens if accumulator provided
+ if token_accumulator is not None and metadata:
+ # Convert metadata to string if it's a list
+ metadata_string = (
+ str(metadata) if not isinstance(metadata, str) else metadata
+ )
+ input_tokens, output_tokens, _ = calculate_tokens_from_metadata(
+ metadata_string, model_choice, model_name_map
+ )
+ token_accumulator[0] += input_tokens
+ token_accumulator[1] += output_tokens
+
+ # Recursively call if still too long
+ if len(new_summaries) > 1:
+ return recursively_summarise(
+ new_summaries,
+ model_choice,
+ in_api_key,
+ temperature,
+ context_length,
+ tokenizer,
+ model_source,
+ token_accumulator=token_accumulator,
+ **kwargs,
+ )
+
+ return new_summaries
+
+
+def summarise_document(
+ all_page_line_level_ocr_results_df: pd.DataFrame,
+ output_folder: str,
+ model_choice: str,
+ in_api_key: str,
+ temperature: float,
+ file_name: str = "document",
+ context_textbox: str = "",
+ aws_access_key_textbox: str = "",
+ aws_secret_key_textbox: str = "",
+ aws_region_textbox: str = "",
+ hf_api_key_textbox: str = "",
+ azure_endpoint_textbox: str = "",
+ api_url: str = None,
+ summarise_format_radio: str = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
+ additional_summary_instructions: str = "",
+ max_pages_per_group: int = 30,
+ summary_page_group_max_workers: Optional[int] = None,
+ progress=gr.Progress(track_tqdm=True),
+) -> Tuple[List[str], str, str, int, int, str]:
+ """
+ Main function to summarise a document from OCR results.
+
+ Args:
+ all_page_line_level_ocr_results_df (pd.DataFrame): DataFrame containing line-level OCR results.
+ output_folder (str): The folder where outputs will be saved.
+ model_choice (str): The model to use for summarization.
+ in_api_key (str): API key for the selected model/inference method.
+ temperature (float): LLM temperature hyperparameter.
+ file_name (str, optional): Name to use for the output files. Default is "document".
+ context_textbox (str, optional): Extra context for summarization. Default is "".
+ aws_access_key_textbox (str, optional): AWS access key, if using AWS. Default is "".
+ aws_secret_key_textbox (str, optional): AWS secret key, if using AWS. Default is "".
+ aws_region_textbox (str, optional): AWS region string. Default is "".
+ hf_api_key_textbox (str, optional): HuggingFace API key, if used. Default is "".
+ azure_endpoint_textbox (str, optional): Azure endpoint, if used. Default is "".
+ api_url (str, optional): API URL. Default is None.
+ summarise_format_radio (str, optional): Summary output format instructions. Default is detailed summary.
+ additional_summary_instructions (str, optional): Extra instructions for the summarization. Default is "".
+ max_pages_per_group (int, optional): Maximum number of pages to group per LLM pass. Default is 30.
+ summary_page_group_max_workers (int, optional): Number of worker threads for summarising page groups in parallel. Defaults to the SUMMARY_PAGE_GROUP_MAX_WORKERS config value when None.
+ progress (gr.Progress, optional): Gradio progress tracker. Default is Gradio Progress with tqdm.
+
+ Returns:
+ Tuple of (output_file_paths, status_message, llm_model_name, llm_total_input_tokens, llm_total_output_tokens, summary_display_text)
+ """
+ from datetime import datetime
+
+ output_files = []
+ all_prompts = []
+ all_responses = []
+ all_token_counts = (
+ []
+ ) # Store (input_tokens, output_tokens) for each prompt/response
+ page_group_page_ranges = (
+ []
+ ) # Store (min_page, max_page) for each saved prompt/response
+ page_group_summaries = []
+
+ # Initialize token tracking variables
+ llm_total_input_tokens = 0
+ llm_total_output_tokens = 0
+ llm_model_name = ""
+
+ try:
+ # Determine model source from model_choice using defaults from config.py
+ # Does not check model_name_map - uses the defined defaults
+ model_source = get_model_source_from_model_choice(model_choice)
+
+ local_model = None
+ tokenizer = None
+ assistant_model = None
+
+ # Setup model based on model source - check for Local models
+ # Load model and tokenizer together to ensure they're from the same source
+ # This prevents mismatches that could occur if they're loaded separately
+ # Similar to llm_funcs.py pattern (lines 830-839) and llm_entity_detection.py (lines 519-533)
+ if model_source == "Local":
+ if local_model is None or tokenizer is None:
+ progress(0.05, "Loading local model...")
+ # Use load_model() to ensure both are loaded atomically
+ # This is safer than calling get_pii_model() and get_pii_tokenizer() separately
+ loaded_model, loaded_tokenizer, loaded_assistant_model = load_model()
+ if local_model is None:
+ local_model = loaded_model
+ if tokenizer is None:
+ tokenizer = loaded_tokenizer
+ if assistant_model is None:
+ assistant_model = loaded_assistant_model
+
+ # Step 1: Group pages by context length
+ progress(0.1, "Grouping pages by context length...")
+ page_groups = group_pages_by_context_length(
+ all_page_line_level_ocr_results_df,
+ LLM_CONTEXT_LENGTH,
+ tokenizer,
+ model_source,
+ max_pages_per_group=max_pages_per_group,
+ )
+
+ if not page_groups:
+ return [], "No OCR results found. Please run text extraction first."
+
+ # Step 2: Summarise each page group (optionally in parallel)
+ _summary_page_group_max_workers = (
+ summary_page_group_max_workers
+ if summary_page_group_max_workers is not None
+ else SUMMARY_PAGE_GROUP_MAX_WORKERS
+ )
+ use_parallel_page_groups = (
+ _summary_page_group_max_workers > 1 and len(page_groups) > 1
+ )
+ progress(0.2, f"Summarising {len(page_groups)} page groups...")
+
+ def _summarise_one_group(args):
+ i, page_nums, group_text = args
+ summary_text, full_prompt, metadata = summarise_text_chunk(
+ group_text,
+ model_choice,
+ in_api_key,
+ temperature,
+ context_textbox=context_textbox,
+ aws_access_key_textbox=aws_access_key_textbox,
+ aws_secret_key_textbox=aws_secret_key_textbox,
+ aws_region_textbox=aws_region_textbox,
+ hf_api_key_textbox=hf_api_key_textbox,
+ azure_endpoint_textbox=azure_endpoint_textbox,
+ api_url=api_url,
+ local_model=local_model,
+ tokenizer=tokenizer,
+ assistant_model=assistant_model,
+ summarise_format_radio=summarise_format_radio,
+ additional_summary_instructions=additional_summary_instructions,
+ )
+ return (i, page_nums, summary_text, full_prompt, metadata)
+
+ if use_parallel_page_groups:
+ max_workers = min(_summary_page_group_max_workers, len(page_groups))
+ tasks = [
+ (i, page_nums, group_text)
+ for i, (page_nums, group_text) in enumerate(page_groups)
+ ]
+ results_by_index = [None] * len(page_groups)
+ pbar = tqdm(
+ total=len(page_groups),
+ unit="groups",
+ desc="Summarising page groups",
+ )
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ futures = {
+ executor.submit(_summarise_one_group, t): t[0] for t in tasks
+ }
+ completed = 0
+ for future in as_completed(futures):
+ i, page_nums, summary_text, full_prompt, metadata = future.result()
+ results_by_index[i] = (
+ page_nums,
+ summary_text,
+ full_prompt,
+ metadata,
+ )
+ completed += 1
+ pbar.update(1)
+ progress(
+ 0.2 + (completed / len(page_groups)) * 0.5,
+ f"Summarising page group {completed}/{len(page_groups)} (pages {min(page_nums)}-{max(page_nums)})...",
+ )
+ pbar.close()
+ # Build lists in page-group order
+ for i in range(len(page_groups)):
+ if results_by_index[i] is None:
+ continue
+ page_nums, summary_text, full_prompt, metadata = results_by_index[i]
+ if summary_text:
+ try:
+ min_page = int(min(page_nums)) if page_nums else 0
+ max_page = int(max(page_nums)) if page_nums else 0
+ except Exception:
+ min_page, max_page = 0, 0
+ page_group_page_ranges.append((min_page, max_page))
+ page_group_summaries.append(summary_text)
+ all_prompts.append(full_prompt)
+ all_responses.append(summary_text)
+ input_tokens, output_tokens = 0, 0
+ if metadata:
+ metadata_string = (
+ str(metadata) if not isinstance(metadata, str) else metadata
+ )
+ input_tokens, output_tokens, _ = calculate_tokens_from_metadata(
+ metadata_string, model_choice, model_name_map
+ )
+ llm_total_input_tokens += input_tokens
+ llm_total_output_tokens += output_tokens
+ if not llm_model_name and model_choice:
+ llm_model_name = model_choice
+ all_token_counts.append((input_tokens, output_tokens))
+ else:
+ seq_pbar = tqdm(
+ page_groups,
+ unit="groups",
+ desc="Summarising page groups",
+ )
+ for i, (page_nums, group_text) in enumerate(seq_pbar):
+ progress(
+ 0.2 + (i / len(page_groups)) * 0.5,
+ f"Summarising page group {i+1}/{len(page_groups)} (pages {min(page_nums)}-{max(page_nums)})...",
+ )
+ summary_text, full_prompt, metadata = summarise_text_chunk(
+ group_text,
+ model_choice,
+ in_api_key,
+ temperature,
+ context_textbox=context_textbox,
+ aws_access_key_textbox=aws_access_key_textbox,
+ aws_secret_key_textbox=aws_secret_key_textbox,
+ aws_region_textbox=aws_region_textbox,
+ hf_api_key_textbox=hf_api_key_textbox,
+ azure_endpoint_textbox=azure_endpoint_textbox,
+ api_url=api_url,
+ local_model=local_model,
+ tokenizer=tokenizer,
+ assistant_model=assistant_model,
+ summarise_format_radio=summarise_format_radio,
+ additional_summary_instructions=additional_summary_instructions,
+ )
+ if summary_text:
+ try:
+ min_page = int(min(page_nums)) if page_nums else 0
+ max_page = int(max(page_nums)) if page_nums else 0
+ except Exception:
+ min_page, max_page = 0, 0
+ page_group_page_ranges.append((min_page, max_page))
+ page_group_summaries.append(summary_text)
+ all_prompts.append(full_prompt)
+ all_responses.append(summary_text)
+ input_tokens, output_tokens = 0, 0
+ if metadata:
+ metadata_string = (
+ str(metadata) if not isinstance(metadata, str) else metadata
+ )
+ input_tokens, output_tokens, _ = calculate_tokens_from_metadata(
+ metadata_string, model_choice, model_name_map
+ )
+ llm_total_input_tokens += input_tokens
+ llm_total_output_tokens += output_tokens
+ if not llm_model_name and model_choice:
+ llm_model_name = model_choice
+ all_token_counts.append((input_tokens, output_tokens))
+ seq_pbar.close()
+
+ # Step 3: Recursively summarise if needed
+ progress(0.7, "Checking if recursive summarisation is needed...")
+ # Create token accumulator for recursive summarization
+ recursive_token_accumulator = [0, 0] # [input_tokens, output_tokens]
+ final_summaries = recursively_summarise(
+ page_group_summaries,
+ model_choice,
+ in_api_key,
+ temperature,
+ context_length=LLM_CONTEXT_LENGTH,
+ tokenizer=tokenizer,
+ model_source=model_source,
+ token_accumulator=recursive_token_accumulator,
+ context_textbox=context_textbox,
+ aws_access_key_textbox=aws_access_key_textbox,
+ aws_secret_key_textbox=aws_secret_key_textbox,
+ aws_region_textbox=aws_region_textbox,
+ hf_api_key_textbox=hf_api_key_textbox,
+ azure_endpoint_textbox=azure_endpoint_textbox,
+ api_url=api_url,
+ local_model=local_model,
+ assistant_model=assistant_model,
+ summarise_format_radio=summarise_format_radio,
+ additional_summary_instructions=additional_summary_instructions,
+ )
+
+ # Add tokens from recursive summarization
+ llm_total_input_tokens += recursive_token_accumulator[0]
+ llm_total_output_tokens += recursive_token_accumulator[1]
+
+ # Step 4: Create overall summary
+ progress(0.85, "Creating overall summary...")
+ # Create a topic summary DataFrame for overall_summary: three columns only
+ summary_numbers = list(range(1, len(final_summaries) + 1))
+ if len(final_summaries) == len(page_groups):
+ page_ranges = [f"Pages {min(pg[0])}-{max(pg[0])}" for pg in page_groups]
+ else:
+ # Recursion combined some summaries - use "All" or full range
+ if len(final_summaries) == 1 and page_groups:
+ all_pages = [p for pg in page_groups for p in pg[0]]
+ page_ranges = [f"Pages {min(all_pages)}-{max(all_pages)}"]
+ else:
+ page_ranges = ["All"] * len(final_summaries)
+ topic_summary_df = pd.DataFrame(
+ {
+ "Summary number": summary_numbers,
+ "Page range": page_ranges,
+ "Summary": final_summaries,
+ }
+ )
+
+ # Call overall_summary
+ (
+ output_files,
+ html_output_table,
+ overall_summarised_outputs_df,
+ out_metadata_str,
+ overall_input_tokens,
+ overall_output_tokens,
+ number_of_calls_num,
+ time_taken,
+ out_message,
+ overall_logged_content,
+ overall_prompt,
+ overall_response,
+ ) = overall_summary(
+ topic_summary_df=topic_summary_df,
+ model_choice=model_choice,
+ in_api_key=in_api_key,
+ temperature=temperature,
+ reference_data_file_name=file_name,
+ output_folder=output_folder,
+ context_textbox=context_textbox,
+ aws_access_key_textbox=aws_access_key_textbox,
+ aws_secret_key_textbox=aws_secret_key_textbox,
+ aws_region_textbox=aws_region_textbox,
+ hf_api_key_textbox=hf_api_key_textbox,
+ azure_endpoint_textbox=azure_endpoint_textbox,
+ api_url=api_url,
+ local_model=local_model,
+ tokenizer=tokenizer,
+ assistant_model=assistant_model,
+ summarise_format_radio=summarise_format_radio,
+ additional_summary_instructions=additional_summary_instructions,
+ progress=progress,
+ )
+
+ llm_total_input_tokens += overall_input_tokens
+ llm_total_output_tokens += overall_output_tokens
+
+ # Extract summary texts from the DataFrame
+ if (
+ overall_summarised_outputs_df is not None
+ and not overall_summarised_outputs_df.empty
+ ):
+ if "Summary" in overall_summarised_outputs_df.columns:
+ overall_summary_texts = overall_summarised_outputs_df[
+ "Summary"
+ ].tolist()
+ else:
+ # Fallback: get from first column if "Summary" column doesn't exist
+ overall_summary_texts = overall_summarised_outputs_df.iloc[
+ :, 0
+ ].tolist()
+ else:
+ overall_summary_texts = []
+
+ # Step 5: Save outputs
+ progress(0.95, "Saving output files...")
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ file_name_clean = get_file_name_no_ext(file_name) if file_name else "document"
+ # Ensure file_name_clean is not empty
+ if not file_name_clean or file_name_clean.strip() == "":
+ file_name_clean = "document"
+
+ summaries_folder = os.path.join(output_folder, "summaries")
+ os.makedirs(summaries_folder, exist_ok=True)
+
+ # Save prompts and responses as .txt files for page group summaries
+ for i, (prompt, response) in enumerate(zip(all_prompts, all_responses)):
+ # Page range for this prompt/response pair
+ min_page, max_page = (
+ page_group_page_ranges[i] if i < len(page_group_page_ranges) else (0, 0)
+ )
+ page_range_slug = f"pages_{min_page}_{max_page}"
+ txt_file_path = os.path.join(
+ summaries_folder,
+ f"{file_name_clean}_{page_range_slug}_prompt_response_{timestamp}.txt",
+ )
+ # Get token counts for this prompt/response pair
+ input_tokens, output_tokens = (
+ all_token_counts[i] if i < len(all_token_counts) else (0, 0)
+ )
+
+ with open(txt_file_path, "w", encoding="utf-8") as f:
+ f.write("=" * 80 + "\n")
+ f.write("TOKEN INFORMATION\n")
+ f.write("=" * 80 + "\n")
+ f.write(f"Page Range: {min_page}-{max_page}\n")
+ f.write(f"Input Tokens: {input_tokens}\n")
+ f.write(f"Output Tokens: {output_tokens}\n")
+ f.write(f"Maximum Context Length: {LLM_CONTEXT_LENGTH}\n")
+ f.write(f"Model: {model_choice}\n")
+ f.write(f"Temperature: {temperature}\n")
+ f.write("=" * 80 + "\n\n")
+ f.write("=" * 80 + "\n")
+ f.write("PROMPT\n")
+ f.write("=" * 80 + "\n")
+ f.write(prompt)
+ f.write("\n\n" + "=" * 80 + "\n")
+ f.write("RESPONSE\n")
+ f.write("=" * 80 + "\n")
+ f.write(response)
+ output_files.append(txt_file_path)
+
+ # Save overall summary prompt/response
+
+ # Fallback: If we don't have prompt/response from logged_content, use summary texts
+ # This should rarely happen, but provides a safety net
+ if not overall_prompt and overall_summary_texts:
+ # Construct a basic prompt representation (this is a fallback, not ideal)
+ overall_prompt = (
+ f"Overall summary request for document: {file_name_clean}\n"
+ )
+ overall_prompt += f"Input: {len(final_summaries)} summary group(s) to combine into overall summary\n"
+ overall_prompt += f"Summary format: {summarise_format_radio}\n"
+ if additional_summary_instructions:
+ overall_prompt += (
+ f"Additional instructions: {additional_summary_instructions}\n"
+ )
+
+ # If we still don't have a response, use summary texts
+ if not overall_response and overall_summary_texts:
+ overall_response = (
+ "\n\n".join(overall_summary_texts)
+ if isinstance(overall_summary_texts, list)
+ else str(overall_summary_texts)
+ )
+
+ # Save overall summary .txt file if we have response content (always create if we have summary texts)
+ if overall_response or overall_summary_texts:
+ txt_file_path = os.path.join(
+ summaries_folder,
+ f"{file_name_clean}_overall_summary_prompt_response_{timestamp}.txt",
+ )
+ with open(txt_file_path, "w", encoding="utf-8") as f:
+ f.write("=" * 80 + "\n")
+ f.write("TOKEN INFORMATION\n")
+ f.write("=" * 80 + "\n")
+ f.write(f"Input Tokens: {overall_input_tokens}\n")
+ f.write(f"Output Tokens: {overall_output_tokens}\n")
+ f.write(f"Maximum Context Length: {LLM_CONTEXT_LENGTH}\n")
+ f.write(f"Model: {model_choice}\n")
+ f.write(f"Temperature: {temperature}\n")
+ f.write("=" * 80 + "\n\n")
+ f.write("=" * 80 + "\n")
+ f.write("PROMPT\n")
+ f.write("=" * 80 + "\n")
+ f.write(overall_prompt)
+ f.write("\n\n" + "=" * 80 + "\n")
+ f.write("RESPONSE\n")
+ f.write("=" * 80 + "\n")
+ f.write(overall_response)
+ output_files.append(txt_file_path)
+
+ # Save summaries as CSV
+ summary_data = {"Type": [], "Page_Range": [], "Summary": []}
+
+ # Add page group summaries
+ for i, (page_nums, summary) in enumerate(
+ zip([pg[0] for pg in page_groups], page_group_summaries)
+ ):
+ summary_data["Type"].append("Page Group Summary")
+ summary_data["Page_Range"].append(f"{min(page_nums)}-{max(page_nums)}")
+ summary_data["Summary"].append(summary)
+
+ # Add final summaries if different from page group summaries
+ if final_summaries != page_group_summaries:
+ for i, summary in enumerate(final_summaries):
+ summary_data["Type"].append("Final Summary")
+ summary_data["Page_Range"].append(f"Group {i+1}")
+ summary_data["Summary"].append(summary)
+
+ # Add overall summary - ensure overall_summary_texts is a list of strings
+ if overall_summary_texts:
+ # Handle case where overall_summary_texts might be a single string
+ if isinstance(overall_summary_texts, str):
+ overall_summary_texts = [overall_summary_texts]
+ # Ensure each item is a string, not being iterated character by character
+ for summary in overall_summary_texts:
+ if isinstance(summary, str):
+ summary_data["Type"].append("Overall Summary")
+ summary_data["Page_Range"].append("All")
+ summary_data["Summary"].append(summary)
+ elif hasattr(summary, "__iter__") and not isinstance(summary, str):
+ # If it's iterable but not a string, convert to string
+ summary_str = str(summary)
+ summary_data["Type"].append("Overall Summary")
+ summary_data["Page_Range"].append("All")
+ summary_data["Summary"].append(summary_str)
+
+ summary_df = pd.DataFrame(summary_data)
+ csv_file_path = os.path.join(
+ summaries_folder, f"{file_name_clean}_summaries_{timestamp}.csv"
+ )
+ summary_df.to_csv(csv_file_path, index=False, encoding="utf-8-sig")
+ output_files.append(csv_file_path)
+
+ progress(1.0, "Summarisation complete!")
+ status_message = (
+ f"Summarisation complete! Generated {len(output_files)} output files."
+ )
+
+ # Prepare summary text for display (combine all overall summary texts)
+ summary_display_text = ""
+ if overall_summary_texts:
+ if isinstance(overall_summary_texts, list):
+ summary_display_text = "\n\n".join(overall_summary_texts)
+ else:
+ summary_display_text = str(overall_summary_texts)
+
+ return (
+ output_files,
+ status_message,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ summary_display_text,
+ )
+
+ except Exception as e:
+ error_message = f"Error during summarisation: {str(e)}"
+ print(error_message)
+ import traceback
+
+ traceback.print_exc()
+ return (
+ output_files,
+ error_message,
+ llm_model_name,
+ llm_total_input_tokens,
+ llm_total_output_tokens,
+ "", # Empty summary display text on error
+ )
+
+
+def join_unique_summaries(x):
+ unique_summaries = []
+ seen = set()
+
+ for s in x:
+ if pd.isna(s):
+ continue
+
+ # 1. Normalize whitespace and split lines
+ s_str = str(s).strip()
+ lines = s_str.split("\n")
+
+ for line in lines:
+ # 2. Aggressive Cleaning
+ # Remove "Rows X to Y:" prefix
+ line = re.sub(
+ r"^Rows\s+\d+\s+to\s+\d+:\s*", "", line, flags=re.IGNORECASE
+ ).strip()
+
+ # Remove generic "Prefix:" if it exists (e.g., "Summary: ...")
+ if ": " in line:
+ parts = line.split(": ", 1)
+ if len(parts[0]) < 50 and " " not in parts[0]:
+ line = parts[1].strip()
+
+ # 3. Handle Invisible Characters (Crucial)
+ # Replace non-breaking spaces (\xa0) and multiple spaces with a single standard space
+ normalized_line = re.sub(r"\s+", " ", line).strip()
+
+ # 4. Check against Seen
+ if normalized_line and normalized_line not in seen:
+ unique_summaries.append(normalized_line)
+ seen.add(normalized_line)
+
+ return "\n".join(unique_summaries)
+
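+# Example (illustrative): join_unique_summaries([
+#     "Rows 1 to 5: Budget overview.\nRows 1 to 5: Budget overview.",
+#     "Budget overview.",
+# ]) returns "Budget overview." - the row prefixes are stripped and duplicates collapse.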
+
+def sample_reference_table_summaries(
+ reference_df: pd.DataFrame,
+ random_seed: int,
+ no_of_sampled_summaries: int = 100,
+ sample_reference_table_checkbox: bool = False,
+):
+ """
+ Sample a limited number of topic summaries per group so that the text passed to the summarisation prompt does not become too long.
+ """
+
+ if sample_reference_table_checkbox:
+
+ all_summaries = pd.DataFrame(
+ columns=[
+ "General topic",
+ "Subtopic",
+ "Sentiment",
+ "Group",
+ "Response References",
+ "Summary",
+ ]
+ )
+
+ if "Group" not in reference_df.columns:
+ reference_df["Group"] = "All"
+
+ reference_df_grouped = reference_df.groupby(
+ ["General topic", "Subtopic", "Sentiment", "Group"]
+ )
+
+ if "Revised summary" in reference_df.columns:
+ out_message = "Summary has already been created for this file"
+ print(out_message)
+ raise Exception(out_message)
+
+ for group_keys, reference_df_group in reference_df_grouped:
+ if len(reference_df_group["General topic"]) > 1:
+
+ filtered_reference_df = reference_df_group.reset_index()
+
+ filtered_reference_df_unique = filtered_reference_df.drop_duplicates(
+ [
+ "General topic",
+ "Subtopic",
+ "Sentiment",
+ "Group",
+ "Start row of group",
+ ]
+ )
+
+ # Sample n of the unique topic summaries PER GROUP. To limit the length of the text going into the summarisation tool
+ # This ensures each group gets up to no_of_sampled_summaries summaries, not the total across all groups
+ number_of_summaries_to_sample = min(
+ no_of_sampled_summaries, len(filtered_reference_df_unique)
+ )
+ print(
+ f"Sampling {number_of_summaries_to_sample} summaries from group {group_keys}, from dataframe filtered_reference_df_unique.head(5):\n{filtered_reference_df_unique.head(5)}"
+ )
+ filtered_reference_df_unique_sampled = (
+ filtered_reference_df_unique.sample(
+ number_of_summaries_to_sample, random_state=random_seed
+ )
+ )
+
+ all_summaries = pd.concat(
+ [all_summaries, filtered_reference_df_unique_sampled]
+ )
+
+ print("all_summaries.tail(5):\n", all_summaries.tail(5))
+
+ # If no responses/topics qualify, just go ahead with the original reference dataframe
+ if all_summaries.empty:
+ sampled_reference_table_df = reference_df
+ # Filter by sentiment only (Response References is a string in original df, not a count)
+ sampled_reference_table_df = sampled_reference_table_df.loc[
+ sampled_reference_table_df["Sentiment"] != "Not Mentioned"
+ ]
+ else:
+ # Deduplicate summaries within each group before joining to prevent repeated summaries
+
+ sampled_reference_table_df = (
+ all_summaries.groupby(
+ ["General topic", "Subtopic", "Sentiment", "Group"]
+ )
+ .agg(
+ {
+ "Response References": "size", # Count the number of references
+ "Summary": join_unique_summaries, # Join unique summaries only
+ }
+ )
+ .reset_index()
+ )
+ # Filter by sentiment and count (Response References is now a numeric count after aggregation)
+ sampled_reference_table_df = sampled_reference_table_df.loc[
+ (sampled_reference_table_df["Sentiment"] != "Not Mentioned")
+ & (sampled_reference_table_df["Response References"] > 1)
+ ]
+ else:
+ sampled_reference_table_df = reference_df
+
+ summarised_references_markdown = sampled_reference_table_df.to_markdown(index=False)
+
+ return sampled_reference_table_df, summarised_references_markdown
+
+
+def count_tokens_in_text(text: str, tokenizer=None, model_source: str = "Local") -> int:
+ """
+ Count the number of tokens in the given text.
+
+ Args:
+ text (str): The text to count tokens for
+ tokenizer (object, optional): Tokenizer object for local models. Defaults to None.
+ model_source (str): Source of the model to determine tokenization method. Defaults to "Local".
+
+ Returns:
+ int: Number of tokens in the text
+ """
+ if not text:
+ return 0
+
+ try:
+ if model_source == "Local" and tokenizer and len(tokenizer) > 0:
+ # Use local tokenizer if available
+ tokens = tokenizer[0].encode(text, add_special_tokens=False)
+ return len(tokens)
+ else:
+ # Fallback: rough estimation using word count (approximately 1.3 tokens per word)
+ word_count = len(text.split())
+ return int(word_count * 1.3)
+ except Exception as e:
+ print(f"Error counting tokens: {e}. Using word count estimation.")
+ # Fallback: rough estimation using word count
+ word_count = len(text.split())
+ return int(word_count * 1.3)
+
+
+def clean_markdown_table_whitespace(markdown_text: str) -> str:
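+    """
+    Normalise whitespace in a markdown table: collapse runs of whitespace (including non-breaking
+    spaces) within each cell, drop rows that contain no content, and reset the separator row to a
+    fixed width so it does not stretch the table.
+    """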
+ if not markdown_text:
+ return markdown_text
+
+ lines = markdown_text.splitlines()
+ cleaned_lines = []
+
+ for line in lines:
+ # 1. Clean all types of whitespace (including non-breaking spaces \u00A0)
+ # This turns every cell into a single-spaced string
+ cells = [re.sub(r"[\s\u00A0]+", " ", cell.strip()) for cell in line.split("|")]
+
+ # 2. Check if the row is effectively empty (only pipes or whitespace)
+ # We join the content; if nothing is left, it's a "ghost" row.
+ if not "".join(cells).strip():
+ continue
+
+ # 3. Handle the separator row specifically (e.g., |:---|---:|)
+ # We reset these to a small fixed width so they don't stretch the table.
+ if re.match(r"^[|\s\-:]+$", line):
+ new_separator = []
+ for cell in cells:
+ if not cell: # Outer pipes
+ new_separator.append("")
+ elif ":" in cell: # Alignment markers
+ left = ":" if cell.startswith(":") else "-"
+ right = ":" if cell.endswith(":") else "-"
+ new_separator.append(f"{left}---{right}")
+ else:
+ new_separator.append("---")
+ cleaned_lines.append("|".join(new_separator))
+ continue
+
+ # 4. Standard data row: Rejoin with single padding
+ # We filter out empty outer parts caused by leading/trailing pipes
+        formatted_row = (
+            "| "
+            + " | ".join(
+                c
+                for i, c in enumerate(cells)
+                if c or i not in (0, len(cells) - 1)
+            )
+            + " |"
+        )
+
+ # Simple fallback if the logic above is too aggressive for your specific table style:
+ # formatted_row = "|".join(f" {c} " if c else "" for c in cells)
+
+ cleaned_lines.append(formatted_row)
+
+ return "\n".join(cleaned_lines)
+
+
+def summarise_output_topics_query(
+ model_choice: str,
+ in_api_key: str,
+ temperature: float,
+ formatted_summary_prompt: str,
+ summarise_topic_descriptions_system_prompt: str,
+ model_source: str,
+ bedrock_runtime: boto3.Session.client,
+ local_model=list(),
+ tokenizer=list(),
+ assistant_model=list(),
+ azure_endpoint: str = "",
+ api_url: str = None,
+):
+ """
+ Query an LLM to generate a summary of topics based on the provided prompts.
+
+ Args:
+ model_choice (str): The name/type of model to use for generation
+ in_api_key (str): API key for accessing the model service
+ temperature (float): Temperature parameter for controlling randomness in generation
+ formatted_summary_prompt (str): The formatted prompt containing topics to summarise
+ summarise_topic_descriptions_system_prompt (str): System prompt providing context and instructions
+ model_source (str): Source of the model (e.g. "AWS", "Gemini", "Local")
+ bedrock_runtime (boto3.Session.client): AWS Bedrock runtime client for AWS models
+        local_model (object, optional): Local model object if using local inference. Defaults to empty list.
+        tokenizer (object, optional): Tokenizer object if using local inference. Defaults to empty list.
+        assistant_model (object, optional): Assistant model object if using local inference. Defaults to empty list.
+        azure_endpoint (str, optional): Endpoint to use for Azure/OpenAI models. Defaults to empty string.
+        api_url (str, optional): API URL for inference-server models. Defaults to None.
+    Returns:
+        tuple: Contains:
+            - summarised_output (str): The generated summary text, cleaned for display
+            - conversation_history (list): History of the conversation with the model
+            - whole_conversation_metadata (list): Metadata about the conversation
+            - response_text (str): The raw response text from the model
+    """
+ conversation_history = list()
+ whole_conversation_metadata = list()
+ client = list()
+ client_config = {}
+
+ # Combine system prompt and user prompt for token counting
+ full_input_text = (
+ summarise_topic_descriptions_system_prompt + "\n" + formatted_summary_prompt[0]
+ if isinstance(formatted_summary_prompt, list)
+ else summarise_topic_descriptions_system_prompt
+ + "\n"
+ + formatted_summary_prompt
+ )
+
+ # Count tokens in the input text
+ input_token_count = count_tokens_in_text(full_input_text, tokenizer, model_source)
+
+ # Check if input exceeds context length
+ if input_token_count > LLM_CONTEXT_LENGTH:
+ error_message = f"Input text exceeds LLM context length. Input tokens: {input_token_count}, Max context length: {LLM_CONTEXT_LENGTH}. Please reduce the input text size."
+ print(error_message)
+ raise ValueError(error_message)
+
+ print(f"Input token count: {input_token_count} (Max: {LLM_CONTEXT_LENGTH})")
+
+ # Prepare Gemini models before query
+ if "Gemini" in model_source:
+ # print("Using Gemini model:", model_choice)
+ client, config = construct_gemini_generative_model(
+ in_api_key=in_api_key,
+ temperature=temperature,
+ model_choice=model_choice,
+ system_prompt=system_prompt,
+ max_tokens=max_tokens,
+ )
+ elif "Azure/OpenAI" in model_source:
+ client, config = construct_azure_client(
+ in_api_key=os.environ.get("AZURE_INFERENCE_CREDENTIAL", ""),
+ endpoint=azure_endpoint,
+ )
+ elif "Local" in model_source:
+ pass
+ # print("Using local model: ", model_choice)
+ elif "AWS" in model_source:
+ pass
+ # print("Using AWS Bedrock model:", model_choice)
+
+ whole_conversation = [summarise_topic_descriptions_system_prompt]
+
+ # Process requests to large language model
+ (
+ responses,
+ conversation_history,
+ whole_conversation,
+ whole_conversation_metadata,
+ response_text,
+ ) = process_requests(
+ formatted_summary_prompt,
+ summarise_topic_descriptions_system_prompt,
+ conversation_history,
+ whole_conversation,
+ whole_conversation_metadata,
+ client,
+ client_config,
+ model_choice,
+ temperature,
+ bedrock_runtime=bedrock_runtime,
+ model_source=model_source,
+ local_model=local_model,
+ tokenizer=tokenizer,
+ assistant_model=assistant_model,
+ assistant_prefill=summary_assistant_prefill,
+ api_url=api_url,
+ )
+
+ summarised_output = re.sub(
+ r"\n{2,}", "\n", response_text
+ ) # Replace multiple line breaks with a single line break
+ summarised_output = re.sub(
+ r"^\n{1,}", "", summarised_output
+ ) # Remove one or more line breaks at the start
+    summarised_output = re.sub(
+        r"\n", "<br>", summarised_output
+    )  # Replace \n with more html friendly <br> tags
+ summarised_output = summarised_output.strip()
+
+ print("Finished summary query")
+
+ # Ensure the system prompt is included in the conversation history
+ try:
+ if isinstance(conversation_history, list):
+ has_system_prompt = False
+
+ if conversation_history:
+ first_entry = conversation_history[0]
+ if isinstance(first_entry, dict):
+ role_is_system = first_entry.get("role") == "system"
+ parts = first_entry.get("parts")
+ content_matches = (
+ parts == summarise_topic_descriptions_system_prompt
+ or (
+ isinstance(parts, list)
+ and summarise_topic_descriptions_system_prompt in parts
+ )
+ )
+ has_system_prompt = role_is_system and content_matches
+ elif isinstance(first_entry, str):
+ has_system_prompt = (
+ first_entry.strip().lower().startswith("system:")
+ )
+
+ if not has_system_prompt:
+ conversation_history.insert(
+ 0,
+ {
+ "role": "system",
+ "parts": [summarise_topic_descriptions_system_prompt],
+ },
+ )
+ except Exception as _e:
+ # Non-fatal: if anything goes wrong, return the original conversation history
+ pass
+
+ return (
+ summarised_output,
+ conversation_history,
+ whole_conversation_metadata,
+ response_text,
+ )
+
+
+def process_debug_output_iteration(
+ output_debug_files: str,
+ summaries_folder: str,
+ batch_file_path_details: str,
+ model_choice_clean_short: str,
+ final_system_prompt: str,
+ summarised_output: str,
+ conversation_history: list,
+ metadata: list,
+ log_output_files: list,
+ task_type: str,
+) -> tuple[str, str, str, str]:
+ """
+ Writes debug files for summary generation if output_debug_files is "True",
+ and returns the content of the prompt, summary, conversation, and metadata for the current iteration.
+
+ Args:
+ output_debug_files (str): Flag to indicate if debug files should be written.
+ summaries_folder (str): The folder where output files are saved.
+ batch_file_path_details (str): Details for the batch file path.
+ model_choice_clean_short (str): Shortened cleaned model choice.
+ final_system_prompt (str): The system prompt content.
+ summarised_output (str): The summarised output content.
+ conversation_history (list): The full conversation history.
+ metadata (list): The metadata for the conversation.
+ log_output_files (list): A list to append paths of written log files. This list is modified in-place.
+ task_type (str): The type of task being performed.
+ Returns:
+ tuple[str, str, str, str]: A tuple containing the content of the prompt,
+ summarised output, conversation history (as string),
+ and metadata (as string) for the current iteration.
+ """
+ current_prompt_content = final_system_prompt
+ current_summary_content = summarised_output
+
+ if isinstance(conversation_history, list):
+
+ # Handle both list of strings and list of dicts
+ if conversation_history and isinstance(conversation_history[0], dict):
+ # Convert list of dicts to list of strings
+ conversation_strings = list()
+ for entry in conversation_history:
+ if "role" in entry and "parts" in entry:
+ role = entry["role"].capitalize()
+ message = (
+ " ".join(entry["parts"])
+ if isinstance(entry["parts"], list)
+ else str(entry["parts"])
+ )
+ conversation_strings.append(f"{role}: {message}")
+ else:
+ # Fallback for unexpected dict format
+ conversation_strings.append(str(entry))
+ current_conversation_content = "\n".join(conversation_strings)
+ else:
+ # Handle list of strings
+ current_conversation_content = "\n".join(conversation_history)
+ else:
+ current_conversation_content = str(conversation_history)
+ current_metadata_content = str(metadata)
+ current_task_type = task_type
+
+ if output_debug_files == "True":
+ try:
+ formatted_prompt_output_path = (
+ summaries_folder
+ + batch_file_path_details
+ + "_full_prompt_"
+ + model_choice_clean_short
+ + "_"
+ + current_task_type
+ + ".txt"
+ )
+ final_table_output_path = (
+ summaries_folder
+ + batch_file_path_details
+ + "_full_response_"
+ + model_choice_clean_short
+ + "_"
+ + current_task_type
+ + ".txt"
+ )
+ whole_conversation_path = (
+ summaries_folder
+ + batch_file_path_details
+ + "_full_conversation_"
+ + model_choice_clean_short
+ + "_"
+ + current_task_type
+ + ".txt"
+ )
+ whole_conversation_path_meta = (
+ summaries_folder
+ + batch_file_path_details
+ + "_metadata_"
+ + model_choice_clean_short
+ + "_"
+ + current_task_type
+ + ".txt"
+ )
+
+ with open(
+ formatted_prompt_output_path,
+ "w",
+ encoding="utf-8-sig",
+ errors="replace",
+ ) as f:
+ f.write(current_prompt_content)
+ with open(
+ final_table_output_path, "w", encoding="utf-8-sig", errors="replace"
+ ) as f:
+ f.write(current_summary_content)
+ with open(
+ whole_conversation_path, "w", encoding="utf-8-sig", errors="replace"
+ ) as f:
+ f.write(current_conversation_content)
+ with open(
+ whole_conversation_path_meta,
+ "w",
+ encoding="utf-8-sig",
+ errors="replace",
+ ) as f:
+ f.write(current_metadata_content)
+
+ log_output_files.append(formatted_prompt_output_path)
+ log_output_files.append(final_table_output_path)
+ log_output_files.append(whole_conversation_path)
+ log_output_files.append(whole_conversation_path_meta)
+ except Exception as e:
+ print(f"Error in writing debug files for summary: {e}")
+
+ # Return the content of the objects for the current iteration.
+ # The caller can then append these to separate lists if accumulation is desired.
+ return (
+ current_prompt_content,
+ current_summary_content,
+ current_conversation_content,
+ current_metadata_content,
+ )
+
+
+def convert_markdown_headers_to_excel_format(text: str) -> str:
+ """
+ Convert markdown headers to Excel-friendly format that preserves hierarchy.
+
+ Converts:
+ - # Header (H1) -> === HEADER === (most prominent)
+ - ## Header (H2) -> --- Header --- (medium)
+ - ### Header (H3) -> ── Header ── (less prominent)
+ - #### Header (H4) -> • Header (with bullet)
+ - ##### Header (H5) -> • Header (indented)
+ - ###### Header (H6) -> • Header (more indented)
+
+ Args:
+ text (str): Text containing markdown headers
+
+ Returns:
+ str: Text with markdown headers converted to Excel-friendly format
+ """
+ if not text:
+ return text
+
+ lines = text.split("\n")
+ converted_lines = []
+
+ for line in lines:
+ # Match markdown headers (# through ######)
+ header_match = re.match(r"^(#{1,6})\s+(.+)$", line)
+ if header_match:
+ header_level = len(header_match.group(1)) # Number of # characters
+ header_text = header_match.group(2).strip()
+
+ if header_level == 1:
+ # H1: Most prominent - uppercase with double equals
+ converted_line = f"=== {header_text.upper()} ==="
+ elif header_level == 2:
+ # H2: Medium prominence - title case with dashes
+ converted_line = f"--- {header_text.title()} ---"
+ elif header_level == 3:
+ # H3: Less prominent - title case with single dashes
+ converted_line = f"── {header_text.title()} ──"
+ elif header_level == 4:
+ # H4: Bullet with no indentation
+ converted_line = f"• {header_text}"
+ elif header_level == 5:
+ # H5: Bullet with indentation
+ converted_line = f" • {header_text}"
+ else: # header_level == 6
+ # H6: Bullet with more indentation
+ converted_line = f" • {header_text}"
+
+ converted_lines.append(converted_line)
+ else:
+ converted_lines.append(line)
+
+ return "\n".join(converted_lines)
+
+
+@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
+def overall_summary(
+ topic_summary_df: pd.DataFrame,
+ model_choice: str,
+ in_api_key: str,
+ temperature: float,
+ reference_data_file_name: str,
+ output_folder: str = OUTPUT_FOLDER,
+ context_textbox: str = "",
+ aws_access_key_textbox: str = "",
+ aws_secret_key_textbox: str = "",
+ aws_region_textbox: str = "",
+ model_name_map: dict = model_name_map,
+ hf_api_key_textbox: str = "",
+ azure_endpoint_textbox: str = "",
+ existing_logged_content: list = list(),
+ api_url: str = None,
+ output_debug_files: str = "False",
+ log_output_files: list = list(),
+ reasoning_suffix: str = reasoning_suffix,
+ local_model: object = None,
+ tokenizer: object = None,
+ assistant_model: object = None,
+ summarise_everything_prompt: str = summarise_everything_prompt,
+ summarise_everything_system_prompt: str = summarise_everything_system_prompt,
+ summarise_format_radio: str = detailed_summary_format_prompt,
+ additional_summary_instructions: str = "",
+ do_summaries: str = "Yes",
+ progress=gr.Progress(track_tqdm=True),
+) -> Tuple[
+    List[str],
+    str,
+    pd.DataFrame,
+    str,
+    int,
+    int,
+    int,
+    float,
+    str,
+    List[dict],
+    str,
+    str,
+]:
+ """
+ Create an overall summary of all responses based on a topic summary table.
+
+ Args:
+ topic_summary_df (pd.DataFrame): DataFrame with columns "Summary number", "Page range", "Summary"
+ model_choice (str): Name of the LLM model to use
+ in_api_key (str): API key for model access
+ temperature (float): Temperature parameter for model generation
+ reference_data_file_name (str): Name of reference data file
+ output_folder (str, optional): Folder to save outputs. Defaults to OUTPUT_FOLDER.
+ context_textbox (str, optional): Additional context. Defaults to empty string.
+ aws_access_key_textbox (str, optional): AWS access key. Defaults to empty string.
+ aws_secret_key_textbox (str, optional): AWS secret key. Defaults to empty string.
+ aws_region_textbox (str, optional): AWS region. Defaults to empty string.
+ model_name_map (dict, optional): Mapping of model names. Defaults to model_name_map.
+        hf_api_key_textbox (str, optional): Hugging Face API key. Defaults to empty string.
+        azure_endpoint_textbox (str, optional): Azure endpoint for Azure/OpenAI models. Defaults to empty string.
+        existing_logged_content (list, optional): List of existing logged content. Defaults to empty list.
+        output_debug_files (str, optional): Flag to indicate if debug files should be written. Defaults to "False".
+        log_output_files (list, optional): List to which paths of written debug files are appended. Defaults to empty list.
+ api_url (str, optional): API URL for inference-server models. Defaults to None.
+ reasoning_suffix (str, optional): Suffix for reasoning. Defaults to reasoning_suffix.
+ local_model (object, optional): Local model object. Defaults to None.
+ tokenizer (object, optional): Tokenizer object. Defaults to None.
+ assistant_model (object, optional): Assistant model object. Defaults to None.
+ summarise_everything_prompt (str, optional): Prompt for overall summary
+ summarise_everything_system_prompt (str, optional): System prompt for overall summary
+        summarise_format_radio (str, optional): Prompt describing the required summary format. Defaults to detailed_summary_format_prompt.
+        additional_summary_instructions (str, optional): Additional summary instructions. Defaults to empty string.
+ do_summaries (str, optional): Whether to generate summaries. Defaults to "Yes".
+ progress (gr.Progress, optional): Progress tracker. Defaults to gr.Progress(track_tqdm=True).
+
+    Returns:
+        Tuple containing:
+            List[str]: Output files
+            str: HTML table of the overall summaries
+            pd.DataFrame: DataFrame of the overall summaries
+            str: Output metadata
+            int: Number of input tokens
+            int: Number of output tokens
+            int: Number of API calls
+            float: Time taken
+            str: Output message
+            List[dict]: List of logged content
+            str: The combined prompt sent to the model
+            str: The raw response text from the model
+ """
+
+ out_metadata = list()
+ latest_summary_completed = 0
+ output_files = list()
+ txt_summarised_outputs = list()
+ summarised_outputs = list()
+ summarised_outputs_for_df = list()
+ input_tokens_num = 0
+ output_tokens_num = 0
+ number_of_calls_num = 0
+ time_taken = 0
+ out_message = list()
+ all_logged_content = list()
+ all_prompts_content = list()
+ all_summaries_content = list()
+ all_metadata_content = list()
+ all_groups_content = list()
+ all_batches_content = list()
+ all_model_choice_content = list()
+ all_validated_content = list()
+ task_type = "Overall summary"
+ all_task_type_content = list()
+ log_output_files = list()
+ all_logged_content = list()
+ all_file_names_content = list()
+ tic = time.perf_counter()
+
+ summaries_folder = os.path.join(output_folder, "summaries")
+ os.makedirs(summaries_folder, exist_ok=True)
+
+ # Expect three columns: Summary number, Page range, Summary
+ required_cols = ["Summary number", "Page range", "Summary"]
+ if not all(c in topic_summary_df.columns for c in required_cols):
+ raise ValueError(
+ "topic_summary_df must have columns: Summary number, Page range, Summary"
+ )
+ topic_summary_df = topic_summary_df[required_cols].copy()
+ topic_summary_df = topic_summary_df.sort_values(by="Summary number", ascending=True)
+
+ # Single "group" containing the whole table (no grouping by Group column)
+ unique_groups = ["All"]
+
+ len(unique_groups)
+
+ if context_textbox and "The context of this analysis is" not in context_textbox:
+ context_textbox = "The context of this analysis is '" + context_textbox + "'."
+
+ # if length_groups > 1:
+ # comprehensive_summary_format_prompt = (
+ # comprehensive_summary_format_prompt_by_group
+ # )
+ # else:
+ # comprehensive_summary_format_prompt = comprehensive_summary_format_prompt
+
+ batch_file_path_details = create_batch_file_path_details(reference_data_file_name)
+ # Use model_choice directly as short_name, or try to get from model_name_map if available
+ if model_name_map and model_choice in model_name_map:
+ model_choice_clean = model_name_map[model_choice]["short_name"]
+ else:
+ # Use model_choice directly if not in model_name_map
+ model_choice_clean = model_choice
+ model_choice_clean_short = clean_column_name(
+ model_choice_clean, max_length=20, front_characters=False
+ )
+
+ tic = time.perf_counter()
+
+ # Determine model source from model_choice using defaults from config.py
+ # Does not check model_name_map - uses the defined defaults
+ model_source = get_model_source_from_model_choice(model_choice)
+
+ # Load model and tokenizer together to ensure they're from the same source
+ # This prevents mismatches that could occur if they're loaded separately
+ # Similar to llm_funcs.py pattern (lines 830-839) and llm_entity_detection.py (lines 519-533)
+    if model_source == "Local" and (local_model is None or tokenizer is None):
+ progress(0.1, f"Using model: {LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE}")
+ # Use load_model() to ensure both are loaded atomically
+ # This is safer than calling get_pii_model() and get_pii_tokenizer() separately
+ loaded_model, loaded_tokenizer, loaded_assistant_model = load_model()
+ if local_model is None:
+ local_model = loaded_model
+ if tokenizer is None:
+ tokenizer = loaded_tokenizer
+ if assistant_model is None:
+ assistant_model = loaded_assistant_model
+
+ summary_loop = tqdm(
+ unique_groups, desc="Creating overall summary for groups", unit="groups"
+ )
+
+ if do_summaries == "Yes":
+ # Determine model source from model_choice using defaults from config.py
+ # Does not check model_name_map - uses the defined defaults
+ model_source = get_model_source_from_model_choice(model_choice)
+
+ # Setup bedrock for AWS models only
+ # Use the same approach as file_redaction.py (lines 939-969) for consistency
+ bedrock_runtime = None
+ if model_source == "AWS":
+ # Use aws_region_textbox if provided, otherwise fall back to AWS_REGION from config
+ region = aws_region_textbox if aws_region_textbox else AWS_REGION
+
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
+ elif aws_access_key_textbox and aws_secret_key_textbox:
+ print(
+ "Connecting to Bedrock using AWS access key and secret keys from user input."
+ )
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=aws_access_key_textbox,
+ aws_secret_access_key=aws_secret_key_textbox,
+ region_name=region,
+ )
+ elif RUN_AWS_FUNCTIONS:
+ print("Connecting to Bedrock via existing SSO connection")
+ bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+ print("Getting Bedrock credentials from environment variables")
+ bedrock_runtime = boto3.client(
+ "bedrock-runtime",
+ aws_access_key_id=AWS_ACCESS_KEY,
+ aws_secret_access_key=AWS_SECRET_KEY,
+ region_name=region,
+ )
+ else:
+ bedrock_runtime = None
+ out_message = "Cannot connect to AWS Bedrock service. Please provide access keys under LLM settings, or choose another model type."
+ print(out_message)
+ raise Exception(out_message)
+
+ for summary_group in summary_loop:
+
+ print("Creating overall summary for group:", summary_group)
+
+ # Use the full table (three columns: Summary number, Page range, Summary)
+ group_df = topic_summary_df.copy()
+
+ # Prepare the system prompt first (needed for token counting)
+ formatted_summarise_everything_system_prompt = (
+ summarise_everything_system_prompt.format(
+ consultation_context=context_textbox
+ )
+ )
+
+ # Apply reasoning suffix for GPT-OSS models (Local, inference-server, or AWS)
+ is_gpt_oss_model = (
+ "gpt-oss" in model_choice.lower() or "gpt_oss" in model_choice.lower()
+ )
+
+ if is_gpt_oss_model:
+ # Use default reasoning suffix if not set
+ effective_reasoning_suffix = (
+ reasoning_suffix if reasoning_suffix else "Reasoning: low"
+ )
+ if effective_reasoning_suffix:
+ formatted_summarise_everything_system_prompt = (
+ formatted_summarise_everything_system_prompt
+ + "\n"
+ + effective_reasoning_suffix
+ )
+ elif "Local" in model_source and reasoning_suffix:
+ # For other local models, use reasoning_suffix if provided
+ formatted_summarise_everything_system_prompt = (
+ formatted_summarise_everything_system_prompt
+ + "\n"
+ + reasoning_suffix
+ )
+
+ if additional_summary_instructions:
+ additional_summary_instructions = (
+ "Important additional instructions to follow closely: "
+ + additional_summary_instructions
+ )
+
+ # Create a test prompt with empty table to get base token count
+ test_summary_text = ""
+ test_formatted_summary_prompt = [
+ summarise_everything_prompt.format(
+ topic_summary_table=test_summary_text,
+ summary_format=summarise_format_radio,
+ additional_summary_instructions=additional_summary_instructions,
+ )
+ ]
+
+ # Calculate base token count (system prompt + prompt template without table)
+ full_test_text = (
+ formatted_summarise_everything_system_prompt
+ + "\n"
+ + test_formatted_summary_prompt[0]
+ )
+ base_token_count = count_tokens_in_text(
+ full_test_text, tokenizer, model_source
+ )
+
+ # Calculate available tokens for the summary table
+ available_tokens = LLM_CONTEXT_LENGTH - base_token_count
+
+ # Ensure markdown table rows don't get visually "split" by newlines inside cells.
+ # Markdown tables don't reliably support multiline cells, so we replace internal
+ # newlines with a single-line representation before calling `to_markdown()`.
+ def _escape_markdown_table_cell(value):
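+                """Collapse newlines and escape pipes so each value stays within a single markdown table cell."""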
+ if not isinstance(value, str):
+ return value
+ s = value.replace("\r\n", "\n").replace("\r", "\n")
+ # Keep content in a single cell/row in markdown output
+ s = s.replace("\n", "\\n")
+ # Avoid breaking markdown table syntax
+ s = s.replace("|", "\\|")
+ return s
+
+ if "Summary" in group_df.columns:
+ group_df["Summary"] = group_df["Summary"].apply(
+ _escape_markdown_table_cell
+ )
+
+ # Truncate DataFrame rows if needed to fit within context limit
+ if len(group_df) > 0:
+ # Start with all rows and check if they fit
+ current_summary_text = group_df.to_markdown(index=False)
+ current_summary_text = clean_markdown_table_whitespace(
+ current_summary_text
+ )
+ current_token_count = count_tokens_in_text(
+ current_summary_text, tokenizer, model_source
+ )
+
+ # If the full table exceeds available tokens, truncate rows
+ if current_token_count > available_tokens:
+ print(
+ f"Warning: Summary table for group '{summary_group}' exceeds context limit. "
+ f"Truncating rows. Table tokens: {current_token_count}, Available: {available_tokens}"
+ )
+
+ # Binary search approach: find the maximum number of rows that fit
+ # Start with all rows and reduce until we fit
+ num_rows = len(group_df)
+ min_rows = 0
+ max_rows = num_rows
+ best_df = group_df.iloc[:0] # Empty DataFrame as fallback
+
+ # Try to find the maximum number of rows that fit
+ while min_rows < max_rows:
+ mid_rows = (min_rows + max_rows + 1) // 2
+ test_df = group_df.iloc[:mid_rows]
+ test_summary = test_df.to_markdown(index=False)
+ test_summary = clean_markdown_table_whitespace(test_summary)
+ test_token_count = count_tokens_in_text(
+ test_summary, tokenizer, model_source
+ )
+
+ if test_token_count <= available_tokens:
+ best_df = test_df
+ min_rows = mid_rows
+ else:
+ max_rows = mid_rows - 1
+
+ # Use the best fitting DataFrame
+ group_df = best_df
+ print(
+ f"Truncated to {len(group_df)} rows (from {num_rows} original rows) "
+ f"to fit within context limit."
+ )
+
+ # Create summary_text from (possibly truncated) DataFrame
+ summary_text = group_df.to_markdown(index=False)
+ # Clean extraneous whitespace from markdown table cells
+ summary_text = clean_markdown_table_whitespace(summary_text)
+
+ formatted_summary_prompt = [
+ summarise_everything_prompt.format(
+ topic_summary_table=summary_text,
+ summary_format=summarise_format_radio,
+ additional_summary_instructions=additional_summary_instructions,
+ )
+ ]
+
+ combined_prompt = (
+ formatted_summarise_everything_system_prompt
+ + "\n"
+ + formatted_summary_prompt[0]
+ )
+
+ try:
+ response, conversation_history, metadata, response_text = (
+ summarise_output_topics_query(
+ model_choice,
+ in_api_key,
+ temperature,
+ formatted_summary_prompt,
+ formatted_summarise_everything_system_prompt,
+ model_source,
+ bedrock_runtime,
+ local_model,
+ tokenizer=tokenizer,
+ assistant_model=assistant_model,
+ azure_endpoint=azure_endpoint_textbox,
+ api_url=api_url,
+ )
+ )
+ summarised_output_for_df = response_text
+ summarised_output = response
+ except Exception as e:
+ print(
+ "Cannot create overall summary for group:",
+ summary_group,
+ "due to:",
+ e,
+ )
+ summarised_output = ""
+ summarised_output_for_df = ""
+
+ # Remove multiple consecutive line breaks (2 or more) and replace with single line break
+ if summarised_output_for_df:
+ summarised_output_for_df = re.sub(
+ r"\n{2,}", "\n", summarised_output_for_df
+ )
+ # Convert markdown headers to Excel-friendly format
+ summarised_output_for_df = convert_markdown_headers_to_excel_format(
+ summarised_output_for_df
+ )
+ if summarised_output:
+ summarised_output = re.sub(r"\n{2,}", "\n", summarised_output)
+
+ summarised_outputs_for_df.append(summarised_output_for_df)
+ summarised_outputs.append(summarised_output)
+ txt_summarised_outputs.append(
+ f"""Group name: {summary_group}\n""" + summarised_output
+ )
+
+ out_metadata.extend(metadata)
+ out_metadata_str = ". ".join(out_metadata)
+
+ full_prompt = (
+ formatted_summarise_everything_system_prompt
+ + "\n"
+ + formatted_summary_prompt[0]
+ )
+
+ (
+ current_prompt_content_logged,
+ current_summary_content_logged,
+ current_conversation_content_logged,
+ current_metadata_content_logged,
+ ) = process_debug_output_iteration(
+ output_debug_files,
+ summaries_folder,
+ batch_file_path_details,
+ model_choice_clean_short,
+ full_prompt,
+ summarised_output,
+ conversation_history,
+ metadata,
+ log_output_files,
+ task_type=task_type,
+ )
+
+ all_prompts_content.append(current_prompt_content_logged)
+ all_summaries_content.append(current_summary_content_logged)
+ # all_conversation_content.append(current_conversation_content_logged)
+ all_metadata_content.append(current_metadata_content_logged)
+ all_groups_content.append(summary_group)
+ all_batches_content.append("1")
+ all_model_choice_content.append(model_choice_clean_short)
+ all_validated_content.append("No")
+ all_task_type_content.append(task_type)
+ all_file_names_content.append(reference_data_file_name)
+ latest_summary_completed += 1
+ clean_column_name(summary_group)
+
+ # Write overall outputs to csv
+ overall_summary_output_csv_path = (
+ output_folder
+ + "summaries/"
+ + batch_file_path_details
+ + "_overall_summary_"
+ + model_choice_clean_short
+ + ".csv"
+ )
+ summarised_outputs_df = pd.DataFrame(
+ data={"Group": unique_groups, "Summary": summarised_outputs_for_df}
+ )
+ if output_debug_files == "True":
+ summarised_outputs_df.drop(["1", "2", "3"], axis=1, errors="ignore").to_csv(
+ overall_summary_output_csv_path, index=None, encoding="utf-8-sig"
+ )
+ output_files.append(overall_summary_output_csv_path)
+
+ summarised_outputs_df_for_display = pd.DataFrame(
+ data={"Group": unique_groups, "Summary": summarised_outputs}
+ )
+        summarised_outputs_df_for_display["Summary"] = (
+            summarised_outputs_df_for_display["Summary"]
+            .apply(lambda x: markdown.markdown(x) if isinstance(x, str) else x)
+            .str.replace(r"\n", "<br>", regex=False)
+            .str.replace(r"(<br>\s*){2,}", "<br>", regex=True)
+        )
+ html_output_table = summarised_outputs_df_for_display.to_html(
+ index=False, escape=False
+ )
+
+ output_files = list(set(output_files))
+
+ input_tokens_num, output_tokens_num, number_of_calls_num = (
+ calculate_tokens_from_metadata(
+ out_metadata_str, model_choice, model_name_map
+ )
+ )
+
+ # Check if beyond max time allowed for processing and break if necessary
+ toc = time.perf_counter()
+ time_taken = toc - tic
+
+ out_message = "\n".join(out_message)
+ out_message = (
+ out_message
+ + " "
+ + f"Overall summary finished processing. Total time: {time_taken:.2f}s"
+ )
+ print(out_message)
+
+ # Combine the logged content into a list of dictionaries
+ all_logged_content = [
+ {
+ "prompt": prompt,
+ "response": summary,
+ "metadata": metadata,
+ "batch": batch,
+ "model_choice": model_choice,
+ "validated": validated,
+ "group": group,
+ "task_type": task_type,
+ "file_name": file_name,
+ }
+ for prompt, summary, metadata, batch, model_choice, validated, group, task_type, file_name in zip(
+ all_prompts_content,
+ all_summaries_content,
+ all_metadata_content,
+ all_batches_content,
+ all_model_choice_content,
+ all_validated_content,
+ all_groups_content,
+ all_task_type_content,
+ all_file_names_content,
+ )
+ ]
+
+ if isinstance(existing_logged_content, pd.DataFrame):
+ existing_logged_content = existing_logged_content.to_dict(orient="records")
+
+ out_logged_content = existing_logged_content + all_logged_content
+
+ return (
+ output_files,
+ html_output_table,
+ summarised_outputs_df,
+ out_metadata_str,
+ input_tokens_num,
+ output_tokens_num,
+ number_of_calls_num,
+ time_taken,
+ out_message,
+ out_logged_content,
+ combined_prompt,
+ response_text,
+ )
diff --git a/tools/textract_batch_call.py b/tools/textract_batch_call.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57fddee7e3bb09258a81e5b7335bfeb72fb5992
--- /dev/null
+++ b/tools/textract_batch_call.py
@@ -0,0 +1,954 @@
+import ast
+import datetime
+import json
+import logging
+import os
+from io import StringIO
+from typing import List
+
+import boto3
+import gradio as gr
+import pandas as pd
+import pymupdf
+from botocore.exceptions import (
+ ClientError,
+ NoCredentialsError,
+ PartialCredentialsError,
+ TokenRetrievalError,
+)
+from gradio import FileData
+
+from tools.aws_functions import download_file_from_s3
+from tools.config import (
+ AWS_REGION,
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
+ DOCUMENT_REDACTION_BUCKET,
+ INPUT_FOLDER,
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+ OUTPUT_FOLDER,
+ RUN_AWS_FUNCTIONS,
+ TEXTRACT_JOBS_LOCAL_LOC,
+ TEXTRACT_JOBS_S3_LOC,
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
+)
+from tools.file_conversion import get_input_file_names
+from tools.helper_functions import get_file_name_without_type, get_textract_file_suffix
+from tools.secure_path_utils import (
+ secure_basename,
+ secure_file_write,
+ secure_join,
+)
+
+
+def analyse_document_with_textract_api(
+ local_pdf_path: str,
+ s3_input_prefix: str,
+ s3_output_prefix: str,
+ job_df: pd.DataFrame,
+ s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
+ local_output_dir: str = OUTPUT_FOLDER,
+ handwrite_signature_checkbox: List[str] = list(),
+ successful_job_number: int = 0,
+ total_document_page_count: int = 1,
+ general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
+ aws_region: str = AWS_REGION, # Optional: specify region if not default
+):
+ """
+ Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),
+ waits for completion, and downloads the output JSON from S3 to a local directory.
+
+ Args:
+ local_pdf_path (str): Path to the local PDF file.
+        s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
+        s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
+        job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
+        s3_bucket_name (str, optional): S3 bucket used for the input PDF and the Textract API call outputs.
+        local_output_dir (str, optional): Local directory to save the downloaded JSON results.
+        handwrite_signature_checkbox (List[str], optional): List of feature types to extract from the document.
+        successful_job_number (int): The number of successful jobs that have been submitted in this session.
+        total_document_page_count (int): The number of pages in the document.
+        general_s3_bucket_name (str, optional): S3 bucket used to store the log of submitted Textract jobs.
+        aws_region (str, optional): AWS region name. Defaults to boto3 default region.
+
+ Returns:
+ str: Path to the downloaded local JSON output file, or None if failed.
+
+ Raises:
+ FileNotFoundError: If the local_pdf_path does not exist.
+ boto3.exceptions.NoCredentialsError: If AWS credentials are not found.
+ Exception: For other AWS errors or job failures.
+ """
+
+ # This is a variable that is written to logs to indicate that a Textract API call was made
+ is_a_textract_api_call = True
+ task_textbox = "textract"
+
+ # Keep only latest pdf path if it's a list
+ if isinstance(local_pdf_path, list):
+ local_pdf_path = local_pdf_path[-1]
+
+ if not os.path.exists(local_pdf_path):
+ raise FileNotFoundError(f"Input document not found {local_pdf_path}")
+
+ file_extension = os.path.splitext(local_pdf_path)[1].lower()
+
+    # Load the PDF to get a page count if one was not provided
+    if not total_document_page_count:
+        if file_extension == ".pdf":
+            print("Page count not provided. Loading PDF to get page count")
+            try:
+                pymupdf_doc = pymupdf.open(local_pdf_path)
+                total_document_page_count = pymupdf_doc.page_count
+                pymupdf_doc.close()
+                print("Page count:", total_document_page_count)
+            except Exception as e:
+                print(
+                    "Failed to load PDF to get page count:",
+                    e,
+                    "setting page count to 1",
+                )
+                total_document_page_count = 1
+                # raise Exception(f"Failed to load PDF to get page count: {e}")
+        else:
+            # No page count provided and not a PDF, so default to a single page
+            total_document_page_count = 1
+
+ if not os.path.exists(local_output_dir):
+ os.makedirs(local_output_dir)
+ log_message = f"Created local output directory: {local_output_dir}"
+ print(log_message)
+ # logging.info(log_message)
+
+ # Initialize boto3 clients
+ session = boto3.Session(region_name=aws_region)
+ s3_client = session.client("s3")
+ textract_client = session.client("textract")
+
+ # --- 1. Upload PDF to S3 ---
+ pdf_filename = secure_basename(local_pdf_path)
+ s3_input_key = secure_join(s3_input_prefix, pdf_filename).replace(
+ "\\", "/"
+ ) # Ensure forward slashes for S3
+
+ log_message = (
+ f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'..."
+ )
+ print(log_message)
+ # logging.info(log_message)
+ try:
+ s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
+ log_message = "Upload successful."
+ print(log_message)
+ # logging.info(log_message)
+ except Exception as e:
+ log_message = f"Failed to upload PDF to S3: {e}"
+ print(log_message)
+ # logging.error(log_message)
+ raise
+
+ # Filter job_df to include rows only where the analysis date is after the current date - DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
+ job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors="coerce")
+
+ if not job_df.empty:
+ job_df = job_df.loc[
+ job_df["job_date_time"]
+ > (
+ datetime.datetime.now()
+ - datetime.timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
+ ),
+ :,
+ ]
+
+ # If job_df is not empty
+ if not job_df.empty:
+
+ if "file_name" in job_df.columns:
+ matching_job_id_file_names = job_df.loc[
+ (job_df["file_name"] == pdf_filename)
+ & (
+ job_df["signature_extraction"].astype(str)
+ == str(handwrite_signature_checkbox)
+ ),
+ "file_name",
+ ]
+ matching_job_id_file_names_dates = job_df.loc[
+ (job_df["file_name"] == pdf_filename)
+ & (
+ job_df["signature_extraction"].astype(str)
+ == str(handwrite_signature_checkbox)
+ ),
+ "job_date_time",
+ ]
+ matching_job_id = job_df.loc[
+ (job_df["file_name"] == pdf_filename)
+ & (
+ job_df["signature_extraction"].astype(str)
+ == str(handwrite_signature_checkbox)
+ ),
+ "job_id",
+ ]
+ matching_handwrite_signature = job_df.loc[
+ (job_df["file_name"] == pdf_filename)
+ & (
+ job_df["signature_extraction"].astype(str)
+ == str(handwrite_signature_checkbox)
+ ),
+ "signature_extraction",
+ ]
+
+            if len(matching_job_id) == 0:
+                matching_job_id = "unknown_job_id"
+
+ if (
+ len(matching_job_id_file_names) > 0
+ and len(matching_handwrite_signature) > 0
+ ):
+ out_message = f"Existing Textract outputs found for file {pdf_filename} from date {matching_job_id_file_names_dates.iloc[0]}. No need to re-analyse. Please download existing results from the list with job ID {matching_job_id.iloc[0]}"
+ print(out_message)
+ raise Exception(out_message)
+
+ # --- 2. Start Textract Document Analysis ---
+ message = "Starting Textract document analysis job..."
+ print(message)
+
+ try:
+ if (
+ "Extract signatures" in handwrite_signature_checkbox
+ or "Extract forms" in handwrite_signature_checkbox
+ or "Extract layout" in handwrite_signature_checkbox
+ or "Extract tables" in handwrite_signature_checkbox
+ ):
+ feature_types = list()
+ if "Extract signatures" in handwrite_signature_checkbox:
+ feature_types.append("SIGNATURES")
+ if "Extract forms" in handwrite_signature_checkbox:
+ feature_types.append("FORMS")
+ if "Extract layout" in handwrite_signature_checkbox:
+ feature_types.append("LAYOUT")
+ if "Extract tables" in handwrite_signature_checkbox:
+ feature_types.append("TABLES")
+ response = textract_client.start_document_analysis(
+ DocumentLocation={
+ "S3Object": {"Bucket": s3_bucket_name, "Name": s3_input_key}
+ },
+ FeatureTypes=feature_types, # Analyze for signatures, forms, and tables
+ OutputConfig={"S3Bucket": s3_bucket_name, "S3Prefix": s3_output_prefix},
+ )
+ job_type = "document_analysis"
+
+ if (
+ "Extract signatures" not in handwrite_signature_checkbox
+ and "Extract forms" not in handwrite_signature_checkbox
+ and "Extract layout" not in handwrite_signature_checkbox
+ and "Extract tables" not in handwrite_signature_checkbox
+ ):
+ response = textract_client.start_document_text_detection(
+ DocumentLocation={
+ "S3Object": {"Bucket": s3_bucket_name, "Name": s3_input_key}
+ },
+ OutputConfig={"S3Bucket": s3_bucket_name, "S3Prefix": s3_output_prefix},
+ )
+ job_type = "document_text_detection"
+
+ job_id = response["JobId"]
+ print(f"Textract job started with JobId: {job_id}")
+
+ # Prepare CSV in memory
+ log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
+
+ StringIO()
+ log_df = pd.DataFrame(
+ [
+ {
+ "job_id": job_id,
+ "file_name": pdf_filename,
+ "job_type": job_type,
+ "signature_extraction": handwrite_signature_checkbox,
+ "job_date_time": datetime.datetime.now().strftime(
+ "%Y-%m-%d %H:%M:%S"
+ ),
+ }
+ ]
+ )
+
+ # File path
+ log_file_path = secure_join(local_output_dir, "textract_document_jobs.csv")
+
+ # Write latest job ID to local text file
+ secure_file_write(
+ local_output_dir,
+ pdf_filename + "_textract_document_jobs_job_id.txt",
+ job_id,
+ )
+
+ # Check if file exists
+ file_exists = os.path.exists(log_file_path)
+
+ # Append to CSV if it exists, otherwise write with header
+ log_df.to_csv(log_file_path, mode="a", index=False, header=not file_exists)
+
+ # log_df.to_csv(csv_buffer)
+
+ # Upload the file
+ s3_client.upload_file(
+ log_file_path, general_s3_bucket_name, log_csv_key_location
+ )
+
+ # Upload to S3 (overwrite existing file)
+ # s3_client.put_object(Bucket=general_s3_bucket_name, Key=log_csv_key_location, Body=csv_buffer.getvalue())
+ print(f"Job ID written to {log_csv_key_location}")
+ # logging.info(f"Job ID written to s3://{s3_bucket_name}/{s3_output_prefix}/textract_document_jobs.csv")
+
+ except Exception as e:
+ error = f"Failed to start Textract job: {e}"
+ print(error)
+ # logging.error(error)
+ raise
+
+ successful_job_number += 1
+ total_number_of_textract_page_calls = total_document_page_count
+
+ return (
+ f"Textract analysis job submitted, job ID:{job_id}",
+ job_id,
+ job_type,
+ successful_job_number,
+ is_a_textract_api_call,
+ total_number_of_textract_page_calls,
+ task_textbox,
+ )
+
+
+def return_job_status(
+ job_id: str,
+ response: dict,
+ attempts: int,
+ poll_interval_seconds: int = 0,
+ max_polling_attempts: int = 1, # ~10 minutes total wait time
+):
+ """
+ Polls the AWS Textract service to retrieve the current status of an asynchronous document analysis job.
+ This function checks the job status from the provided response and logs relevant information or errors.
+
+ Args:
+ job_id (str): The unique identifier of the Textract job.
+ response (dict): The response dictionary received from Textract's `get_document_analysis` or `get_document_text_detection` call.
+ attempts (int): The current polling attempt number.
+ poll_interval_seconds (int, optional): The time in seconds to wait before the next poll (currently unused in this function, but kept for context). Defaults to 0.
+ max_polling_attempts (int, optional): The maximum number of polling attempts allowed (currently unused in this function, but kept for context). Defaults to 1.
+
+ Returns:
+ str: The current status of the Textract job (e.g., 'IN_PROGRESS', 'SUCCEEDED').
+
+ Raises:
+ Exception: If the Textract job status is 'FAILED' or 'PARTIAL_SUCCESS', or if an unexpected status is encountered.
+ """
+
+ job_status = response["JobStatus"]
+ logging.info(
+ f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}"
+ )
+
+ if job_status == "IN_PROGRESS":
+ pass
+ # time.sleep(poll_interval_seconds)
+ elif job_status == "SUCCEEDED":
+ logging.info("Textract job succeeded.")
+ elif job_status in ["FAILED", "PARTIAL_SUCCESS"]:
+ status_message = response.get("StatusMessage", "No status message provided.")
+ warnings = response.get("Warnings", [])
+ logging.error(
+ f"Textract job ended with status: {job_status}. Message: {status_message}"
+ )
+ if warnings:
+ logging.warning(f"Warnings: {warnings}")
+ # Decide if PARTIAL_SUCCESS should proceed or raise error
+ # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
+ raise Exception(
+ f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}"
+ )
+ else:
+ # Should not happen based on documentation, but handle defensively
+ raise Exception(f"Unexpected Textract job status: {job_status}")
+
+ return job_status
+
+
+def download_textract_job_files(
+    s3_client: boto3.Session.client,
+ s3_bucket_name: str,
+ s3_output_key_prefix: str,
+ pdf_filename: str,
+ job_id: str,
+ local_output_dir: str,
+ handwrite_signature_checkbox: List[str] = list(),
+):
+ """
+ Download and combine output job files from AWS Textract for a given job.
+
+ Args:
+ s3_client (boto3.client): The Boto3 S3 client to interact with AWS S3.
+ s3_bucket_name (str): Name of the S3 bucket where Textract job outputs are stored.
+ s3_output_key_prefix (str): S3 prefix (folder path) under which job output files are located (usually ends with job_id/).
+ pdf_filename (str): The name of the PDF file related to this Textract job (used for local naming or logging, not S3 lookup).
+ job_id (str): The AWS Textract job ID whose outputs are being fetched.
+ local_output_dir (str): The local directory in which to save downloaded and combined results.
+ handwrite_signature_checkbox (List[str], optional): List indicating user options regarding post-processing for handwriting/signature (used for filtering or downstream handling).
+
+ Returns:
+ str: The local file path to the combined output JSON file.
+
+ Raises:
+ Exception: If no output files are found, or if an error occurs during download or processing.
+ """
+ list_response = s3_client.list_objects_v2(
+ Bucket=s3_bucket_name, Prefix=s3_output_key_prefix
+ )
+
+    output_files = list_response.get("Contents", [])
+    if not output_files:
+        # Retry the listing once in case the job output has not yet appeared in S3
+        list_response = s3_client.list_objects_v2(
+            Bucket=s3_bucket_name, Prefix=s3_output_key_prefix
+        )
+        output_files = list_response.get("Contents", [])
+
+ if not output_files:
+ out_message = (
+ f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}"
+ )
+ print(out_message)
+ raise Exception(out_message)
+
+ # Usually, we only need the first/main JSON output file(s)
+ # For simplicity, download the first one found. A more complex scenario might merge multiple files.
+ # Filter out potential directory markers if any key ends with '/'
+ json_files_to_download = [
+ f
+ for f in output_files
+ if f["Key"] != s3_output_key_prefix
+ and not f["Key"].endswith("/")
+ and "access_check" not in f["Key"]
+ ]
+
+ # print("json_files_to_download:", json_files_to_download)
+
+ if not json_files_to_download:
+ error = f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}"
+ print(error)
+ # logging.error(error)
+ raise FileNotFoundError(error)
+
+ combined_blocks = []
+
+ for f in sorted(
+ json_files_to_download, key=lambda x: x["Key"]
+ ): # Optional: sort to ensure consistent order
+ obj = s3_client.get_object(Bucket=s3_bucket_name, Key=f["Key"])
+ data = json.loads(obj["Body"].read())
+
+ # Assuming Textract-style output with a "Blocks" key
+ if "Blocks" in data:
+ combined_blocks.extend(data["Blocks"])
+ else:
+ logging.warning(f"No 'Blocks' key in file: {f['Key']}")
+
+ # Build final combined JSON structure
+ combined_output = {
+ "DocumentMetadata": {
+ "Pages": len(set(block.get("Page", 1) for block in combined_blocks))
+ },
+ "Blocks": combined_blocks,
+ "JobStatus": "SUCCEEDED",
+ }
+
+ output_filename_base = os.path.basename(pdf_filename)
+ output_filename_base_no_ext = os.path.splitext(output_filename_base)[0]
+ # Generate suffix based on checkbox options
+ textract_suffix = get_textract_file_suffix(handwrite_signature_checkbox)
+ local_output_filename = (
+ f"{output_filename_base_no_ext}{textract_suffix}_textract.json"
+ )
+ local_output_path = secure_join(local_output_dir, local_output_filename)
+
+ secure_file_write(
+ local_output_dir, local_output_filename, json.dumps(combined_output)
+ )
+
+ print(f"Combined Textract output written to {local_output_path}")
+
+ downloaded_file_path = local_output_path
+
+ return downloaded_file_path
+
+
+def check_for_provided_job_id(job_id: str):
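+    """Raise an exception if no Textract job ID has been provided."""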
+ if not job_id:
+ raise Exception("Please provide a job ID.")
+ return
+
+
+def load_pdf_job_file_from_s3(
+ load_s3_jobs_input_loc: str,
+ pdf_filename: str,
+ local_output_dir: str,
+ s3_bucket_name: str,
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
+) -> tuple:
+ """
+ Downloads a PDF job file from S3 and saves it locally.
+
+ Args:
+ load_s3_jobs_input_loc (str): S3 prefix/location where the PDF job file is stored.
+ pdf_filename (str): The name of the PDF file (without .pdf extension).
+ local_output_dir (str): Directory to which the file should be saved locally.
+ s3_bucket_name (str): The S3 bucket name.
+ RUN_AWS_FUNCTIONS (bool, optional): Whether to run AWS functions (download from S3). Defaults to RUN_AWS_FUNCTIONS.
+
+ Returns:
+ tuple: (pdf_file_location (list of str), doc_file_name_no_extension_textbox (str))
+ """
+
+ try:
+        pdf_file_location = list()
+ doc_file_name_no_extension_textbox = ""
+
+ s3_input_key_prefix = secure_join(load_s3_jobs_input_loc, pdf_filename).replace(
+ "\\", "/"
+ )
+ s3_input_key_prefix = s3_input_key_prefix + ".pdf"
+
+ local_input_file_path = secure_join(local_output_dir, pdf_filename)
+ local_input_file_path = local_input_file_path + ".pdf"
+
+ download_file_from_s3(
+ s3_bucket_name,
+ s3_input_key_prefix,
+ local_input_file_path,
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+ )
+
+ pdf_file_location = [local_input_file_path]
+ doc_file_name_no_extension_textbox = get_file_name_without_type(pdf_filename)
+ except Exception as e:
+ print("Could not download PDF job file from S3 due to:", e)
+
+ return pdf_file_location, doc_file_name_no_extension_textbox
+
+
+def replace_existing_pdf_input_for_whole_document_outputs(
+ load_s3_jobs_input_loc: str,
+ pdf_filename: str,
+ local_output_dir: str,
+ s3_bucket_name: str,
+ in_doc_files: FileData = [],
+ input_folder: str = INPUT_FOLDER,
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+ progress=gr.Progress(track_tqdm=True),
+):
+ """
+ Ensures the PDF input for whole document outputs is loaded from S3 unless an identical PDF is already supplied.
+
+ Args:
+ load_s3_jobs_input_loc (str): The S3 input prefix/location for the PDF job file.
+ pdf_filename (str): The PDF file name (without extension).
+ local_output_dir (str): The local directory for saving the file.
+ s3_bucket_name (str): The S3 bucket name.
+ in_doc_files (FileData, optional): List of Gradio FileData objects or paths that may already contain the PDF file. Defaults to [].
+ input_folder (str, optional): Input folder path on disk. Defaults to INPUT_FOLDER.
+ RUN_AWS_FUNCTIONS (bool, optional): Whether to run AWS-related operations. Defaults to RUN_AWS_FUNCTIONS global.
+ progress (gr.Progress, optional): Gradio Progress object for reporting progress. Defaults to a tqdm-enabled progress tracker.
+
+ Returns:
+ Returns the downloaded file location and associated file name information for downstream use.
+ """
+
+ progress(0.1, "Loading PDF from s3")
+
+ if in_doc_files:
+ (
+ doc_file_name_no_extension_textbox,
+ doc_file_name_with_extension_textbox,
+ doc_full_file_name_textbox,
+ doc_file_name_textbox_list,
+ total_pdf_page_count,
+ ) = get_input_file_names(in_doc_files)
+
+ if pdf_filename == doc_file_name_no_extension_textbox:
+ print("Existing loaded PDF file has same name as file from S3")
+ doc_file_name_no_extension_textbox = pdf_filename
+ downloaded_pdf_file_location = in_doc_files
+ else:
+ downloaded_pdf_file_location, doc_file_name_no_extension_textbox = (
+ load_pdf_job_file_from_s3(
+ load_s3_jobs_input_loc,
+ pdf_filename,
+ local_output_dir,
+ s3_bucket_name,
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+ )
+ )
+
+ (
+ doc_file_name_no_extension_textbox,
+ doc_file_name_with_extension_textbox,
+ doc_full_file_name_textbox,
+ doc_file_name_textbox_list,
+ total_pdf_page_count,
+ ) = get_input_file_names(downloaded_pdf_file_location)
+ else:
+ downloaded_pdf_file_location, doc_file_name_no_extension_textbox = (
+ load_pdf_job_file_from_s3(
+ load_s3_jobs_input_loc,
+ pdf_filename,
+ local_output_dir,
+ s3_bucket_name,
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+ )
+ )
+
+ (
+ doc_file_name_no_extension_textbox,
+ doc_file_name_with_extension_textbox,
+ doc_full_file_name_textbox,
+ doc_file_name_textbox_list,
+ total_pdf_page_count,
+ ) = get_input_file_names(downloaded_pdf_file_location)
+
+ return (
+ downloaded_pdf_file_location,
+ doc_file_name_no_extension_textbox,
+ doc_file_name_with_extension_textbox,
+ doc_full_file_name_textbox,
+ doc_file_name_textbox_list,
+ total_pdf_page_count,
+ )
+
+
+def poll_whole_document_textract_analysis_progress_and_download(
+ job_id: str,
+ job_type_dropdown: str,
+ s3_output_prefix: str,
+ pdf_filename: str,
+ job_df: pd.DataFrame,
+ s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
+ local_output_dir: str = OUTPUT_FOLDER,
+ load_s3_jobs_loc: str = TEXTRACT_JOBS_S3_LOC,
+ load_local_jobs_loc: str = TEXTRACT_JOBS_LOCAL_LOC,
+ aws_region: str = AWS_REGION, # Optional: specify region if not default
+ load_jobs_from_s3: str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+ poll_interval_seconds: int = 1,
+ max_polling_attempts: int = 1, # ~10 minutes total wait time
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS: int = DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
+ progress=gr.Progress(track_tqdm=True),
+):
+ """
+ Polls AWS Textract for the status of a document analysis job and, once finished, downloads and combines the output into a local JSON file for further processing.
+
+ Args:
+ job_id (str): The AWS Textract job ID to check for completion.
+ job_type_dropdown (str): The Textract operation type to use ('document_analysis' or 'document_text_detection').
+ s3_output_prefix (str): The S3 prefix (folder path) where the job's output files are located.
+ pdf_filename (str): The name of the PDF document associated with this job.
+ job_df (pd.DataFrame): DataFrame containing information from previous Textract API calls.
+ s3_bucket_name (str, optional): S3 bucket containing the job outputs. Defaults to TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET.
+ local_output_dir (str, optional): Local directory to which output JSON results will be saved. Defaults to OUTPUT_FOLDER.
+ load_s3_jobs_loc (str, optional): S3 location for previously saved Textract jobs metadata. Defaults to TEXTRACT_JOBS_S3_LOC.
+ load_local_jobs_loc (str, optional): Local location for previously saved Textract jobs metadata. Defaults to TEXTRACT_JOBS_LOCAL_LOC.
+ aws_region (str, optional): AWS region for API calls. Defaults to AWS_REGION.
+ load_jobs_from_s3 (str, optional): Whether to load previous jobs from S3 or local. Defaults to LOAD_PREVIOUS_TEXTRACT_JOBS_S3.
+ poll_interval_seconds (int, optional): Seconds between polling attempts. Defaults to 1.
+ max_polling_attempts (int, optional): How many times to check the job's status before timing out. Defaults to 1.
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS (int, optional): How many days back to display finished jobs. Defaults to DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS.
+ progress (gr.Progress, optional): Gradio Progress object for tracking progress in a UI.
+
+ Returns:
+ [function output not explicitly documented here; see function logic for details]
+
+ Raises:
+ Exception: If job fails, polling times out, or download fails.
+ """
+
+ progress(0.1, "Querying AWS Textract for status of document analysis job")
+
+ if job_id:
+ # Initialize boto3 clients
+ session = boto3.Session(region_name=aws_region)
+ s3_client = session.client("s3")
+ textract_client = session.client("textract")
+
+ # --- 3. Poll for Job Completion ---
+ job_status = "IN_PROGRESS"
+ attempts = 0
+
+ message = "Polling Textract for job completion status..."
+ print(message)
+ # logging.info("Polling Textract for job completion status...")
+
+ # Update Textract document history df
+ try:
+ job_df = load_in_textract_job_details(
+ load_s3_jobs=load_jobs_from_s3,
+ load_s3_jobs_loc=load_s3_jobs_loc,
+ load_local_jobs_loc=load_local_jobs_loc,
+ )
+ except Exception as e:
+ print(f"Failed to update job details dataframe: {e}")
+
+ while job_status == "IN_PROGRESS" and attempts <= max_polling_attempts:
+ attempts += 1
+ try:
+ if job_type_dropdown == "document_analysis":
+ response = textract_client.get_document_analysis(JobId=job_id)
+ job_status = return_job_status(
+ job_id,
+ response,
+ attempts,
+ poll_interval_seconds,
+ max_polling_attempts,
+ )
+ elif job_type_dropdown == "document_text_detection":
+ response = textract_client.get_document_text_detection(JobId=job_id)
+ job_status = return_job_status(
+ job_id,
+ response,
+ attempts,
+ poll_interval_seconds,
+ max_polling_attempts,
+ )
+ else:
+ error = "Unknown job type, cannot poll job"
+ print(error)
+ logging.error(error)
+ raise Exception(error)
+
+ except textract_client.exceptions.InvalidJobIdException:
+ error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed."
+ print(error_message)
+ logging.error(error_message)
+ raise Exception(error_message)
+ except Exception as e:
+ error_message = (
+ f"Error while polling Textract status for job {job_id}: {e}"
+ )
+ print(error_message)
+ logging.error(error_message)
+ raise Exception(error_message)
+
+ downloaded_file_path = None
+ if job_status == "SUCCEEDED":
+ # 3b - Replace PDF file name if it exists in the job dataframe
+
+ progress(0.5, "Document analysis task outputs found. Downloading from S3")
+
+ # If job_df is not empty
+
+ # if not job_df.empty:
+ # job_df = job_df.loc[job_df["job_date_time"] > (datetime.datetime.now() - datetime.timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)),:]
+
+ # Extract signature_extraction from job_df for file naming
+ handwrite_signature_checkbox = list()
+ if not job_df.empty:
+ if "signature_extraction" in job_df.columns:
+ matching_signature_extraction = job_df.loc[
+ job_df["job_id"] == job_id, "signature_extraction"
+ ]
+ if not matching_signature_extraction.empty:
+ signature_extraction_str = matching_signature_extraction.iloc[0]
+ # Convert string representation to list
+ # Handle both string representations like "['Extract signatures']" and actual lists
+ if isinstance(signature_extraction_str, str):
+ try:
+ handwrite_signature_checkbox = ast.literal_eval(
+ signature_extraction_str
+ )
+ except (ValueError, SyntaxError):
+ # If parsing fails, try to extract from string
+ handwrite_signature_checkbox = [
+ signature_extraction_str
+ ]
+ elif isinstance(signature_extraction_str, list):
+ handwrite_signature_checkbox = signature_extraction_str
+
+ if "file_name" in job_df.columns:
+ matching_job_id_file_names = job_df.loc[
+ job_df["job_id"] == job_id, "file_name"
+ ]
+
+ if pdf_filename and not matching_job_id_file_names.empty:
+ if pdf_filename == matching_job_id_file_names.iloc[0]:
+ out_message = f"Existing Textract outputs found for file {pdf_filename}. No need to re-download."
+ gr.Warning(out_message)
+ raise Exception(out_message)
+
+ if not matching_job_id_file_names.empty:
+ pdf_filename = matching_job_id_file_names.iloc[0]
+ else:
+ pdf_filename = "unknown_file"
+
+ # --- 4. Download Output JSON from S3 ---
+ # Textract typically creates output under s3_output_prefix/job_id/
+ # There might be multiple JSON files if pagination occurred during writing.
+ # Usually, for smaller docs, there's one file, often named '1'.
+ # For robust handling, list objects and find the JSON(s).
+
+ s3_output_key_prefix = (
+ secure_join(s3_output_prefix, job_id).replace("\\", "/") + "/"
+ )
+ logging.info(
+ f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}"
+ )
+
+ try:
+ downloaded_file_path = download_textract_job_files(
+ s3_client,
+ s3_bucket_name,
+ s3_output_key_prefix,
+ pdf_filename,
+ job_id,
+ local_output_dir,
+ handwrite_signature_checkbox,
+ )
+
+ except Exception as e:
+ out_message = (
+ f"Failed to download or process Textract output from S3. Error: {e}"
+ )
+ print(out_message)
+ raise Exception(out_message)
+
+ else:
+ raise Exception("No Job ID provided.")
+
+ output_pdf_filename = get_file_name_without_type(pdf_filename)
+
+ return downloaded_file_path, job_status, job_df, output_pdf_filename
+
+
+def load_in_textract_job_details(
+ load_s3_jobs: str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+ load_s3_jobs_loc: str = TEXTRACT_JOBS_S3_LOC,
+ load_local_jobs_loc: str = TEXTRACT_JOBS_LOCAL_LOC,
+ document_redaction_bucket: str = DOCUMENT_REDACTION_BUCKET,
+ aws_region: str = AWS_REGION,
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS: int = DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
+):
+ """
+    Load a dataframe of jobs previously submitted to the Textract API service.
+ """
+ job_df = pd.DataFrame(
+ columns=[
+ "job_id",
+ "file_name",
+ "job_type",
+ "signature_extraction",
+ "job_date_time",
+ ]
+ )
+
+ # Initialize boto3 clients
+ session = boto3.Session(region_name=aws_region)
+ s3_client = session.client("s3")
+
+ local_output_path = f"{load_local_jobs_loc}/textract_document_jobs.csv"
+
+ if load_s3_jobs == "True":
+ s3_output_key = f"{load_s3_jobs_loc}/textract_document_jobs.csv"
+
+ try:
+ s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
+ # print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
+ s3_client.download_file(
+ document_redaction_bucket, s3_output_key, local_output_path
+ )
+ # print("Download successful.")
+ except ClientError as e:
+ if e.response["Error"]["Code"] == "404":
+ print("Log file does not exist in S3.")
+ else:
+ print(f"Unexpected error occurred: {e}")
+ except (NoCredentialsError, PartialCredentialsError, TokenRetrievalError) as e:
+ print(f"AWS credential issue encountered: {e}")
+ print("Skipping S3 log file download.")
+
+ # If the log path exists, load it in
+ if os.path.exists(local_output_path):
+ print("Found Textract job list log file in local path")
+ job_df = pd.read_csv(local_output_path)
+
+ if "job_date_time" in job_df.columns:
+ job_df["job_date_time"] = pd.to_datetime(
+ job_df["job_date_time"], errors="coerce"
+ )
+ # Keep only jobs that have been completed in the last 'DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS' days
+ cutoff_time = pd.Timestamp.now() - pd.Timedelta(
+ days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
+ )
+ job_df = job_df.loc[job_df["job_date_time"] > cutoff_time, :]
+
+ try:
+ job_df = job_df[
+ [
+ "job_id",
+ "file_name",
+ "job_type",
+ "signature_extraction",
+ "job_date_time",
+ ]
+ ]
+ except Exception as e:
+ print(
+ "Could not find one or more columns in Textract job list log file.",
+ f"Error: {e}",
+ )
+
+ return job_df
+
+
+def download_textract_output(
+ job_id: str, output_bucket: str, output_prefix: str, local_folder: str
+):
+ """
+ Checks the status of a Textract job and downloads the output ZIP file if the job is complete.
+
+ :param job_id: The Textract job ID.
+ :param output_bucket: The S3 bucket where the output is stored.
+ :param output_prefix: The prefix (folder path) in S3 where the output file is stored.
+ :param local_folder: The local directory where the ZIP file should be saved.
+ """
+ textract_client = boto3.client("textract")
+ s3_client = boto3.client("s3")
+
+ # Check job status
+ while True:
+ response = textract_client.get_document_analysis(JobId=job_id)
+ status = response["JobStatus"]
+
+ if status == "SUCCEEDED":
+ print("Job completed successfully.")
+ break
+ elif status == "FAILED":
+ print(
+ "Job failed:",
+ response.get("StatusMessage", "No error message provided."),
+ )
+ return
+ else:
+ print(f"Job is still {status}.")
+ # time.sleep(10) # Wait before checking again
+
+ # Find output ZIP file in S3
+ output_file_key = f"{output_prefix}/{job_id}.zip"
+ local_file_path = secure_join(local_folder, f"{job_id}.zip")
+
+ # Download file
+ try:
+ s3_client.download_file(output_bucket, output_file_key, local_file_path)
+ print(f"Output file downloaded to: {local_file_path}")
+ except Exception as e:
+ print(f"Error downloading file: {e}")
+
+
+def check_textract_outputs_exist(textract_output_found_checkbox):
+ if textract_output_found_checkbox is True:
+ print("Textract outputs found")
+ return
+ else:
+        raise Exception(
+            "Relevant Textract outputs not found. Please ensure you have selected the correct results output and that you have uploaded the relevant document file in 'Choose document or image file...' above"
+        )
diff --git a/tools/word_segmenter.py b/tools/word_segmenter.py
new file mode 100644
index 0000000000000000000000000000000000000000..e602e0d58a35fe03894c0dd6524ec00605be6571
--- /dev/null
+++ b/tools/word_segmenter.py
@@ -0,0 +1,2011 @@
+import os
+from bisect import bisect_left
+from typing import Dict, List, Optional, Tuple
+
+import cv2
+import numpy as np
+
+from tools.config import OUTPUT_FOLDER, SAVE_WORD_SEGMENTER_OUTPUT_IMAGES
+
+# Adaptive thresholding parameters (resolution-independent via line_height / median CC height)
+BLOCK_SIZE_FACTOR = 0.5 # Fraction of line_height when median CC height unavailable
+BLOCK_SIZE_MEDIAN_CC_FACTOR = 1.2 # Block size = median_cc_height * this when available
+C_VALUE = 2 # Constant subtracted from mean in adaptive thresholding
+REFERENCE_LINE_HEIGHT = 50 # Line height (px) at which NOISE_THRESHOLD is defined
+
+# Word segmentation search parameters
+INITIAL_KERNEL_WIDTH_FACTOR = 0.0 # Starting kernel width factor for Stage 2 search
+INITIAL_VALLEY_THRESHOLD_FACTOR = (
+ 0.0 # Starting valley threshold factor for Stage 1 search
+)
+MAIN_VALLEY_THRESHOLD_FACTOR = (
+ 0.15 # Primary valley threshold factor for word separation
+)
+MIN_SPACE_FACTOR = 0.2 # Minimum space width relative to character width
+MATCH_TOLERANCE = 0 # Tolerance for word count matching
+
+# Noise removal parameters (resolution-independent: derived from line_height)
+MIN_AREA_HEIGHT_FRACTION = 0.05 # MIN_AREA = (line_height * this)^2
+MIN_AREA_FLOOR = 2 # Minimum pixel area floor for very low-res lines
+DEFAULT_TRIM_PERCENTAGE = (
+ 0.2 # Percentage to trim from top/bottom for vertical cropping
+)
+
+# Skew detection parameters
+MIN_SKEW_THRESHOLD = 0.5 # Ignore angles smaller than this (likely noise)
+MAX_SKEW_THRESHOLD = 15.0 # Angles larger than this are extreme and likely errors
+# Baseline (Hough) skew: minimum bottom points to use baseline method; Hough threshold
+SKEW_BASELINE_MIN_POINTS = 20
+SKEW_HOUGH_THRESHOLD = 25 # Min votes for a line to be considered
+
+ALLOWED_WORD_MISMATCH_COUNT = 0 # Maximum allowed difference in word count between the target and the detected words during the word segmentation process. If above this, it will use the fallback segmenter.
+
+# Noise detection: if estimated noise (Laplacian variance) is above this (at REFERENCE_LINE_HEIGHT),
+# skip primary segmentation and use fallback. Scaled by line_height for resolution independence.
+NOISE_THRESHOLD = 800
+
+# Polarity: binarization assumes dark text on light background. If estimated background
+# mean is below this, the image is treated as light-on-dark and inverted before binarization.
+POLARITY_MEAN_THRESHOLD = 128
+POLARITY_CORNER_FRACTION = (
+ 0.15 # Fraction of width/height used for corner/edge sampling
+)
+
+SEARCH_STAGE1_COARSE_STEP = 0.06
+SEARCH_STAGE1_FINE_STEP = 0.02
+SEARCH_STAGE2_COARSE_STEP = 0.05
+SEARCH_STAGE2_FINE_STEP = 0.02
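+
+# Worked example (illustrative arithmetic only): with NOISE_THRESHOLD = 800 and
+# REFERENCE_LINE_HEIGHT = 50, a 100 px tall line crop is compared against an
+# effective threshold of 800 * (100 / 50) = 1600, so taller (higher-resolution)
+# line crops tolerate proportionally more Laplacian variance before the
+# fallback segmenter is used.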
+
+
+def _find_widest_zero_gaps(
+ vertical_projection: np.ndarray,
+ n: int,
+ gap_threshold: float = 0.0,
+) -> List[Tuple[int, int]]:
+ """
+ Find the N widest contiguous zero-gaps (or near-zero) in the vertical projection.
+ Used for justified text: anchor word cut points to the centers of these gaps.
+ Returns list of (start, end) in left-to-right order, or empty if not enough gaps.
+ """
+ if vertical_projection is None or n <= 0:
+ return []
+ w = len(vertical_projection)
+ gaps = []
+ in_gap = False
+ start = 0
+ for x in range(w):
+ val = vertical_projection[x] if x < w else 0
+ if val <= gap_threshold and not in_gap:
+ start = x
+ in_gap = True
+ elif val > gap_threshold and in_gap:
+ gaps.append((start, x))
+ in_gap = False
+ if in_gap:
+ gaps.append((start, w))
+ if not gaps:
+ return []
+ # Sort by width descending, take first n
+ gaps_by_width = sorted(gaps, key=lambda g: g[1] - g[0], reverse=True)
+ selected = gaps_by_width[:n]
+ # Sort by position (left-to-right) for cutting
+ selected.sort(key=lambda g: g[0])
+ return selected
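+
+
+# Worked example (illustrative values): for projection [5, 0, 0, 7, 0, 0, 0, 9]
+# and n=2, the gaps found are (1, 3) and (4, 7), returned left-to-right; callers
+# such as convert_line_to_word_level cut words at the gap centres, i.e. at
+# columns (1 + 3) // 2 = 2 and (4 + 7) // 2 = 5.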
+
+
+# Punctuation that often sits after a word with a visible gap (anchor to include in word box)
+TRAILING_PUNCTUATION_CHARS = frozenset(".,:;\"'!?)]}")
+
+
+def _word_ends_with_punctuation(word: str) -> bool:
+ """True if word ends with a punctuation character that may have a gap before it."""
+ return bool(word and word[-1] in TRAILING_PUNCTUATION_CHARS)
+
+
+def get_weighted_length(text: str) -> float:
+ """
+ Proportional-font heuristic: sum character width weights instead of counting chars.
+ Narrow chars (i, l, 1, punctuation) get < 1.0; wide chars (W, M, w) get > 1.0.
+ Used by HybridWordSegmenter.convert_line_to_word_level for better blind estimation.
+ """
+ width = 0.0
+ weights = {
+ "i": 0.4,
+ "l": 0.4,
+ "1": 0.4,
+ "t": 0.6,
+ "j": 0.4,
+ ".": 0.3,
+ ",": 0.3,
+ "!": 0.3,
+ "'": 0.3,
+ "W": 1.3,
+ "M": 1.3,
+ "m": 1.3,
+ "w": 1.2,
+ "@": 1.2,
+ "%": 1.2,
+ " ": 0.5, # space between words
+ }
+ for char in text:
+ base = 1.1 if char.isupper() else 1.0
+ width += weights.get(char, base)
+ return width
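+
+
+# Illustrative check: with the weights above, get_weighted_length("ill") == 1.2
+# while get_weighted_length("WWW") == 3.9, so the proportional estimator gives
+# "WWW" roughly three times the width of "ill" despite both being 3 characters.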
+
+
+def _sanitize_filename(filename: str, max_length: int = 100) -> str:
+ """
+ Sanitizes a string to be used as a valid filename.
+ Removes or replaces invalid characters for Windows/Linux file systems.
+
+ Args:
+ filename: The string to sanitize
+ max_length: Maximum length of the sanitized filename
+
+ Returns:
+ A sanitized string safe for use in file names
+ """
+ if not filename:
+ return "unnamed"
+
+ # Replace spaces with underscores
+ sanitized = filename.replace(" ", "_")
+
+ # Remove or replace invalid characters for Windows/Linux
+ # Invalid: < > : " / \ | ? *
+ invalid_chars = '<>:"/\\|?*'
+ for char in invalid_chars:
+ sanitized = sanitized.replace(char, "_")
+
+ # Remove control characters
+ sanitized = "".join(
+ char for char in sanitized if ord(char) >= 32 or char in "\n\r\t"
+ )
+
+ # Remove leading/trailing dots and spaces (Windows doesn't allow these)
+ sanitized = sanitized.strip(". ")
+
+ # Replace multiple consecutive underscores with a single one
+ while "__" in sanitized:
+ sanitized = sanitized.replace("__", "_")
+
+ # Truncate if too long
+ if len(sanitized) > max_length:
+ sanitized = sanitized[:max_length]
+
+ # Ensure it's not empty after sanitization
+ if not sanitized:
+ sanitized = "unnamed"
+
+ return sanitized
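+
+
+# Example (illustrative input): _sanitize_filename("My Report: draft?.pdf")
+# returns "My_Report_draft_.pdf" - the space, ':' and '?' are each replaced with
+# underscores and consecutive underscores are collapsed to one.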
+
+
+class AdaptiveSegmenter:
+ """
+ Line to word segmentation pipeline. It features:
+ 1. Adaptive Thresholding.
+ 2. Targeted Noise Removal using Connected Component Analysis.
+ 3. The robust two-stage adaptive search (Valley -> Kernel).
+ 4. CCA for final pixel-perfect refinement.
+ """
+
+ def __init__(self, output_folder: str = OUTPUT_FOLDER):
+ self.output_folder = output_folder
+ self.fallback_segmenter = HybridWordSegmenter()
+
+ def _correct_orientation(
+ self, gray_image: np.ndarray
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Detects and corrects 90-degree orientation issues.
+ """
+ h, w = gray_image.shape
+ center = (w // 2, h // 2)
+
+ block_size = 21
+ if h < block_size:
+ block_size = h if h % 2 != 0 else h - 1
+
+ if block_size > 3:
+ binary = cv2.adaptiveThreshold(
+ gray_image,
+ 255,
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+ cv2.THRESH_BINARY_INV,
+ block_size,
+ 4,
+ )
+ else:
+ _, binary = cv2.threshold(
+ gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+ )
+
+ opening_kernel = np.ones((2, 2), np.uint8)
+ binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, opening_kernel)
+
+ coords = np.column_stack(np.where(binary > 0))
+ if len(coords) < 50:
+ M_orient = cv2.getRotationMatrix2D(center, 0, 1.0)
+ return gray_image, M_orient
+
+ ymin, xmin = coords.min(axis=0)
+ ymax, xmax = coords.max(axis=0)
+ box_height = ymax - ymin
+ box_width = xmax - xmin
+
+ orientation_angle = 0.0
+ if box_height > box_width:
+ orientation_angle = 90.0
+ else:
+ M_orient = cv2.getRotationMatrix2D(center, 0, 1.0)
+ return gray_image, M_orient
+
+ M_orient = cv2.getRotationMatrix2D(center, orientation_angle, 1.0)
+ new_w, new_h = h, w
+ M_orient[0, 2] += (new_w - w) / 2
+ M_orient[1, 2] += (new_h - h) / 2
+
+ oriented_gray = cv2.warpAffine(
+ gray_image,
+ M_orient,
+ (new_w, new_h),
+ flags=cv2.INTER_CUBIC,
+ borderMode=cv2.BORDER_REPLICATE,
+ )
+
+ return oriented_gray, M_orient
+
+    def _skew_angle_from_baseline(self, binary: np.ndarray) -> Optional[float]:
+ """
+ Estimate skew angle from the text baseline using bottom points of foreground
+ and Hough line transform. More stable than minAreaRect for short words or
+ lines with ascenders/descenders (e.g. "all"). Returns correction angle in
+ degrees, or None if baseline cannot be reliably estimated.
+ """
+ h, w = binary.shape
+ # For each column, take the bottom-most foreground pixel (baseline point)
+ bottom_points = []
+ for x in range(w):
+ col = binary[:, x]
+ on_pixels = np.where(col > 0)[0]
+ if len(on_pixels) > 0:
+ y_bottom = int(np.max(on_pixels))
+ bottom_points.append((x, y_bottom))
+ if len(bottom_points) < SKEW_BASELINE_MIN_POINTS:
+ return None
+ # Draw baseline points on a blank image for Hough
+ baseline_img = np.zeros((h, w), dtype=np.uint8)
+ for x, y in bottom_points:
+ baseline_img[y, x] = 255
+ # Slight dilation so Hough sees a denser line
+ kernel = np.ones((2, 2), np.uint8)
+ baseline_img = cv2.dilate(baseline_img, kernel)
+ lines = cv2.HoughLines(
+ baseline_img,
+ rho=1,
+ theta=np.pi / 180,
+ threshold=SKEW_HOUGH_THRESHOLD,
+ )
+ if lines is None or len(lines) == 0:
+ return None
+ # Score each line by number of bottom points near it; take best
+ best_angle = None
+ best_score = 0
+ dist_thresh = max(2, h // 30)
+ for line in lines:
+ rho, theta = line[0]
+ # Line equation: rho = x*cos(theta) + y*sin(theta). Perpendicular is at angle theta.
+ # Baseline angle from horizontal = theta - 90°. To level it we rotate by -(theta - 90°) = 90° - theta.
+ correction_deg = 90.0 - np.degrees(theta)
+ # Normalize to [-90, 90] for comparison
+ if correction_deg > 90:
+ correction_deg -= 180
+ elif correction_deg < -90:
+ correction_deg += 180
+ score = 0
+ for x, y in bottom_points:
+ # Distance from (x,y) to line rho = x*cos(theta)+y*sin(theta)
+ d = abs(x * np.cos(theta) + y * np.sin(theta) - rho)
+ if d <= dist_thresh:
+ score += 1
+ if score > best_score:
+ best_score = score
+ best_angle = correction_deg
+ if best_angle is None:
+ return None
+ return float(best_angle)
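+
+    # Arithmetic example for the Hough-angle conversion above (illustrative):
+    # a line detected at theta = 88 degrees gives a correction of 90 - 88 = +2
+    # degrees, while theta = 92 degrees gives -2 degrees; both already lie in
+    # [-90, 90], so no wrap-around is applied.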
+
+ def _skew_angle_from_min_area_rect(
+ self, coords: np.ndarray, w: int, h: int
+ ) -> float:
+ """Fallback: skew angle from minAreaRect of all foreground pixels."""
+ if len(coords) < 50:
+ return 0.0
+ rect = cv2.minAreaRect(coords[:, ::-1])
+ rect_width, rect_height = rect[1]
+ angle = rect[2]
+ if rect_width < rect_height:
+ angle += 90
+ if angle > 45:
+ angle -= 90
+ elif angle < -45:
+ angle += 90
+ return float(angle)
+
+ def _deskew_image(self, gray_image: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Detects skew using baseline (Hough on bottom points of letters) when possible,
+ which is more stable for short words and ascenders/descenders; falls back to
+ minAreaRect otherwise.
+ """
+ h, w = gray_image.shape
+
+ block_size = 21
+ if h < block_size:
+ block_size = h if h % 2 != 0 else h - 1
+
+ if block_size > 3:
+ binary = cv2.adaptiveThreshold(
+ gray_image,
+ 255,
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+ cv2.THRESH_BINARY_INV,
+ block_size,
+ 4,
+ )
+ else:
+ _, binary = cv2.threshold(
+ gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+ )
+
+ opening_kernel = np.ones((2, 2), np.uint8)
+ binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, opening_kernel)
+
+ coords = np.column_stack(np.where(binary > 0))
+ if len(coords) < 50:
+ M = cv2.getRotationMatrix2D((w // 2, h // 2), 0, 1.0)
+ return gray_image, M
+
+ # Prefer baseline-based skew (stable for short words / ascenders-descenders)
+ correction_angle = self._skew_angle_from_baseline(binary)
+ if correction_angle is None:
+ correction_angle = self._skew_angle_from_min_area_rect(coords, w, h)
+
+ if abs(correction_angle) < MIN_SKEW_THRESHOLD:
+ correction_angle = 0.0
+ elif abs(correction_angle) > MAX_SKEW_THRESHOLD:
+ correction_angle = 0.0
+
+ center = (w // 2, h // 2)
+ M = cv2.getRotationMatrix2D(center, correction_angle, 1.0)
+
+ deskewed_gray = cv2.warpAffine(
+ gray_image,
+ M,
+ (w, h),
+ flags=cv2.INTER_CUBIC,
+ borderMode=cv2.BORDER_REPLICATE,
+ )
+
+ return deskewed_gray, M
+
+ def _estimate_quick_skew_degrees(self, gray_image: np.ndarray) -> float:
+ """Cheap skew estimate used to skip expensive orientation/deskew when safe."""
+ if gray_image is None or gray_image.size == 0:
+ return 0.0
+ h, w = gray_image.shape[:2]
+ if h < 8 or w < 8:
+ return 0.0
+ _, quick_bin = cv2.threshold(
+ gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+ )
+ coords = np.column_stack(np.where(quick_bin > 0))
+ if len(coords) < 30:
+ return 0.0
+ rect = cv2.minAreaRect(coords[:, ::-1])
+ rect_w, rect_h = rect[1]
+ angle = float(rect[2])
+ if rect_w < rect_h:
+ angle += 90.0
+ if angle > 45.0:
+ angle -= 90.0
+ elif angle < -45.0:
+ angle += 90.0
+ return angle
+
+ def _can_skip_expensive_preprocess(self, gray_image: np.ndarray) -> bool:
+ """Return True when line is already horizontal enough for direct segmentation."""
+ if gray_image is None or gray_image.size == 0:
+ return True
+ h, w = gray_image.shape[:2]
+ if h <= 0 or w <= 0:
+ return True
+ if w <= int(h * 1.2):
+ return False
+ skew = self._estimate_quick_skew_degrees(gray_image)
+ return abs(skew) < 1.2
+
+ def _get_boxes_from_profile(
+ self,
+ binary_image: np.ndarray,
+ stable_avg_char_width: float,
+ min_space_factor: float,
+ valley_threshold_factor: float,
+ ) -> List:
+ """
+ Extracts word bounding boxes from vertical projection profile.
+ """
+ img_h, img_w = binary_image.shape
+ vertical_projection = np.sum(binary_image, axis=0)
+ peaks = vertical_projection[vertical_projection > 0]
+ if len(peaks) == 0:
+ return []
+ avg_peak_height = np.mean(peaks)
+ valley_threshold = int(avg_peak_height * valley_threshold_factor)
+ min_space_width = int(stable_avg_char_width * min_space_factor)
+
+ patched_projection = vertical_projection.copy()
+ in_gap = False
+ gap_start = 0
+
+ for x, col_sum in enumerate(patched_projection):
+ if col_sum <= valley_threshold and not in_gap:
+ in_gap = True
+ gap_start = x
+ elif col_sum > valley_threshold and in_gap:
+ in_gap = False
+ if (x - gap_start) < min_space_width:
+ patched_projection[gap_start:x] = int(avg_peak_height)
+
+ unlabeled_boxes = []
+ in_word = False
+ start_x = 0
+ for x, col_sum in enumerate(patched_projection):
+ if col_sum > valley_threshold and not in_word:
+ start_x = x
+ in_word = True
+ elif col_sum <= valley_threshold and in_word:
+ # [NOTE] Returns full height stripe
+ unlabeled_boxes.append((start_x, 0, x - start_x, img_h))
+ in_word = False
+ if in_word:
+ unlabeled_boxes.append((start_x, 0, img_w - start_x, img_h))
+ return unlabeled_boxes
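+
+    # Worked example (illustrative values): with stable_avg_char_width = 20 and
+    # min_space_factor = 0.2, min_space_width = int(20 * 0.2) = 4, so a 2-column
+    # valley inside a word is patched back to the average peak height (no split),
+    # while a 6-column valley survives and becomes a word boundary.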
+
+ def _enforce_logical_constraints(
+ self, output: Dict[str, List], image_width: int, image_height: int
+ ) -> Dict[str, List]:
+ """
+ Enforces geometric sanity checks with 2D awareness.
+ """
+ if not output or not output["text"]:
+ return output
+
+ num_items = len(output["text"])
+ boxes = []
+ for i in range(num_items):
+ boxes.append(
+ {
+ "text": output["text"][i],
+ "left": int(output["left"][i]),
+ "top": int(output["top"][i]),
+ "width": int(output["width"][i]),
+ "height": int(output["height"][i]),
+ "conf": output["conf"][i],
+ }
+ )
+
+ valid_boxes = []
+ for box in boxes:
+ x0 = max(0, box["left"])
+ y0 = max(0, box["top"])
+ x1 = min(image_width, box["left"] + box["width"])
+ y1 = min(image_height, box["top"] + box["height"])
+
+ w = x1 - x0
+ h = y1 - y0
+
+ if w > 0 and h > 0:
+ box["left"] = x0
+ box["top"] = y0
+ box["width"] = w
+ box["height"] = h
+ valid_boxes.append(box)
+ boxes = valid_boxes
+
+ is_vertical = image_height > (image_width * 1.2)
+ if is_vertical:
+ boxes.sort(key=lambda b: (b["top"], b["left"]))
+ else:
+ boxes.sort(key=lambda b: (b["left"], -b["width"]))
+
+ final_pass_boxes = []
+ if boxes:
+ keep_indices = [True] * len(boxes)
+ # Fast path: adjacent comparisons after sorting removes most duplicates
+ # without full O(n^2) cross checks.
+ for i in range(len(boxes) - 1):
+ b1 = boxes[i]
+ b2 = boxes[i + 1]
+ x_nested = (b1["left"] >= b2["left"] - 2) and (
+ b1["left"] + b1["width"] <= b2["left"] + b2["width"] + 2
+ )
+ y_nested = (b1["top"] >= b2["top"] - 2) and (
+ b1["top"] + b1["height"] <= b2["top"] + b2["height"] + 2
+ )
+ if x_nested and y_nested and b1["text"] == b2["text"]:
+ if b1["width"] * b1["height"] <= b2["width"] * b2["height"]:
+ keep_indices[i] = False
+ # Also evaluate opposite containment (b2 inside b1).
+ x_nested_rev = (b2["left"] >= b1["left"] - 2) and (
+ b2["left"] + b2["width"] <= b1["left"] + b1["width"] + 2
+ )
+ y_nested_rev = (b2["top"] >= b1["top"] - 2) and (
+ b2["top"] + b2["height"] <= b1["top"] + b1["height"] + 2
+ )
+ if x_nested_rev and y_nested_rev and b1["text"] == b2["text"]:
+ if b2["width"] * b2["height"] <= b1["width"] * b1["height"]:
+ keep_indices[i + 1] = False
+
+ for i, keep in enumerate(keep_indices):
+ if keep:
+ final_pass_boxes.append(boxes[i])
+
+ boxes = final_pass_boxes
+
+ if is_vertical:
+ boxes.sort(key=lambda b: (b["top"], b["left"]))
+ else:
+ boxes.sort(key=lambda b: (b["left"], -b["width"]))
+
+ for i in range(len(boxes) - 1):
+ b1 = boxes[i]
+ b2 = boxes[i + 1]
+ x_overlap = min(b1["left"] + b1["width"], b2["left"] + b2["width"]) - max(
+ b1["left"], b2["left"]
+ )
+ y_overlap = min(b1["top"] + b1["height"], b2["top"] + b2["height"]) - max(
+ b1["top"], b2["top"]
+ )
+
+ if x_overlap > 0 and y_overlap > 0:
+ if is_vertical:
+ if b1["top"] < b2["top"]:
+ b1["height"] = max(1, b2["top"] - b1["top"])
+ else:
+ if b1["left"] < b2["left"]:
+ b1_right = b1["left"] + b1["width"]
+ b2_right = b2["left"] + b2["width"]
+ left_slice_width = max(0, b2["left"] - b1["left"])
+ right_slice_width = max(0, b1_right - b2_right)
+ if b1_right > b2_right and right_slice_width > left_slice_width:
+ b1["left"] = b2_right
+ b1["width"] = right_slice_width
+ else:
+ b1["width"] = max(1, left_slice_width)
+
+ cleaned_output = {
+ k: [] for k in ["text", "left", "top", "width", "height", "conf"]
+ }
+ if is_vertical:
+ boxes.sort(key=lambda b: (b["top"], b["left"]))
+ else:
+ boxes.sort(key=lambda b: (b["left"], -b["width"]))
+
+ for box in boxes:
+ for key in cleaned_output.keys():
+ cleaned_output[key].append(box[key])
+
+ return cleaned_output
+
+ def _is_geometry_valid(
+ self,
+ boxes: List[Tuple[int, int, int, int]],
+ words: List[str],
+ expected_height: float = 0,
+ ) -> bool:
+ """
+ Validates if the detected boxes are physically plausible.
+ [FIX] Improved robustness for punctuation and mixed-case text.
+ """
+ if len(boxes) != len(words):
+ return False
+
+ baseline = expected_height
+ # Use median only if provided expected height is unreliable
+ if baseline < 5:
+ heights = [b[3] for b in boxes]
+ if heights:
+ baseline = np.median(heights)
+
+ if baseline < 5:
+ return True
+
+ for i, box in enumerate(boxes):
+ word = words[i]
+
+ # [FIX] Check for punctuation/symbols. They are allowed to be small.
+ # If word is just punctuation, skip geometry checks
+ is_punctuation = not any(c.isalnum() for c in word)
+ if is_punctuation:
+ continue
+
+ # Standard checks for alphanumeric words
+ num_chars = len(word)
+ if num_chars < 1:
+ continue
+
+ width = box[2]
+ height = box[3]
+
+ # [FIX] Only reject height if it's REALLY small compared to baseline
+ # A period might be small, but we skipped that check above.
+ # This check ensures a real word like "The" isn't 2 pixels tall.
+ if height < (baseline * 0.20):
+ return False
+
+ avg_char_width = width / num_chars
+ min_expected = baseline * 0.20
+
+ # Only reject if it fails BOTH absolute (4px) and relative checks
+ if avg_char_width < min_expected and avg_char_width < 4:
+ # Exception: If the word is 1 char long (e.g. "I", "l", "1"), allow it to be skinny.
+ if num_chars == 1 and avg_char_width >= 2:
+ continue
+ return False
+
+ return True
+
+ def _estimate_noise(self, gray: np.ndarray) -> float:
+ """
+ Estimate image noisiness using Laplacian variance. Noisy images tend to have
+ high high-frequency content, so higher values indicate more noise (or very
+ sharp edges). Used to skip the primary segmentation pipeline when above
+ NOISE_THRESHOLD and use the fallback segmenter instead.
+ """
+ if gray is None or gray.size == 0:
+ return 0.0
+ lap = cv2.Laplacian(gray, cv2.CV_64F, ksize=3)
+ return float(lap.var())
+
+ def _block_size_from_median_cc_height(
+ self, gray: np.ndarray, line_height: int, fallback_block_size: int
+ ) -> int:
+ """
+ Determine adaptive threshold block size from median height of connected components
+ (resolution-independent). Uses an Otsu pre-pass to get CCs; if median height is
+ valid, returns block_size = median_cc_height * BLOCK_SIZE_MEDIAN_CC_FACTOR.
+ Otherwise returns fallback_block_size (e.g. from line_height).
+ """
+ if gray is None or gray.size == 0 or line_height < 3:
+ return fallback_block_size
+ _, otsu_binary = cv2.threshold(
+ gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+ )
+ num_labels, _, stats, _ = cv2.connectedComponentsWithStats(
+ otsu_binary, 8, cv2.CV_32S
+ )
+ if num_labels < 3: # background + need at least 2 components
+ return fallback_block_size
+ areas = stats[1:, cv2.CC_STAT_AREA]
+ heights = stats[1:, cv2.CC_STAT_HEIGHT]
+ min_area_cc = max(2, int((line_height * 0.02) ** 2))
+ valid = areas >= min_area_cc
+ if not np.any(valid):
+ return fallback_block_size
+ median_h = np.median(heights[valid])
+ if np.isnan(median_h) or median_h < 2:
+ return fallback_block_size
+ block = max(3, int(median_h * BLOCK_SIZE_MEDIAN_CC_FACTOR))
+ if block % 2 == 0:
+ block += 1
+ return block
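+
+    # Worked example (illustrative): a median connected-component height of 20 px
+    # gives block = int(20 * 1.2) = 24, bumped to 25 so the adaptive-threshold
+    # block size stays odd, as cv2.adaptiveThreshold requires.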
+
+ def _normalize_polarity_for_binarization(self, gray: np.ndarray) -> np.ndarray:
+ """
+ Ensure we work with dark-text-on-light-background for binarization. If the
+ image is mostly dark (light text on dark background), invert it so that
+ adaptive threshold and projection profile logic behave correctly.
+
+ Uses corner/edge regions to estimate background (typical in documents);
+ falls back to global mean for very small or full-page line crops.
+ """
+ if gray is None or gray.size == 0:
+ return gray
+ h, w = gray.shape
+ frac = POLARITY_CORNER_FRACTION
+ # Sample corners and edges (background is often visible there)
+ margin_w = max(1, int(w * frac))
+ margin_h = max(1, int(h * frac))
+ corner_pixels = []
+ if margin_w < w and margin_h < h:
+ top_left = gray[:margin_h, :margin_w]
+ top_right = gray[:margin_h, -margin_w:]
+ bottom_left = gray[-margin_h:, :margin_w]
+ bottom_right = gray[-margin_h:, -margin_w:]
+ for region in (top_left, top_right, bottom_left, bottom_right):
+ corner_pixels.append(region.ravel())
+ if corner_pixels:
+ corner_pixels = np.concatenate(corner_pixels)
+ background_mean = float(np.mean(corner_pixels))
+ else:
+ background_mean = float(np.mean(gray))
+ else:
+ background_mean = float(np.mean(gray))
+ if background_mean < POLARITY_MEAN_THRESHOLD:
+ return cv2.bitwise_not(gray)
+ return gray
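+
+    # Example (illustrative): a white-on-black line crop whose corner regions
+    # average around 30 (below POLARITY_MEAN_THRESHOLD = 128) is inverted here,
+    # so the downstream binarization still sees dark text on a light background.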
+
+ def segment(
+ self,
+ line_data: Dict[str, List],
+ line_image: np.ndarray,
+ min_space_factor=MIN_SPACE_FACTOR,
+ match_tolerance=MATCH_TOLERANCE,
+ image_name: str = None,
+ ) -> Tuple[Dict[str, List], bool]:
+ if (
+ line_image is None
+ or not isinstance(line_image, np.ndarray)
+ or line_image.size == 0
+ ):
+ return ({}, False)
+ # Allow grayscale (2 dims) or color (3 dims)
+ if len(line_image.shape) < 2:
+ return ({}, False)
+ if not line_data or not line_data.get("text") or len(line_data["text"]) == 0:
+ return ({}, False)
+
+ line_text = line_data["text"][0]
+ words = line_text.split()
+
+ # Early return if 1 or fewer words
+ if len(words) <= 1:
+ img_h, img_w = line_image.shape[:2]
+ one_word_result = self.fallback_segmenter.convert_line_to_word_level(
+ line_data, img_w, img_h
+ )
+ return (one_word_result, False)
+
+ # Validate that line_image is not empty before processing
+ if line_image is None or line_image.size == 0 or len(line_image.shape) < 2:
+ # If line_image is empty, fall back to proportional estimation
+ return {}, False
+
+ line_number = line_data["line"][0]
+ safe_image_name = "image"
+ safe_line_number = str(line_number)
+ safe_shortened_line_text = "line"
+ if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
+ safe_image_name = _sanitize_filename(image_name or "image", max_length=50)
+ safe_line_number = _sanitize_filename(str(line_number), max_length=10)
+ safe_shortened_line_text = _sanitize_filename(line_text, max_length=10)
+
+ if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
+ os.makedirs(self.output_folder, exist_ok=True)
+ output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_original.png"
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
+ # Only write if image is valid
+ if line_image.size > 0 and len(line_image.shape) >= 2:
+ cv2.imwrite(output_path, line_image)
+
+ if len(line_image.shape) == 3:
+ gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
+ else:
+ gray = line_image.copy()
+
+ # ========================================================================
+ # IMAGE PREPROCESSING (Deskew / Rotate)
+ # ========================================================================
+ if self._can_skip_expensive_preprocess(gray):
+ h, w = gray.shape[:2]
+ deskewed_gray = gray
+ deskewed_line_image = line_image.copy()
+ M = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=np.float32)
+ else:
+ oriented_gray, M_orient = self._correct_orientation(gray)
+ deskewed_gray, M_skew = self._deskew_image(oriented_gray)
+
+ # Combine matrices: M_total = M_skew * M_orient
+ M_orient_3x3 = np.vstack([M_orient, [0, 0, 1]])
+ M_skew_3x3 = np.vstack([M_skew, [0, 0, 1]])
+ M_total_3x3 = M_skew_3x3 @ M_orient_3x3
+ M = M_total_3x3[0:2, :] # Extract 2x3 affine matrix
+
+ # Apply transformation to the original color image
+ h, w = deskewed_gray.shape
+ deskewed_line_image = cv2.warpAffine(
+ line_image,
+ M,
+ (w, h),
+ flags=cv2.INTER_CUBIC,
+ borderMode=cv2.BORDER_REPLICATE,
+ )
+
+ # [FIX] Create Local Line Data that matches the deskewed/rotated image dimensions.
+ # This prevents the fallback segmenter from using vertical dimensions on a horizontal image.
+ local_line_data = {
+ "text": line_data["text"],
+ "conf": line_data["conf"],
+ "left": [0], # Local coordinate system starts at 0
+ "top": [0],
+ "width": [w], # Use the ROTATED width
+ "height": [h], # Use the ROTATED height
+ "line": line_data.get("line", [0]),
+ }
+
+ if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
+ os.makedirs(self.output_folder, exist_ok=True)
+ output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_deskewed.png"
+ cv2.imwrite(output_path, deskewed_line_image)
+
+ # ========================================================================
+ # MAIN SEGMENTATION PIPELINE
+ # ========================================================================
+ approx_char_count = len(line_data["text"][0].replace(" ", ""))
+ if approx_char_count == 0:
+ return {}, False
+
+ img_h, img_w = deskewed_gray.shape
+ line_height = img_h
+ estimated_char_height = img_h * 0.6
+ avg_char_width_approx = img_w / approx_char_count
+
+ # Block size from line height (resolution-independent); could be refined from median CC height in two-pass
+ block_size = max(3, int(line_height * BLOCK_SIZE_FACTOR))
+ if block_size % 2 == 0:
+ block_size += 1
+
+ # Noise threshold scaled by line height so behavior is resolution-independent
+ effective_noise_threshold = NOISE_THRESHOLD * (
+ line_height / REFERENCE_LINE_HEIGHT
+ )
+
+ # --- Noise check: skip primary pipeline if image is too noisy ---
+ noise_level = self._estimate_noise(deskewed_gray)
+ if noise_level > effective_noise_threshold:
+ used_fallback = True
+ final_output = self.fallback_segmenter.refine_words_bidirectional(
+ local_line_data, deskewed_line_image
+ )
+ else:
+ # --- Polarity: ensure dark text on light background for binarization ---
+ gray_for_binary = self._normalize_polarity_for_binarization(deskewed_gray)
+
+ # Refine block size from median CC height (Otsu pre-pass) when possible
+ block_size = self._block_size_from_median_cc_height(
+ gray_for_binary, line_height, block_size
+ )
+
+ # --- Binarization ---
+ binary_adaptive = cv2.adaptiveThreshold(
+ gray_for_binary,
+ 255,
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+ cv2.THRESH_BINARY_INV,
+ block_size,
+ C_VALUE,
+ )
+ otsu_thresh_val, _ = cv2.threshold(
+ gray_for_binary, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+ )
+ strict_thresh_val = otsu_thresh_val * 0.75
+ _, binary_strict = cv2.threshold(
+ gray_for_binary, strict_thresh_val, 255, cv2.THRESH_BINARY_INV
+ )
+ binary = cv2.bitwise_and(binary_adaptive, binary_strict)
+
+ if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
+ output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_binary.png"
+ cv2.imwrite(output_path, binary)
+
+ # --- Morphological Closing ---
+ morph_width = max(3, int(avg_char_width_approx * 0.40))
+ morph_height = max(2, int(avg_char_width_approx * 0.1))
+ kernel = cv2.getStructuringElement(
+ cv2.MORPH_RECT, (morph_width, morph_height)
+ )
+ closed_binary = cv2.morphologyEx(
+ binary, cv2.MORPH_CLOSE, kernel, iterations=1
+ )
+
+ # --- Noise Removal ---
+ num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
+ closed_binary, 8, cv2.CV_32S
+ )
+ clean_binary = np.zeros_like(binary)
+
+ force_fallback = False
+ significant_labels = 0
+ if num_labels > 1:
+ # Only count components with area > 3 pixels
+ significant_labels = np.sum(stats[1:, cv2.CC_STAT_AREA] > 3)
+
+ if approx_char_count > 0 and significant_labels > (approx_char_count * 12):
+ force_fallback = True
+
+ if num_labels > 1:
+ areas = stats[1:, cv2.CC_STAT_AREA]
+ if len(areas) == 0:
+ clean_binary = binary
+ areas = np.array([0])
+ else:
+ p1 = np.percentile(areas, 1)
+ img_h, img_w = binary.shape
+ line_h = img_h
+ estimated_char_height = img_h * 0.7
+ # Resolution-independent min area: (line_height * 0.05)^2 with floor
+ min_area_threshold = max(
+ MIN_AREA_FLOOR,
+ int((line_h * MIN_AREA_HEIGHT_FRACTION) ** 2),
+ )
+ estimated_min_letter_area = max(
+ 2,
+ int(estimated_char_height * 0.2 * estimated_char_height * 0.15),
+ )
+ area_threshold = max(
+ min_area_threshold, min(p1, estimated_min_letter_area)
+ )
+
+                # Gap detection: find a large jump in the sorted component areas to separate speckle noise from real letters.
+ sorted_areas = np.sort(areas)
+ area_diffs = np.diff(sorted_areas)
+ if len(sorted_areas) > 10 and len(area_diffs) > 0:
+ jump_threshold = np.percentile(area_diffs, 95)
+ significant_jump_thresh = max(10, jump_threshold * 3)
+ jump_indices = np.where(area_diffs > significant_jump_thresh)[0]
+ if len(jump_indices) > 0:
+ gap_idx = jump_indices[0]
+ area_before_gap = sorted_areas[gap_idx]
+ final_threshold = max(area_before_gap + 1, area_threshold)
+ final_threshold = min(final_threshold, 15)
+ area_threshold = final_threshold
+
+ for i in range(1, num_labels):
+ if stats[i, cv2.CC_STAT_AREA] >= area_threshold:
+ clean_binary[labels == i] = 255
+ else:
+ clean_binary = binary
+
+ # Validate clean_binary is not empty before proceeding
+ if (
+ clean_binary is None
+ or clean_binary.size == 0
+ or len(clean_binary.shape) < 2
+ ):
+ # If clean_binary is empty, fall back to proportional estimation
+ return {}, False
+
+ # --- Vertical Cropping ---
+ horizontal_projection = np.sum(clean_binary, axis=1)
+ y_start = 0
+ non_zero_rows = np.where(horizontal_projection > 0)[0]
+ if len(non_zero_rows) > 0:
+ p_top = int(np.percentile(non_zero_rows, 5))
+ p_bottom = int(np.percentile(non_zero_rows, 95))
+ core_height = p_bottom - p_top
+ trim_pixels = int(core_height * 0.1)
+ y_start = max(0, p_top + trim_pixels)
+ y_end = min(clean_binary.shape[0], p_bottom - trim_pixels)
+ if y_end - y_start < 5:
+ y_start = p_top
+ y_end = p_bottom
+ # Ensure y_end > y_start to avoid empty slice
+ if y_end > y_start:
+ analysis_image = clean_binary[y_start:y_end, :]
+ else:
+ # If slice would be empty, use the full image
+ analysis_image = clean_binary
+ else:
+ analysis_image = clean_binary
+
+ # Validate that analysis_image is not empty before proceeding
+ if (
+ analysis_image is None
+ or analysis_image.size == 0
+ or len(analysis_image.shape) < 2
+ ):
+ # If analysis_image is empty, fall back to proportional estimation
+ return {}, False
+
+ if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
+ # Validate that analysis_image is not empty before writing
+ if analysis_image.size > 0 and len(analysis_image.shape) >= 2:
+ output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_clean_binary.png"
+ cv2.imwrite(output_path, analysis_image)
+
+ # --- Adaptive Search ---
+ best_boxes = None
+ successful_binary_image = None
+
+ if not force_fallback:
+ words = line_data["text"][0].split()
+ target = len(words)
+ backup_boxes_s1 = None
+ best_stage1_diff = float("inf")
+ best_stage1_factor = INITIAL_VALLEY_THRESHOLD_FACTOR
+
+ # STAGE 1
+ stage1_coarse = np.arange(
+ INITIAL_VALLEY_THRESHOLD_FACTOR, 0.60, SEARCH_STAGE1_COARSE_STEP
+ )
+ for v_factor in stage1_coarse:
+ curr_boxes = self._get_boxes_from_profile(
+ analysis_image,
+ avg_char_width_approx,
+ min_space_factor,
+ v_factor,
+ )
+ diff = abs(target - len(curr_boxes))
+ is_geom_valid = self._is_geometry_valid(
+ curr_boxes, words, estimated_char_height
+ )
+ if diff < best_stage1_diff:
+ best_stage1_diff = diff
+ best_stage1_factor = float(v_factor)
+
+ if diff == 0:
+ if is_geom_valid:
+ best_boxes = curr_boxes
+ successful_binary_image = analysis_image
+ break
+ else:
+ if backup_boxes_s1 is None:
+ backup_boxes_s1 = curr_boxes
+ if (
+ diff <= ALLOWED_WORD_MISMATCH_COUNT
+ and backup_boxes_s1 is None
+ and is_geom_valid
+ ):
+ backup_boxes_s1 = curr_boxes
+
+ # Refine around best coarse factor only when needed.
+ if best_boxes is None:
+ lower = max(
+ INITIAL_VALLEY_THRESHOLD_FACTOR,
+ best_stage1_factor - SEARCH_STAGE1_COARSE_STEP,
+ )
+ upper = min(0.60, best_stage1_factor + SEARCH_STAGE1_COARSE_STEP)
+ for v_factor in np.arange(
+ lower, upper + 1e-9, SEARCH_STAGE1_FINE_STEP
+ ):
+ curr_boxes = self._get_boxes_from_profile(
+ analysis_image,
+ avg_char_width_approx,
+ min_space_factor,
+ v_factor,
+ )
+ diff = abs(target - len(curr_boxes))
+ is_geom_valid = self._is_geometry_valid(
+ curr_boxes, words, estimated_char_height
+ )
+ if diff == 0 and is_geom_valid:
+ best_boxes = curr_boxes
+ successful_binary_image = analysis_image
+ break
+ if (
+ diff <= ALLOWED_WORD_MISMATCH_COUNT
+ and backup_boxes_s1 is None
+ and is_geom_valid
+ ):
+ backup_boxes_s1 = curr_boxes
+
+ # STAGE 2 (if needed)
+ if best_boxes is None:
+ backup_boxes_s2 = None
+ best_stage2_diff = float("inf")
+ best_stage2_factor = INITIAL_KERNEL_WIDTH_FACTOR
+ for k_factor in np.arange(
+ INITIAL_KERNEL_WIDTH_FACTOR, 0.5, SEARCH_STAGE2_COARSE_STEP
+ ):
+ k_w = max(1, int(avg_char_width_approx * k_factor))
+ s2_bin = cv2.morphologyEx(
+ clean_binary, cv2.MORPH_CLOSE, np.ones((1, k_w), np.uint8)
+ )
+ s2_img = (
+ s2_bin[y_start:y_end, :]
+ if len(non_zero_rows) > 0
+ else s2_bin
+ )
+
+ if s2_img is None or s2_img.size == 0:
+ continue
+
+ curr_boxes = self._get_boxes_from_profile(
+ s2_img,
+ avg_char_width_approx,
+ min_space_factor,
+ MAIN_VALLEY_THRESHOLD_FACTOR,
+ )
+ diff = abs(target - len(curr_boxes))
+ if diff < best_stage2_diff:
+ best_stage2_diff = diff
+ best_stage2_factor = float(k_factor)
+ is_geom_valid = self._is_geometry_valid(
+ curr_boxes, words, estimated_char_height
+ )
+
+ if diff == 0 and is_geom_valid:
+ best_boxes = curr_boxes
+ successful_binary_image = s2_bin
+ break
+
+ if (
+ diff <= ALLOWED_WORD_MISMATCH_COUNT
+ and backup_boxes_s2 is None
+ and is_geom_valid
+ ):
+ backup_boxes_s2 = curr_boxes
+
+ if best_boxes is None:
+ lower = max(
+ INITIAL_KERNEL_WIDTH_FACTOR,
+ best_stage2_factor - SEARCH_STAGE2_COARSE_STEP,
+ )
+ upper = min(0.5, best_stage2_factor + SEARCH_STAGE2_COARSE_STEP)
+ for k_factor in np.arange(
+ lower, upper + 1e-9, SEARCH_STAGE2_FINE_STEP
+ ):
+ k_w = max(1, int(avg_char_width_approx * k_factor))
+ s2_bin = cv2.morphologyEx(
+ clean_binary,
+ cv2.MORPH_CLOSE,
+ np.ones((1, k_w), np.uint8),
+ )
+ s2_img = (
+ s2_bin[y_start:y_end, :]
+ if len(non_zero_rows) > 0
+ else s2_bin
+ )
+ if s2_img is None or s2_img.size == 0:
+ continue
+ curr_boxes = self._get_boxes_from_profile(
+ s2_img,
+ avg_char_width_approx,
+ min_space_factor,
+ MAIN_VALLEY_THRESHOLD_FACTOR,
+ )
+ diff = abs(target - len(curr_boxes))
+ is_geom_valid = self._is_geometry_valid(
+ curr_boxes, words, estimated_char_height
+ )
+ if diff == 0 and is_geom_valid:
+ best_boxes = curr_boxes
+ successful_binary_image = s2_bin
+ break
+ if (
+ diff <= ALLOWED_WORD_MISMATCH_COUNT
+ and backup_boxes_s2 is None
+ and is_geom_valid
+ ):
+ backup_boxes_s2 = curr_boxes
+
+ if best_boxes is None:
+ if backup_boxes_s1 is not None:
+ best_boxes = backup_boxes_s1
+ successful_binary_image = analysis_image
+ elif backup_boxes_s2 is not None:
+ best_boxes = backup_boxes_s2
+ successful_binary_image = clean_binary
+
+ final_output = None
+ used_fallback = False
+
+ if best_boxes is None:
+ # --- FALLBACK WITH ROTATED DATA ---
+ used_fallback = True
+ # [FIX] Use local_line_data (rotated dims) instead of line_data (original dims)
+ final_output = self.fallback_segmenter.refine_words_bidirectional(
+ local_line_data, deskewed_line_image
+ )
+ else:
+ # --- CCA Refinement ---
+ unlabeled_boxes = best_boxes
+ if successful_binary_image is analysis_image:
+ cca_source_image = clean_binary
+ else:
+ cca_source_image = successful_binary_image
+
+ num_labels, _, stats, _ = cv2.connectedComponentsWithStats(
+ cca_source_image, 8, cv2.CV_32S
+ )
+ cca_img_h, cca_img_w = cca_source_image.shape[:2]
+
+ component_assignments = {}
+ num_proc = min(len(words), len(unlabeled_boxes))
+ min_valid_component_area = estimated_char_height * 2
+ box_meta = []
+ for i in range(num_proc):
+ box_x, box_y, box_w, box_h = unlabeled_boxes[i]
+ box_r = box_x + box_w
+ box_center_x = box_x + box_w / 2
+ box_meta.append((i, box_x, box_r, box_center_x, box_w))
+ box_starts = [meta[1] for meta in box_meta]
+
+ for j in range(1, num_labels):
+ comp_x = stats[j, cv2.CC_STAT_LEFT]
+ comp_w = stats[j, cv2.CC_STAT_WIDTH]
+ comp_area = stats[j, cv2.CC_STAT_AREA]
+ comp_r = comp_x + comp_w
+ comp_center_x = comp_x + comp_w / 2
+ comp_y = stats[j, cv2.CC_STAT_TOP]
+ comp_h = stats[j, cv2.CC_STAT_HEIGHT]
+ comp_center_y = comp_y + comp_h / 2
+
+ if (
+ comp_center_y < cca_img_h * 0.1
+ or comp_center_y > cca_img_h * 0.9
+ ):
+ continue
+ if comp_area < min_valid_component_area:
+ continue
+
+ best_box_idx = None
+ max_overlap = 0
+ best_center_distance = float("inf")
+ component_center_in_box = False
+
+                # Assign this component to the word box with the best horizontal overlap, preferring boxes that contain the component centre.
+ # Candidate pruning: only evaluate boxes near this component.
+ left_search = max(0, comp_x - comp_w)
+ right_search = comp_r + comp_w
+ start_idx = bisect_left(box_starts, left_search)
+ idx = start_idx
+ while idx < len(box_meta) and box_meta[idx][1] <= right_search:
+ i, box_x, box_r, box_center_x, box_w = box_meta[idx]
+ idx += 1
+ if comp_w > box_w * 1.5:
+ continue
+
+ if comp_x < box_r and box_x < comp_r:
+ overlap_start = max(comp_x, box_x)
+ overlap_end = min(comp_r, box_r)
+ overlap = overlap_end - overlap_start
+
+ if overlap > 0:
+ center_in_box = box_x <= comp_center_x < box_r
+ center_distance = abs(comp_center_x - box_center_x)
+
+ if center_in_box:
+ if (
+ not component_center_in_box
+ or overlap > max_overlap
+ ):
+ component_center_in_box = True
+ best_center_distance = center_distance
+ max_overlap = overlap
+ best_box_idx = i
+ elif not component_center_in_box:
+ if center_distance < best_center_distance or (
+ center_distance == best_center_distance
+ and overlap > max_overlap
+ ):
+ best_center_distance = center_distance
+ max_overlap = overlap
+ best_box_idx = i
+
+ if best_box_idx is not None:
+ component_assignments[j] = best_box_idx
+
+ refined_boxes_list = []
+ for i in range(num_proc):
+ word_label = words[i]
+ components_in_box = [
+ stats[j] for j, b in component_assignments.items() if b == i
+ ]
+
+ use_original_box = False
+ if not components_in_box:
+ use_original_box = True
+ else:
+ min_x = min(c[cv2.CC_STAT_LEFT] for c in components_in_box)
+ min_y = min(c[cv2.CC_STAT_TOP] for c in components_in_box)
+ max_r = max(
+ c[cv2.CC_STAT_LEFT] + c[cv2.CC_STAT_WIDTH]
+ for c in components_in_box
+ )
+ max_b = max(
+ c[cv2.CC_STAT_TOP] + c[cv2.CC_STAT_HEIGHT]
+ for c in components_in_box
+ )
+ cca_h = max(1, max_b - min_y)
+ if cca_h < (estimated_char_height * 0.35):
+ use_original_box = True
+
+ if use_original_box:
+ box_x, box_y, box_w, box_h = unlabeled_boxes[i]
+ adjusted_box_y = y_start + box_y
+ refined_boxes_list.append(
+ {
+ "text": word_label,
+ "left": box_x,
+ "top": adjusted_box_y,
+ "width": box_w,
+ "height": box_h,
+ "conf": line_data["conf"][0],
+ }
+ )
+ else:
+ refined_boxes_list.append(
+ {
+ "text": word_label,
+ "left": min_x,
+ "top": min_y,
+ "width": max(1, max_r - min_x),
+ "height": cca_h,
+ "conf": line_data["conf"][0],
+ }
+ )
+
+ # Check validity
+ cca_check_list = [
+ (b["left"], b["top"], b["width"], b["height"])
+ for b in refined_boxes_list
+ ]
+ if not self._is_geometry_valid(
+ cca_check_list, words, estimated_char_height
+ ):
+ if abs(len(refined_boxes_list) - len(words)) > 1:
+ best_boxes = None # Trigger fallback
+ else:
+ final_output = {
+ k: []
+ for k in ["text", "left", "top", "width", "height", "conf"]
+ }
+ for box in refined_boxes_list:
+ for key in final_output.keys():
+ final_output[key].append(box[key])
+ else:
+ final_output = {
+ k: []
+ for k in ["text", "left", "top", "width", "height", "conf"]
+ }
+ for box in refined_boxes_list:
+ for key in final_output.keys():
+ final_output[key].append(box[key])
+
+ # --- REPEAT FALLBACK IF VALIDATION FAILED ---
+ if best_boxes is None and not used_fallback:
+ used_fallback = True
+ # [FIX] Use local_line_data here too
+ final_output = self.fallback_segmenter.refine_words_bidirectional(
+ local_line_data, deskewed_line_image
+ )
+
+ # ========================================================================
+ # COORDINATE TRANSFORMATION (Map back to Original)
+ # ========================================================================
+ M_inv = cv2.invertAffineTransform(M)
+ remapped_boxes_list = []
+ for i in range(len(final_output["text"])):
+ left, top = final_output["left"][i], final_output["top"][i]
+ width, height = final_output["width"][i], final_output["height"][i]
+
+ # Map the 4 corners
+ corners = np.array(
+ [
+ [left, top],
+ [left + width, top],
+ [left + width, top + height],
+ [left, top + height],
+ ],
+ dtype="float32",
+ )
+ corners_expanded = np.expand_dims(corners, axis=1)
+ original_corners = cv2.transform(corners_expanded, M_inv)
+ squeezed_corners = original_corners.squeeze(axis=1)
+
+ # Get axis aligned bounding box in original space
+ min_x = int(np.min(squeezed_corners[:, 0]))
+ max_x = int(np.max(squeezed_corners[:, 0]))
+ min_y = int(np.min(squeezed_corners[:, 1]))
+ max_y = int(np.max(squeezed_corners[:, 1]))
+
+ remapped_boxes_list.append(
+ {
+ "text": final_output["text"][i],
+ "left": min_x,
+ "top": min_y,
+ "width": max_x - min_x,
+ "height": max_y - min_y,
+ "conf": final_output["conf"][i],
+ }
+ )
+
+ remapped_output = {k: [] for k in final_output.keys()}
+ for box in remapped_boxes_list:
+ for key in remapped_output.keys():
+ remapped_output[key].append(box[key])
+
+ img_h, img_w = line_image.shape[:2]
+ remapped_output = self._enforce_logical_constraints(
+ remapped_output, img_w, img_h
+ )
+
+ # ========================================================================
+ # FINAL SAFETY NET
+ # ========================================================================
+ words = line_data["text"][0].split()
+ target_count = len(words)
+ current_count = len(remapped_output["text"])
+ has_collapsed_boxes = any(w < 3 for w in remapped_output["width"])
+
+ if current_count > 0:
+ total_text_len = sum(len(t) for t in remapped_output["text"])
+ total_box_width = sum(remapped_output["width"])
+ avg_width_pixels = total_box_width / max(1, total_text_len)
+ else:
+ avg_width_pixels = 0
+ is_suspiciously_thin = avg_width_pixels < 4
+
+ if current_count != target_count or is_suspiciously_thin or has_collapsed_boxes:
+ used_fallback = True
+
+ # [FIX] Do NOT use original line_image/line_data here.
+ # Use the local_line_data + deskewed_line_image pipeline,
+ # then transform back using M_inv (same as above).
+
+ # 1. Run fallback on rotated data
+ temp_local_output = self.fallback_segmenter.refine_words_bidirectional(
+ local_line_data, deskewed_line_image
+ )
+
+ # 2. If bidirectional failed to split correctly, use purely mathematical split on rotated data
+ if len(temp_local_output["text"]) != target_count:
+ h, w = deskewed_line_image.shape[:2]
+ temp_local_output = self.fallback_segmenter.convert_line_to_word_level(
+ local_line_data, w, h
+ )
+
+ # 3. Transform the result back to original coordinates (M_inv)
+ # (Repeating the transformation logic for the safety net result)
+ remapped_boxes_list = []
+ for i in range(len(temp_local_output["text"])):
+ left, top = temp_local_output["left"][i], temp_local_output["top"][i]
+ width, height = (
+ temp_local_output["width"][i],
+ temp_local_output["height"][i],
+ )
+
+ corners = np.array(
+ [
+ [left, top],
+ [left + width, top],
+ [left + width, top + height],
+ [left, top + height],
+ ],
+ dtype="float32",
+ )
+ corners_expanded = np.expand_dims(corners, axis=1)
+ original_corners = cv2.transform(corners_expanded, M_inv)
+ squeezed_corners = original_corners.squeeze(axis=1)
+
+ min_x = int(np.min(squeezed_corners[:, 0]))
+ max_x = int(np.max(squeezed_corners[:, 0]))
+ min_y = int(np.min(squeezed_corners[:, 1]))
+ max_y = int(np.max(squeezed_corners[:, 1]))
+
+ remapped_boxes_list.append(
+ {
+ "text": temp_local_output["text"][i],
+ "left": min_x,
+ "top": min_y,
+ "width": max_x - min_x,
+ "height": max_y - min_y,
+ "conf": temp_local_output["conf"][i],
+ }
+ )
+
+ remapped_output = {k: [] for k in temp_local_output.keys()}
+ for box in remapped_boxes_list:
+ for key in remapped_output.keys():
+ remapped_output[key].append(box[key])
+
+ if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
+ output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_shortened_line_text}_final_boxes.png"
+ os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
+ output_image_vis = line_image.copy()
+ for i in range(len(remapped_output["text"])):
+ x, y, w, h = (
+ int(remapped_output["left"][i]),
+ int(remapped_output["top"][i]),
+ int(remapped_output["width"][i]),
+ int(remapped_output["height"][i]),
+ )
+ cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
+ cv2.imwrite(output_path, output_image_vis)
+
+ return remapped_output, used_fallback
+
+
+class HybridWordSegmenter:
+ """
+ Implements a two-step approach for word segmentation:
+ 1. Proportional estimation based on text (primary; avoids image noise).
+ 2. Image-based refinement with a "Bounded Scan" that cannot shrink boxes
+ beyond a fraction of the text-based width.
+
+ Design: Relies more on expected character spacing from the text than on
+ image analysis, so noisy images are less likely to produce tiny or
+ missing boxes.
+
+ Situations that could otherwise cause very small boxes (and how we mitigate):
+ - False gaps in the vertical projection (noise/speckle) -> refinement is
+ bounded by shrink_limit_fraction; initial boxes use proportional only.
+ - Image-based "justified" gap anchoring picking wrong cuts -> we do not
+ use vertical_projection for initial segmentation here; only proportional.
+ - Bidirectional scan snapping to a thin low-density strip inside a word ->
+ same shrink bound; fallback "thinnest point" also clamped.
+ - De-overlapping stealing space from the next word -> shrink bound keeps
+ each box at least (1 - shrink_limit_fraction) of initial width.
+
+ ROBUSTNESS UPGRADES:
+ - Uses Horizontal Smearing to prevent cutting inside noisy characters.
+ - Uses Gaussian Blur to suppress speckle noise.
+ - Implements 'Noise Floors' for gap detection (never assumes perfect 0).
+ """
+
+ def convert_line_to_word_level(
+ self,
+ line_data: Dict[str, List],
+ image_width: int,
+ image_height: int,
+ vertical_projection: np.ndarray = None,
+ ) -> Dict[str, List]:
+ """
+ Step 1: Converts line-level OCR results to word-level using proportional estimation.
+ Includes noise-tolerant gap anchoring for justified text.
+ """
+ output = {
+ "text": list(),
+ "left": list(),
+ "top": list(),
+ "width": list(),
+ "height": list(),
+ "conf": list(),
+ }
+
+ if not line_data or not line_data.get("text"):
+ return output
+
+ i = 0
+ line_text = line_data["text"][i]
+ line_left = float(line_data["left"][i])
+ line_top = float(line_data["top"][i])
+ line_width = float(line_data["width"][i])
+ line_height = float(line_data["height"][i])
+ line_conf = line_data["conf"][i]
+
+ if not line_text.strip():
+ return output
+ words = line_text.split()
+ if not words:
+ return output
+ num_chars = len("".join(words))
+ num_spaces = len(words) - 1
+ if num_chars == 0:
+ return output
+
+ # --- Justified text: anchor cut points to widest zero-gaps in projection ---
+ if (
+ vertical_projection is not None
+ and len(vertical_projection) == image_width
+ and num_spaces > 0
+ ):
+ # ROBUSTNESS: Allow significantly more noise in gaps for justified text detection.
+ # Allow up to 3% of the column height (at minimum two fully-inked pixels) to be noise and still count as a "gap".
+ dynamic_gap_threshold = max(255.0 * 0.03 * image_height, 255.0 * 2)
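+ # Example (hypothetical numbers): for image_height = 40 this is
+ # max(255 * 0.03 * 40, 255 * 2) = max(306, 510) = 510, i.e. a column may still
+ # contain roughly two fully-inked pixels and count as part of a gap.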
+ gaps = _find_widest_zero_gaps(
+ vertical_projection, n=num_spaces, gap_threshold=dynamic_gap_threshold
+ )
+ if len(gaps) == num_spaces:
+ cuts = [0]
+ for start, end in gaps:
+ cuts.append((start + end) // 2)
+ cuts.append(image_width)
+
+ for idx, word in enumerate(words):
+ left_px = cuts[idx]
+ right_px = cuts[idx + 1]
+ width_px = max(1, right_px - left_px)
+ output["text"].append(word)
+ output["left"].append(line_left + left_px)
+ output["top"].append(line_top)
+ output["width"].append(width_px)
+ output["height"].append(line_height)
+ output["conf"].append(line_conf)
+ return output
+
+ # --- Proportional estimation ---
+ total_line_weight = get_weighted_length(line_text)
+ if total_line_weight <= 0:
+ total_line_weight = 1.0
+ avg_weight_unit = line_width / total_line_weight
+ estimated_space_width = get_weighted_length(" ") * avg_weight_unit
+
+ avg_char_width = line_width / (num_chars if num_chars > 0 else 1)
+ avg_char_width = max(3.0, avg_char_width)
+ min_word_width = max(5.0, avg_char_width * 0.5)
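+ # Worked sketch (hypothetical numbers, assuming get_weighted_length weights
+ # characters roughly equally): for "Hello world" in a 200 px line, each word
+ # receives a width proportional to its share of the line's weighted length
+ # (~5/11 of 200 px each, with the remainder assigned to the space), and word
+ # widths are floored at min_word_width so they never collapse to a sliver.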
+
+ current_left = line_left
+ for word in words:
+ word_weight = get_weighted_length(word)
+ raw_word_width = word_weight * avg_weight_unit
+ word_width = max(min_word_width, raw_word_width)
+
+ clamped_left = max(0, min(current_left, image_width))
+ output["text"].append(word)
+ output["left"].append(clamped_left)
+ output["top"].append(line_top)
+ output["width"].append(word_width)
+ output["height"].append(line_height)
+ output["conf"].append(line_conf)
+ current_left += word_width + estimated_space_width
+
+ return output
+
+ def _run_single_pass(
+ self,
+ initial_boxes: List[Dict],
+ vertical_projection: np.ndarray,
+ max_scan_distance: int,
+ img_w: int,
+ img_h: int,
+ direction: str = "ltr",
+ trailing_punctuation: List[bool] = None,
+ shrink_limit_fraction: float = 0.5,
+ ) -> List[Dict]:
+ """
+ Helper function to run one pass of refinement.
+ ROBUSTNESS UPGRADE:
+ - Uses a 'gap_noise_floor' instead of looking for 0.
+ - Enforces 'safety_density_limit': if the "thinnest" point is still thick (ink),
+ it refuses to cut there (prevents cutting bold letters).
+ - shrink_limit_fraction: Refinement cannot shrink a box by more than this fraction
+ of its initial (text-based) width from either edge. Prevents noise from creating
+ tiny boxes; keeps segmentation anchored to expected character spacing.
+ """
+
+ refined_boxes = [box.copy() for box in initial_boxes]
+ if trailing_punctuation is None:
+ trailing_punctuation = [False] * len(initial_boxes)
+
+ # ROBUSTNESS: Define what constitutes a "gap" vs "ink"
+ # 1. Gap Floor: Anything below 5% of image height is treated as empty space (noise tolerance)
+ gap_noise_floor = 255.0 * (img_h * 0.05)
+
+ # 2. Ink Safety Limit: If the "thinnest" point has > 25% ink density, it is NOT a gap.
+ # It's a character. Do not cut.
+ safety_density_limit = 255.0 * (img_h * 0.25)
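+ # Example (hypothetical img_h = 40): gap_noise_floor = 255 * 2 = 510
+ # (a column with up to ~2 fully-inked pixels still reads as a gap) and
+ # safety_density_limit = 255 * 10 = 2550 (a "thinnest" column with more than
+ # ~10 inked pixels is treated as a character stroke and never cut).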
+
+ if direction == "ltr":
+ last_corrected_right_edge = 0
+ indices = range(len(refined_boxes))
+ else: # rtl
+ next_corrected_left_edge = img_w
+ indices = range(len(refined_boxes) - 1, -1, -1)
+
+ for i in indices:
+ box = refined_boxes[i]
+ left = int(box["left"])
+ right = int(box["left"] + box["width"])
+ init_width = max(1, int(box["width"]))
+ # Bounds from initial (text-based) box: don't let image refinement shrink too much
+ min_right = right - int(shrink_limit_fraction * init_width)
+ max_left = left + int(shrink_limit_fraction * init_width)
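+ # Example (hypothetical box left = 100, width = 60, shrink_limit_fraction = 0.5):
+ # min_right = 160 - 30 = 130 and max_left = 100 + 30 = 130, so refinement may
+ # move each edge inward by at most half of the text-estimated width.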
+
+ left = max(0, min(left, img_w - 1))
+ right = max(0, min(right, img_w - 1))
+
+ new_left, new_right = left, right
+
+ if direction == "ltr" or direction == "both": # Scan right
+ if right < img_w:
+ scan_limit = min(img_w, right + max_scan_distance)
+ search_range = range(right, scan_limit)
+
+ best_x = right
+ min_density = float("inf")
+ found_gap = False
+ first_gap_x = None
+
+ for x in search_range:
+ density = vertical_projection[x]
+
+ # Check for Gap
+ if density <= gap_noise_floor:
+ first_gap_x = x
+ found_gap = True
+ break
+
+ # Track minimum density for fallback
+ if density < min_density:
+ min_density = density
+ best_x = x
+
+ if found_gap and first_gap_x is not None:
+ if trailing_punctuation[i]:
+ # The word ends in punctuation: jump over the gap and include the
+ # following small ink blob, subject to the safety limits below
+ # (half the scan distance for the gap, half the line height for the blob).
+ proj_len = len(vertical_projection)
+ x_pos = first_gap_x
+
+ # 1. Cross the gap
+ gap_safety_limit = x_pos + (max_scan_distance // 2)
+ while (
+ x_pos < scan_limit
+ and x_pos < proj_len
+ and vertical_projection[x_pos] <= gap_noise_floor
+ ):
+ if x_pos >= gap_safety_limit:
+ break
+ x_pos += 1
+
+ # 2. Consume blob
+ blob_start = x_pos
+ blob_safety_limit = blob_start + max(1, int(img_h * 0.5))
+ while (
+ x_pos < scan_limit
+ and x_pos < proj_len
+ and vertical_projection[x_pos] > gap_noise_floor
+ ):
+ if x_pos >= blob_safety_limit:
+ x_pos = first_gap_x # Revert
+ break
+ x_pos += 1
+ new_right = min(x_pos, scan_limit)
+ else:
+ new_right = first_gap_x
+
+ elif not found_gap:
+ # Fallback: No clear gap found.
+ # ROBUSTNESS CHECK: Is the "thinnest" point actually thin?
+ if min_density < safety_density_limit:
+ new_right = best_x
+ else:
+ # The thinnest point is still very dark (ink).
+ # Don't cut through a letter. Keep original guess or limit.
+ new_right = right
+
+ if direction == "rtl" or direction == "both": # Scan left
+ if left > 0:
+ scan_limit = max(0, left - max_scan_distance)
+ search_range = range(left, scan_limit, -1)
+
+ best_x = left
+ min_density = float("inf")
+ found_gap = False
+
+ for x in search_range:
+ density = vertical_projection[x]
+
+ if density <= gap_noise_floor:
+ new_left = x
+ found_gap = True
+ break
+
+ if density < min_density:
+ min_density = density
+ best_x = x
+
+ if not found_gap:
+ # ROBUSTNESS CHECK
+ if min_density < safety_density_limit:
+ new_left = best_x
+ else:
+ # Refuse to cut through dense ink
+ new_left = left
+
+ # --- Anchor to text: don't shrink past allowed fraction of initial width ---
+ new_right = max(new_right, min_right)
+ new_left = min(new_left, max_left)
+
+ # --- Directional de-overlapping ---
+ if direction == "ltr":
+ if new_left < last_corrected_right_edge:
+ new_left = last_corrected_right_edge
+ if new_right <= new_left:
+ new_right = new_left + 1
+ last_corrected_right_edge = new_right
+ else: # rtl
+ if new_right > next_corrected_left_edge:
+ new_right = next_corrected_left_edge
+ if new_left >= new_right:
+ new_left = new_right - 1
+ next_corrected_left_edge = new_left
+
+ box["left"] = new_left
+ box["width"] = max(1, new_right - new_left)
+
+ return refined_boxes
+
+ def refine_words_bidirectional(
+ self,
+ line_data: Dict[str, List],
+ line_image: np.ndarray,
+ ) -> Dict[str, List]:
+ """
+ Refines boxes using a robust bidirectional scan.
+ DIFFERENCE FROM MAIN SEGMENTER: Uses aggressive smoothing and horizontal
+ smearing to force-merge characters, prioritizing word separation over
+ character detail.
+ """
+ if line_image is None:
+ return line_data
+
+ # Handle grayscale (2D) or BGR (3D) line images
+ if len(line_image.shape) == 2:
+ gray = np.ascontiguousarray(line_image)
+ else:
+ gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
+ img_h, img_w = gray.shape[:2]
+
+ # OpenCV GaussianBlur(5,5) and later adaptiveThreshold need minimum dimensions.
+ # Avoid "Unknown C++ exception" on very small line crops (e.g. 1–4 px).
+ if img_h < 5 or img_w < 5:
+ return self.convert_line_to_word_level(line_data, img_w, img_h)
+
+ if line_data and line_data.get("text"):
+ words = line_data["text"][0].split()
+ if len(words) <= 1:
+ return self.convert_line_to_word_level(line_data, img_w, img_h)
+ else:
+ # No line text available: the scan below needs the word list, so fall back
+ # to the non-image-based conversion (empty input yields an empty result).
+ return self.convert_line_to_word_level(line_data, img_w, img_h)
+
+ # --- PRE-PROCESSING: The "Bulldozer" Approach ---
+ # 1. Gaussian Blur: Suppress high-frequency speckle noise that confuses the main segmenter
+ # We accept slight edge blurring for the sake of noise reduction.
+ # OpenCV can intermittently throw low-information C++ exceptions on some
+ # page crops (often due to dtype/range/nan/inf issues). If that happens,
+ # fall back to the non-image-based word conversion to keep OCR flowing.
+ try:
+ # Guard against NaN/Inf propagating into OpenCV internals.
+ if gray.dtype.kind in ("f", "c"):
+ gray = np.nan_to_num(gray, nan=0.0, posinf=255.0, neginf=0.0)
+
+ # GaussianBlur is most stable on uint8 or float32. If we have another
+ # dtype (e.g. int16/float64/object), normalize and cast.
+ if gray.dtype != np.uint8 and gray.dtype != np.float32:
+ # Normalize to 0..255 if range looks unusual.
+ gmin = float(np.min(gray)) if gray.size else 0.0
+ gmax = float(np.max(gray)) if gray.size else 255.0
+ if gmax > 255.0 or gmin < 0.0:
+ gray = cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX)
+ gray = np.clip(gray, 0, 255).astype(np.uint8)
+
+ blurred_gray = cv2.GaussianBlur(gray, (5, 5), 0)
+ except Exception:
+ return self.convert_line_to_word_level(line_data, img_w, img_h)
+
+ # 2. Aggressive Thresholding
+ # We use a larger block size here to be less sensitive to local texture variations
+ block_size = max(25, int(img_h * 0.5))
+ if block_size % 2 == 0:
+ block_size += 1
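+ # Example (hypothetical img_h = 40): block_size = max(25, 20) = 25, already odd;
+ # adaptiveThreshold requires an odd block size >= 3, hence the +1 nudge whenever
+ # the computed value is even.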
+
+ binary = cv2.adaptiveThreshold(
+ blurred_gray,
+ 255,
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+ cv2.THRESH_BINARY_INV,
+ block_size,
+ 10,
+ )
+
+ # 3. Horizontal Smearing (The critical difference)
+ # We intentionally smear mostly horizontally to bridge gaps inside noisy letters.
+ # Kernel width: ~20% of line height; kernel height: ~5%.
+ smear_w = max(3, int(img_h * 0.20))
+ smear_h = max(1, int(img_h * 0.05))
+ kernel_smear = cv2.getStructuringElement(cv2.MORPH_RECT, (smear_w, smear_h))
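+ # Example (hypothetical img_h = 40): the kernel is 8 x 2 px, so the closing step
+ # below bridges horizontal gaps narrower than ~8 px (breaks inside characters)
+ # while leaving the wider inter-word gaps open.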
+
+ # Apply Morphological Closing
+ binary_smeared = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_smear)
+
+ # Calculate projection on the SMEARED image
+ vertical_projection = np.sum(binary_smeared, axis=0)
+
+ # --- Setup for Scan ---
+ # Detect blobs to estimate character width for scan limiting
+ char_blobs = []
+ in_blob = False
+ blob_start = 0
+ for x, col_sum in enumerate(vertical_projection):
+ if col_sum > 0 and not in_blob:
+ blob_start = x
+ in_blob = True
+ elif col_sum == 0 and in_blob:
+ char_blobs.append((blob_start, x))
+ in_blob = False
+ if in_blob:
+ char_blobs.append((blob_start, img_w))
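+ # Illustrative example: a projection like [0, 0, 510, 765, 0, 0, 255, 0]
+ # yields char_blobs = [(2, 4), (6, 7)] (half-open column ranges of ink).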
+
+ if not char_blobs:
+ return self.convert_line_to_word_level(line_data, img_w, img_h)
+
+ total_chars = len("".join(words))
+ if total_chars > 0:
+ geom_avg_char_width = img_w / total_chars
+ else:
+ geom_avg_char_width = 10
+
+ blob_avg_char_width = np.mean([end - start for start, end in char_blobs])
+ safe_avg_char_width = min(blob_avg_char_width, geom_avg_char_width * 1.5)
+
+ # Scan distance parameters
+ max_scan_distance = max(int(safe_avg_char_width * 2.5), int(img_h * 0.6))
+ min_safe_box_width = max(4, int(safe_avg_char_width * 0.5))
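+ # Example (hypothetical safe_avg_char_width = 8, img_h = 40):
+ # max_scan_distance = max(20, 24) = 24 px and min_safe_box_width = 4 px.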
+
+ # --- Standard Logic Continues ---
+ # Use proportional estimation only (no vertical_projection) so initial boxes
+ # are driven by text/character spacing. Image-based gap anchoring on noisy
+ # images can produce tiny slices; refinement will still run but is bounded.
+ estimated_data = self.convert_line_to_word_level(line_data, img_w, img_h)
+ if not estimated_data["text"]:
+ return estimated_data
+
+ initial_boxes = []
+ for i in range(len(estimated_data["text"])):
+ initial_boxes.append(
+ {
+ "text": estimated_data["text"][i],
+ "left": estimated_data["left"][i],
+ "top": estimated_data["top"][i],
+ "width": estimated_data["width"][i],
+ "height": estimated_data["height"][i],
+ "conf": estimated_data["conf"][i],
+ }
+ )
+
+ trailing_punctuation = [
+ _word_ends_with_punctuation(estimated_data["text"][j])
+ for j in range(len(estimated_data["text"]))
+ ]
+
+ # Run passes (ensure _run_single_pass uses the robust gap logic)
+ ltr_boxes = self._run_single_pass(
+ initial_boxes,
+ vertical_projection,
+ max_scan_distance,
+ img_w,
+ img_h,
+ "ltr",
+ trailing_punctuation,
+ )
+ rtl_boxes = self._run_single_pass(
+ initial_boxes,
+ vertical_projection,
+ max_scan_distance,
+ img_w,
+ img_h,
+ "rtl",
+ trailing_punctuation,
+ )
+
+ # Stitch the two passes: take each word's left edge from the LTR pass and its
+ # right edge from the RTL pass, enforcing a minimum width.
+ combined_boxes = [box.copy() for box in initial_boxes]
+ for i in range(len(combined_boxes)):
+ final_left = ltr_boxes[i]["left"]
+ rtl_right = rtl_boxes[i]["left"] + rtl_boxes[i]["width"]
+ combined_boxes[i]["left"] = final_left
+ combined_boxes[i]["width"] = max(min_safe_box_width, rtl_right - final_left)
+
+ for i in range(len(combined_boxes) - 1):
+ if combined_boxes[i + 1]["left"] <= combined_boxes[i]["left"]:
+ combined_boxes[i + 1]["left"] = (
+ combined_boxes[i]["left"] + min_safe_box_width
+ )
+
+ for i in range(len(combined_boxes) - 1):
+ curr = combined_boxes[i]
+ nxt = combined_boxes[i + 1]
+ gap_width = nxt["left"] - curr["left"]
+ curr["width"] = max(min_safe_box_width, gap_width)
+
+ final_output = {k: [] for k in estimated_data.keys()}
+ for box in combined_boxes:
+ # Always keep one box per word; enforce minimum width 1 for valid geometry
+ box_width = max(1, box["width"])
+ box["width"] = box_width
+ for key in final_output.keys():
+ final_output[key].append(box[key])
+
+ return final_output