Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase commited on about 2 hours ago

Commit

45a02da

0 Parent(s):

Sync: Updated CDK deployment with options for using ECS Express mode, direct run mode, and agent route

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.coveragerc +56 -0
.dockerignore +52 -0
.gitattributes +9 -0
.github/scripts/setup_test_data.py +320 -0
.github/workflow_README.md +183 -0
.github/workflows/archive_workflows/multi-os-test.yml +115 -0
.github/workflows/ci.yml +269 -0
.github/workflows/simple-test.yml +74 -0
.github/workflows/sync-pi-agent-space.yml +64 -0
.github/workflows/sync_to_hf.yml +54 -0
.github/workflows/sync_to_hf_zero_gpu.yml +59 -0
.gitignore +62 -0
AGENTS.md +113 -0
Dockerfile +232 -0
Dockerfile.pi +40 -0
MANIFEST.in +4 -0
README.md +344 -0
README_PYPI.md +328 -0
agent-redact/README.md +25 -0
agent-redact/pi-agent/.dockerignore +10 -0
agent-redact/pi-agent/.gitattributes +2 -0
agent-redact/pi-agent/Dockerfile +70 -0
agent-redact/pi-agent/README.md +45 -0
agent-redact/pi-agent/sync-manifest.txt +10 -0
agent-redact/pi-agent/sync_to_space.sh +42 -0
agent-redact/pi/agent/README.md +183 -0
agent-redact/pi/agent/models.json +31 -0
agent-redact/pi/agent/settings.json +32 -0
agent-redact/pi/bootstrap_pi_config.py +151 -0
agent-redact/pi/gradio_app.py +1769 -0
agent-redact/pi/output_files.py +316 -0
agent-redact/pi/pi_agent_config.py +715 -0
agent-redact/pi/pi_examples.py +180 -0
agent-redact/pi/pi_rpc_client.py +649 -0
agent-redact/pi/pi_session_usage.py +185 -0
agent-redact/pi/pi_workspace_skills.py +182 -0
agent-redact/pi/redaction_prompt.py +556 -0
agent-redact/pi/remote_redaction.py +104 -0
agent-redact/pi/session_logs.py +119 -0
agent-redact/pi/session_workspace.py +192 -0
agent-redact/pi/start.sh +26 -0
agent-redact/requirements_pi_agent.txt +35 -0
agent_routes.py +1167 -0
app.py +0 -0
cdk/__init__.py +0 -0
cdk/app.py +119 -0
cdk/cdk.json.example +7 -0
cdk/cdk_appregistry.py +69 -0
cdk/cdk_config.py +590 -0
cdk/cdk_functions.py +2448 -0

.coveragerc ADDED Viewed

	@@ -0,0 +1,56 @@

+[run]
+source = .
+omit =
+    */tests/*
+    */test/*
+    */__pycache__/*
+    */venv/*
+    */env/*
+    */build/*
+    */dist/*
+    */cdk/*
+    */docs/*
+    */example_data/*
+    */examples/*
+    */feedback/*
+    */logs/*
+    */old_code/*
+    */output/*
+    */tmp/*
+    */usage/*
+    */tld/*
+    */tesseract/*
+    */poppler/*
+    config*.py
+    setup.py
+    lambda_entrypoint.py
+    entrypoint.sh
+    cli_redact.py
+    load_dynamo_logs.py
+    load_s3_logs.py
+    *.spec
+    Dockerfile
+    *.qmd
+    *.md
+    *.txt
+    *.yml
+    *.yaml
+    *.json
+    *.csv
+    *.env
+    *.bat
+    *.ps1
+    *.sh
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    class .*\bProtocol\):
+    @(abc\.)?abstractmethod

.dockerignore ADDED Viewed

	@@ -0,0 +1,52 @@

+*.url
+*.ipynb
+*.pyc
+*.qmd
+_quarto.yml
+quarto_site/*
+src/*
+redaction_deps/*
+.venv/*
+examples/*
+processing/*
+tools/__pycache__/*
+old_code/*
+tesseract/*
+poppler/*
+build/*
+dist/*
+docs/*
+.pi/*
+build_deps/*
+user_guide/*
+_extensions/*
+workspace/*
+doc_redaction.egg-info/*
+.venv_pypi_test/*
+cdk/config/*
+tld/*
+cdk/config/*
+cdk/cdk.out/*
+cdk/archive/*
+cdk.json
+cdk.context.json
+.quarto/*
+logs/
+output/
+input/
+feedback/
+config/
+usage/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*
+model_cache/*
+sanitized_file/*
+src/doc_redaction.egg-info/*
+docker_compose/*
+skills/example_prompts/*

.gitattributes ADDED Viewed

	@@ -0,0 +1,9 @@

+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.sh text eol=lf
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.xls filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
+*.docx filter=lfs diff=lfs merge=lfs -text
+*.doc filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.ico filter=lfs diff=lfs merge=lfs -text

.github/scripts/setup_test_data.py ADDED Viewed

	@@ -0,0 +1,320 @@

+#!/usr/bin/env python3
+"""
+Setup script for GitHub Actions test data.
+Creates dummy test files when example data is not available.
+"""
+import os
+import sys
+import pandas as pd
+def create_directories():
+    """Create necessary directories."""
+    dirs = ["doc_redaction/example_data", "doc_redaction/example_data/example_outputs"]
+    for dir_path in dirs:
+        os.makedirs(dir_path, exist_ok=True)
+        print(f"Created directory: {dir_path}")
+def create_dummy_pdf():
+    """Create dummy PDFs for testing."""
+    # Install reportlab if not available
+    try:
+        from reportlab.lib.pagesizes import letter
+        from reportlab.pdfgen import canvas
+    except ImportError:
+        import subprocess
+        subprocess.check_call(["pip", "install", "reportlab"])
+        from reportlab.lib.pagesizes import letter
+        from reportlab.pdfgen import canvas
+    try:
+        # Create the main test PDF
+        pdf_path = "doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
+        print(f"Creating PDF: {pdf_path}")
+        print(f"Directory exists: {os.path.exists('doc_redaction/example_data')}")
+        c = canvas.Canvas(pdf_path, pagesize=letter)
+        c.drawString(100, 750, "This is a test document for redaction testing.")
+        c.drawString(100, 700, "Email: test@example.com")
+        c.drawString(100, 650, "Phone: 123-456-7890")
+        c.drawString(100, 600, "Name: John Doe")
+        c.drawString(100, 550, "Address: 123 Test Street, Test City, TC 12345")
+        c.showPage()
+        # Add second page
+        c.drawString(100, 750, "Second page content")
+        c.drawString(100, 700, "More test data: jane.doe@example.com")
+        c.drawString(100, 650, "Another phone: 987-654-3210")
+        c.save()
+        print(f"Created dummy PDF: {pdf_path}")
+        # Create Partnership Agreement Toolkit PDF
+        partnership_pdf_path = (
+            "doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf"
+        )
+        print(f"Creating PDF: {partnership_pdf_path}")
+        c = canvas.Canvas(partnership_pdf_path, pagesize=letter)
+        c.drawString(100, 750, "Partnership Agreement Toolkit")
+        c.drawString(100, 700, "This is a test partnership agreement document.")
+        c.drawString(100, 650, "Contact: partnership@example.com")
+        c.drawString(100, 600, "Phone: (555) 123-4567")
+        c.drawString(100, 550, "Address: 123 Partnership Street, City, State 12345")
+        c.showPage()
+        # Add second page
+        c.drawString(100, 750, "Page 2 - Partnership Details")
+        c.drawString(100, 700, "More partnership information here.")
+        c.drawString(100, 650, "Contact: info@partnership.org")
+        c.showPage()
+        # Add third page
+        c.drawString(100, 750, "Page 3 - Terms and Conditions")
+        c.drawString(100, 700, "Terms and conditions content.")
+        c.drawString(100, 650, "Legal contact: legal@partnership.org")
+        c.save()
+        print(f"Created dummy PDF: {partnership_pdf_path}")
+        # Create Graduate Job Cover Letter PDF
+        cover_letter_pdf_path = (
+            "doc_redaction/example_data/graduate-job-example-cover-letter.pdf"
+        )
+        print(f"Creating PDF: {cover_letter_pdf_path}")
+        c = canvas.Canvas(cover_letter_pdf_path, pagesize=letter)
+        c.drawString(100, 750, "Cover Letter Example")
+        c.drawString(100, 700, "Dear Hiring Manager,")
+        c.drawString(100, 650, "I am writing to apply for the position.")
+        c.drawString(100, 600, "Contact: applicant@example.com")
+        c.drawString(100, 550, "Phone: (555) 987-6543")
+        c.drawString(100, 500, "Address: 456 Job Street, Employment City, EC 54321")
+        c.drawString(100, 450, "Sincerely,")
+        c.drawString(100, 400, "John Applicant")
+        c.save()
+        print(f"Created dummy PDF: {cover_letter_pdf_path}")
+    except ImportError:
+        print("ReportLab not available, skipping PDF creation")
+        # Create simple text files instead
+        with open(
+            "doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
+            "w",
+        ) as f:
+            f.write("This is a dummy PDF file for testing")
+        with open(
+            "doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf",
+            "w",
+        ) as f:
+            f.write("This is a dummy Partnership Agreement PDF file for testing")
+        with open(
+            "doc_redaction/example_data/graduate-job-example-cover-letter.pdf",
+            "w",
+        ) as f:
+            f.write("This is a dummy cover letter PDF file for testing")
+        print("Created dummy text files instead of PDFs")
+def create_dummy_csv():
+    """Create dummy CSV files for testing."""
+    # Main CSV
+    csv_data = {
+        "Case Note": [
+            "Client visited for consultation regarding housing issues",
+            "Follow-up appointment scheduled for next week",
+            "Documentation submitted for review",
+        ],
+        "Client": ["John Smith", "Jane Doe", "Bob Johnson"],
+        "Date": ["2024-01-15", "2024-01-16", "2024-01-17"],
+    }
+    df = pd.DataFrame(csv_data)
+    df.to_csv("doc_redaction/example_data/combined_case_notes.csv", index=False)
+    print("Created dummy CSV: doc_redaction/example_data/combined_case_notes.csv")
+    # Lambeth CSV
+    lambeth_data = {
+        "text": [
+            "Lambeth 2030 vision document content",
+            "Our Future Our Lambeth strategic plan",
+            "Community engagement and development",
+        ],
+        "page": [1, 2, 3],
+    }
+    df_lambeth = pd.DataFrame(lambeth_data)
+    df_lambeth.to_csv(
+        "doc_redaction/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv",
+        index=False,
+    )
+    print(
+        "Created dummy CSV: doc_redaction/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
+    )
+def create_dummy_word_doc():
+    """Create dummy Word document."""
+    try:
+        from docx import Document
+        doc = Document()
+        doc.add_heading("Test Document for Redaction", 0)
+        doc.add_paragraph("This is a test document for redaction testing.")
+        doc.add_paragraph("Contact Information:")
+        doc.add_paragraph("Email: test@example.com")
+        doc.add_paragraph("Phone: 123-456-7890")
+        doc.add_paragraph("Name: John Doe")
+        doc.add_paragraph("Address: 123 Test Street, Test City, TC 12345")
+        doc.save(
+            "doc_redaction/example_data/Bold minimalist professional cover letter.docx"
+        )
+        print("Created dummy Word document")
+    except ImportError:
+        print("python-docx not available, skipping Word document creation")
+def create_allow_deny_lists():
+    """Create dummy allow/deny lists."""
+    # Allow lists
+    allow_data = {"word": ["test", "example", "document"]}
+    pd.DataFrame(allow_data).to_csv(
+        "doc_redaction/example_data/test_allow_list_graduate.csv", index=False
+    )
+    pd.DataFrame(allow_data).to_csv(
+        "doc_redaction/example_data/test_allow_list_partnership.csv", index=False
+    )
+    print("Created allow lists")
+    # Deny lists
+    deny_data = {"word": ["sensitive", "confidential", "private"]}
+    pd.DataFrame(deny_data).to_csv(
+        "doc_redaction/example_data/partnership_toolkit_redact_custom_deny_list.csv",
+        index=False,
+    )
+    pd.DataFrame(deny_data).to_csv(
+        "doc_redaction/example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
+        index=False,
+    )
+    print("Created deny lists")
+    # Whole page redaction list
+    page_data = {"page": [1, 2]}
+    pd.DataFrame(page_data).to_csv(
+        "doc_redaction/example_data/partnership_toolkit_redact_some_pages.csv",
+        index=False,
+    )
+    print("Created whole page redaction list")
+def create_ocr_output():
+    """Create dummy OCR output CSV."""
+    ocr_data = {
+        "page": [1, 2, 3],
+        "text": [
+            "This is page 1 content with some text",
+            "This is page 2 content with different text",
+            "This is page 3 content with more text",
+        ],
+        "left": [0.1, 0.3, 0.5],
+        "top": [0.95, 0.92, 0.88],
+        "width": [0.05, 0.02, 0.02],
+        "height": [0.01, 0.02, 0.02],
+        "line": [1, 2, 3],
+    }
+    df = pd.DataFrame(ocr_data)
+    df.to_csv(
+        "doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
+        index=False,
+    )
+    print("Created dummy OCR output CSV")
+def create_dummy_image():
+    """Create dummy image for testing."""
+    try:
+        from PIL import Image, ImageDraw, ImageFont
+        img = Image.new("RGB", (800, 600), color="white")
+        draw = ImageDraw.Draw(img)
+        # Try to use a system font
+        try:
+            font = ImageFont.truetype(
+                "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20
+            )
+        except Exception as e:
+            print(f"Error loading DejaVuSans font: {e}")
+            try:
+                font = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 20)
+            except Exception as e:
+                print(f"Error loading Arial font: {e}")
+                font = ImageFont.load_default()
+        # Add text to image
+        draw.text((50, 50), "Test Document for Redaction", fill="black", font=font)
+        draw.text((50, 100), "Email: test@example.com", fill="black", font=font)
+        draw.text((50, 150), "Phone: 123-456-7890", fill="black", font=font)
+        draw.text((50, 200), "Name: John Doe", fill="black", font=font)
+        draw.text((50, 250), "Address: 123 Test Street", fill="black", font=font)
+        img.save("doc_redaction/example_data/example_complaint_letter.jpg")
+        print("Created dummy image")
+    except ImportError:
+        print("PIL not available, skipping image creation")
+def main():
+    """Main setup function."""
+    print("Setting up test data for GitHub Actions...")
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"Python version: {sys.version}")
+    create_directories()
+    create_dummy_pdf()
+    create_dummy_csv()
+    create_dummy_word_doc()
+    create_allow_deny_lists()
+    create_ocr_output()
+    create_dummy_image()
+    print("\nTest data setup complete!")
+    print("Created files:")
+    for root, dirs, files in os.walk("doc_redaction/example_data"):
+        for file in files:
+            file_path = os.path.join(root, file)
+            print(f"  {file_path}")
+            # Verify the file exists and has content
+            if os.path.exists(file_path):
+                file_size = os.path.getsize(file_path)
+                print(f"    Size: {file_size} bytes")
+            else:
+                print("    WARNING: File does not exist!")
+    # Verify critical files exist
+    critical_files = [
+        "doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf",
+        "doc_redaction/example_data/graduate-job-example-cover-letter.pdf",
+        "doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
+    ]
+    print("\nVerifying critical test files:")
+    for file_path in critical_files:
+        if os.path.exists(file_path):
+            file_size = os.path.getsize(file_path)
+            print(f"✅ {file_path} exists ({file_size} bytes)")
+        else:
+            print(f"❌ {file_path} MISSING!")
+if __name__ == "__main__":
+    main()

.github/workflow_README.md ADDED Viewed

	@@ -0,0 +1,183 @@

+# GitHub Actions CI/CD Setup
+This directory contains GitHub Actions workflows for automated testing of the CLI redaction application.
+## Workflows Overview
+### 1. **Simple Test Run** (`.github/workflows/simple-test.yml`)
+- **Purpose**: Basic test execution
+- **Triggers**: Push to dev, pull requests targeting dev, manual dispatch
+- **OS**: Ubuntu Latest
+- **Python**: 3.12
+- **Features**:
+  - Installs system dependencies
+  - Sets up test data
+  - Runs CLI tests
+  - Runs pytest
+### 2. **Comprehensive CI/CD** (`.github/workflows/ci.yml`)
+- **Purpose**: Full CI/CD pipeline
+- **Features**:
+  - Linting (Ruff, Black)
+  - Unit tests (Python 3.11, 3.12, 3.13)
+  - Integration tests
+  - Security scanning (Bandit; the Safety scan is currently disabled because it requires a login)
+  - Coverage reporting
+  - Package building (on main branch)
+### 3. **Multi-OS Testing** (`.github/workflows/archive_workflows/multi-os-test.yml`)
+- **Purpose**: Cross-platform testing (archived workflow)
+- **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
+- **Python**: 3.11, 3.12, 3.13
+- **Features**: Tests compatibility across different operating systems
+### 4. **Basic Test Suite** (`.github/workflows/test.yml`)
+- **Purpose**: Original test workflow
+- **Features**:
+  - Multiple Python versions
+  - System dependency installation
+  - Test data creation
+  - Coverage reporting
+## Setup Scripts
+### Test Data Setup (`.github/scripts/setup_test_data.py`)
+Creates dummy test files when example data is not available:
+- PDF documents
+- CSV files
+- Word documents
+- Images
+- Allow/deny lists
+- OCR output files
+## Usage
+### Running Tests Locally
+```bash
+# Install dependencies
+pip install -r requirements.txt
+pip install pytest pytest-cov
+# Setup test data
+python .github/scripts/setup_test_data.py
+# Run tests
+cd test
+python cli_epilog_suite.py
+```
+### GitHub Actions Triggers
+1. **Push to main/dev**: Runs all tests
+2. **Pull Request**: Runs tests and linting
+3. **Daily Schedule**: Runs tests at 2 AM UTC (the schedule trigger is currently commented out in `ci.yml`)
+4. **Manual Trigger**: Can be triggered manually from GitHub
+## Configuration
+### Environment Variables
+- `PYTHON_VERSION`: Default Python version (3.11)
+- `PYTHONPATH`: Set automatically for test discovery
+### Caching
+- Pip dependencies are cached for faster builds
+- Cache key based on the hash of `requirements_lightweight.txt`
+### Artifacts
+- Test results (JUnit XML)
+- Coverage reports (HTML, XML)
+- Security reports
+- Build artifacts (on main branch)
+## Test Data
+The workflows automatically create test data when example files are missing:
+### Required Files Created:
+- `doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf`
+- `doc_redaction/example_data/combined_case_notes.csv`
+- `doc_redaction/example_data/Bold minimalist professional cover letter.docx`
+- `doc_redaction/example_data/example_complaint_letter.jpg`
+- `doc_redaction/example_data/test_allow_list_*.csv`
+- `doc_redaction/example_data/partnership_toolkit_redact_*.csv`
+- `doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv`
+### Dependencies Installed:
+- **System**: tesseract-ocr, poppler-utils, OpenGL libraries
+- **Python**: All `requirements_lightweight.txt` packages (`requirements.txt` in the archived multi-OS workflow) + pytest, reportlab, pillow
+## Workflow Status
+### Success Criteria:
+- ✅ All tests pass
+- ✅ No linting errors
+- ✅ Security checks pass
+- ✅ Coverage meets threshold (if configured)
+### Failure Handling:
+- Tests are designed to skip gracefully if files are missing
+- AWS tests are expected to fail without credentials
+- System dependency failures are handled with fallbacks
+## Customization
+### Adding New Tests:
+1. Add test methods to `test/cli_epilog_suite.py` or pytest files under `test/test_*.py`
+2. Update test data in `setup_test_data.py` if needed
+3. Tests will automatically run in all workflows
+### Modifying Workflows:
+1. Edit the appropriate `.yml` file
+2. Test locally first
+3. Push to trigger the workflow
+### Environment-Specific Settings:
+- **Ubuntu**: Full system dependencies
+- **Windows**: Python packages only
+- **macOS**: Homebrew dependencies
+## Troubleshooting
+### Common Issues:
+1. **Missing Dependencies**:
+   - Check system dependency installation
+   - Verify Python package versions
+2. **Test Failures**:
+   - Check test data creation
+   - Verify file paths
+   - Review test output logs
+3. **AWS Test Failures**:
+   - Expected without credentials
+   - Tests are designed to handle this gracefully
+4. **System Dependency Issues**:
+   - Different OS have different requirements
+   - Check the specific OS section in workflows
+### Debug Mode:
+Add `--verbose` or `-v` flags to pytest commands for more detailed output.
+## Security
+- Dependency scanning with Safety is currently disabled (the scan now requires a login)
+- Code is scanned with Bandit
+- No secrets are exposed in logs
+- Test data is temporary and cleaned up
+## Performance
+- Tests run in parallel where possible
+- Dependencies are cached
+- Only necessary system packages are installed
+- Test data is created efficiently
+## Monitoring
+- Workflow status is visible in GitHub Actions tab
+- Coverage reports (HTML, XML) are uploaded as workflow artifacts (the Codecov upload step is currently disabled)
+- Test results are available as artifacts
+- Security reports are generated and stored

.github/workflows/archive_workflows/multi-os-test.yml ADDED Viewed

	@@ -0,0 +1,115 @@

+name: Multi-OS Test
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+permissions:
+  contents: read
+  actions: read
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+    env:
+      SHOW_VLM_MODEL_OPTIONS: "False"
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest] # windows-latest, not included as tesseract cannot be installed silently
+        python-version: ["3.11", "3.12", "3.13"]
+        exclude:
+          # Exclude some combinations to reduce CI time.
+          # Each excluded combination needs its own entry: a list value here
+          # does not match the scalar python-version values of the matrix.
+          #- os: windows-latest
+          #  python-version: "3.12"
+          #- os: windows-latest
+          #  python-version: "3.13"
+          - os: macos-latest
+            python-version: "3.12"
+          - os: macos-latest
+            python-version: "3.13"
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install system dependencies (Ubuntu)
+      if: matrix.os == 'ubuntu-latest'
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y \
+          tesseract-ocr \
+          tesseract-ocr-eng \
+          poppler-utils \
+          libgl1-mesa-dri \
+          libglib2.0-0 \
+          libsm6 \
+          libxext6 \
+          libxrender-dev \
+          libgomp1
+    - name: Install system dependencies (macOS)
+      if: matrix.os == 'macos-latest'
+      run: |
+        brew install tesseract poppler
+    - name: Install system dependencies (Windows)
+      if: matrix.os == 'windows-latest'
+      run: |
+        # Create tools directory
+        if (!(Test-Path "C:\tools")) {
+            mkdir C:\tools
+        }
+        # Download and install Tesseract
+        $tesseractUrl = "https://github.com/tesseract-ocr/tesseract/releases/download/5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe"
+        $tesseractInstaller = "C:\tools\tesseract-installer.exe"
+        Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
+        # Install Tesseract silently
+        Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
+        # Download and extract Poppler
+        $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.07.0-0/Release-25.07.0-0.zip"
+        $popplerZip = "C:\tools\poppler.zip"
+        Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
+        # Extract Poppler
+        Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
+        # Add to PATH
+        echo "C:\tools\tesseract" >> $env:GITHUB_PATH
+        echo "C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_PATH
+        # Set environment variables for your application
+        echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
+        echo "POPPLER_FOLDER=C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_ENV
+        echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
+        # Verify installation using full paths (since PATH won't be updated in current session)
+        & "C:\tools\tesseract\tesseract.exe" --version
+        & "C:\tools\poppler\poppler-25.07.0\Library\bin\pdftoppm.exe" -v
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install pytest pytest-cov reportlab pillow
+    - name: Download spaCy model
+      run: |
+        python -m spacy download en_core_web_lg
+    - name: Setup test data
+      run: |
+        python .github/scripts/setup_test_data.py
+    - name: Run CLI tests
+      run: |
+        cd test
+        python cli_epilog_suite.py
+    - name: Run tests with pytest
+      run: |
+        pytest test/ -v --tb=short

.github/workflows/ci.yml ADDED Viewed

	@@ -0,0 +1,269 @@

+name: CI/CD Pipeline
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+  #schedule:
+  # Run tests daily at 2 AM UTC
+  #  - cron: '0 2 * * *'
+permissions:
+  contents: read
+  actions: read
+  pull-requests: write
+  issues: write
+env:
+  PYTHON_VERSION: "3.11"
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install ruff black
+    - name: Run Ruff linter
+      run: ruff check .
+    - name: Run Black formatter check
+      run: black --check .
+  test-unit:
+    runs-on: ubuntu-latest
+    env:
+      # Avoid optional VLM/torch import path in tools.run_vlm (not installed in lightweight CI deps)
+      SHOW_VLM_MODEL_OPTIONS: "False"
+    strategy:
+      matrix:
+        # Quoted so YAML keeps the versions as strings (an unquoted 3.10 would be read as 3.1)
+        python-version: ["3.11", "3.12", "3.13"]
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Cache pip dependencies
+      uses: actions/cache@v5
+      with:
+        path: ~/.cache/pip
+        key: ${{ runner.os }}-pip-${{ hashFiles('requirements_lightweight.txt') }}
+        restore-keys: |
+          ${{ runner.os }}-pip-
+    - name: Install system dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y \
+          tesseract-ocr \
+          tesseract-ocr-eng \
+          poppler-utils \
+          libgl1-mesa-dri \
+          libglib2.0-0 \
+          libsm6 \
+          libxext6 \
+          libxrender-dev \
+          libgomp1
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements_lightweight.txt
+        pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
+    - name: Download spaCy model
+      run: |
+        python -m spacy download en_core_web_lg
+    - name: Setup test data
+      run: |
+        python .github/scripts/setup_test_data.py
+        echo "Setup script completed. Checking results:"
+        ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
+    - name: Verify test data files
+      run: |
+        echo "Checking if critical test files exist:"
+        ls -la doc_redaction/example_data/
+        echo "Checking for specific PDF files:"
+        ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
+        echo "Checking file sizes:"
+        find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
+    - name: Clean up problematic config files
+      run: |
+        rm -f config*.py || true
+    - name: Run CLI tests
+      run: |
+        cd test
+        python cli_epilog_suite.py
+    - name: Run tests with pytest (JUnit and coverage)
+      run: |
+        pytest test/ -v --tb=short \
+          --junitxml=test-results.xml \
+          --cov=. --cov-config=.coveragerc \
+          --cov-report=xml --cov-report=html --cov-report=term
+    #- name: Upload coverage to Codecov - not necessary
+    #  uses: codecov/codecov-action@v3
+    #  if: matrix.python-version == '3.11'
+    #  with:
+    #    file: ./coverage.xml
+    #    flags: unittests
+    #    name: codecov-umbrella
+    #    fail_ci_if_error: false
+    - name: Upload test results
+      uses: actions/upload-artifact@v6
+      if: always()
+      with:
+        name: test-results-python-${{ matrix.python-version }}
+        path: |
+          test-results.xml
+          htmlcov/
+          coverage.xml
+  test-integration:
+    runs-on: ubuntu-latest
+    needs: [lint, test-unit]
+    env:
+      SHOW_VLM_MODEL_OPTIONS: "False"
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements_lightweight.txt
+        pip install pytest pytest-cov reportlab pillow
+    - name: Install system dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y \
+          tesseract-ocr \
+          tesseract-ocr-eng \
+          poppler-utils \
+          libgl1-mesa-dri \
+          libglib2.0-0 \
+          libsm6 \
+          libxext6 \
+          libxrender-dev \
+          libgomp1
+    - name: Download spaCy model
+      run: |
+        python -m spacy download en_core_web_lg
+    - name: Setup test data
+      run: |
+        python .github/scripts/setup_test_data.py
+        echo "Setup script completed. Checking results:"
+        ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
+    - name: Verify test data files
+      run: |
+        echo "Checking if critical test files exist:"
+        ls -la doc_redaction/example_data/
+        echo "Checking for specific PDF files:"
+        ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
+        echo "Checking file sizes:"
+        find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
+    - name: Run integration tests
+      run: |
+        cd test
+        python demo_single_test.py
+    - name: Test CLI help
+      run: |
+        python cli_redact.py --help
+    - name: Test CLI version
+      run: |
+        python -c "import sys; print(f'Python {sys.version}')"
+  security:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install safety bandit
+    #- name: Run safety scan - removed as now requires login
+    #  run: |
+    #    safety scan -r requirements.txt
+    - name: Run bandit security check
+      run: |
+        bandit -r . -f json -o bandit-report.json || true
+    - name: Upload security report
+      uses: actions/upload-artifact@v6
+      if: always()
+      with:
+        name: security-report
+        path: bandit-report.json
+  # Builds the Python distribution and validates it with twine.
+  # Gated twice: it waits for the lint and test-unit jobs, and it only runs
+  # for push events on the main branch (not for pull requests or other branches).
+  build:
+    runs-on: ubuntu-latest
+    needs: [lint, test-unit]
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        # PYTHON_VERSION is not defined in this job; presumably a workflow-level env value (confirm).
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install build dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build twine
+    - name: Build package
+      run: |
+        python -m build
+    - name: Check package
+      run: |
+        twine check dist/*
+    # Keep the built files from dist/ as a downloadable workflow artifact named "dist".
+    - name: Upload build artifacts
+      uses: actions/upload-artifact@v6
+      with:
+        name: dist
+        path: dist/

.github/workflows/simple-test.yml ADDED Viewed

	@@ -0,0 +1,74 @@

+# Single-job test workflow for the dev branch: installs OS and Python dependencies,
+# generates test data, then runs the CLI suite and pytest.
+name: Simple Test Run
+on:
+  push:
+    branches: [ dev ]
+  pull_request:
+    branches: [ dev ]
+  # Also allow manual runs from the Actions tab.
+  workflow_dispatch:
+permissions:
+  contents: read
+  actions: read
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    env:
+      # NOTE(review): presumably read by the app's configuration to hide VLM model options in CI — confirm.
+      SHOW_VLM_MODEL_OPTIONS: "False"
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+    - name: Install system dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y \
+          tesseract-ocr \
+          tesseract-ocr-eng \
+          poppler-utils \
+          libgl1-mesa-dri \
+          libglib2.0-0 \
+          libsm6 \
+          libxext6 \
+          libxrender-dev \
+          libgomp1
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements_lightweight.txt
+        pip install pytest pytest-cov reportlab pillow
+    - name: Download spaCy model
+      run: |
+        python -m spacy download en_core_web_lg
+    - name: Setup test data
+      run: |
+        python .github/scripts/setup_test_data.py
+        echo "Setup script completed. Checking results:"
+        # The `|| echo` fallback keeps this listing from failing the step when the directory is absent.
+        ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
+    - name: Verify test data files
+      run: |
+        echo "Checking if critical test files exist:"
+        # No fallback here: a missing directory makes `ls` exit non-zero.
+        # NOTE(review): with GitHub's default `bash -e` shell that should fail the step — confirm.
+        ls -la doc_redaction/example_data/
+        echo "Checking for specific PDF files:"
+        ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
+        echo "Checking file sizes:"
+        find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
+    - name: Run CLI tests
+      run: |
+        # The suite is launched with test/ as the working directory.
+        cd test
+        python cli_epilog_suite.py
+    - name: Run tests with pytest
+      run: |
+        pytest test/ -v --tb=short

.github/workflows/sync-pi-agent-space.yml ADDED Viewed

	@@ -0,0 +1,64 @@

+# Publishes the Pi agent as a flattened, single-commit tree to its Hugging Face Space
+# whenever agent-related paths change on dev (or on manual dispatch).
+name: Sync Pi agent to Hugging Face Space
+on:
+  push:
+    branches: [dev]
+    paths:
+      - "agent-redact/**"
+      - "skills/**"
+      - "tools/**"
+      - "intros/**"
+      - "doc_redaction/example_data/**"
+      - "AGENTS.md"
+      - "config/**"
+      - ".github/workflows/sync-pi-agent-space.yml"
+  workflow_dispatch:
+permissions:
+  contents: read
+jobs:
+  sync-pi-agent-space:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 1
+          lfs: true
+      - name: Install Git LFS
+        run: git lfs install
+      - name: Materialize example PDFs (Git LFS)
+        run: |
+          git lfs pull --include="doc_redaction/example_data/*.pdf"
+          for f in \
+            doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
+            doc_redaction/example_data/graduate-job-example-cover-letter.pdf; do
+            # A missing or empty file would otherwise slip through: `head` fails, `grep`
+            # finds no pointer header, and the pointer check below evaluates to false.
+            if [ ! -s "$f" ]; then
+              echo "Example PDF is missing or empty: $f" >&2
+              exit 1
+            fi
+            # An un-materialized LFS object is a small text pointer starting with this header.
+            if head -1 "$f" | grep -q "^version https://git-lfs.github.com/spec/v1"; then
+              echo "Example PDF is still an LFS pointer (not materialized): $f" >&2
+              exit 1
+            fi
+          done
+      - name: Flatten Pi agent Space tree
+        run: |
+          chmod +x agent-redact/pi-agent/sync_to_space.sh
+          agent-redact/pi-agent/sync_to_space.sh /tmp/pi-agent-space
+      - name: Push to Hugging Face Space
+        run: |
+          # Read the commit message from the source checkout before changing directory.
+          COMMIT_MSG=$(git log -1 --pretty=%B)
+          echo "Syncing Pi agent Space: seanpedrickcase/agentic_document_redaction"
+          cd /tmp/pi-agent-space
+          # Fresh repository: the Space receives one commit and its history is overwritten.
+          git init -b main
+          git config user.name "$HF_USERNAME"
+          git config user.email "$HF_EMAIL"
+          git add .
+          git commit -m "Sync Pi agent Space: $COMMIT_MSG"
+          git remote add hf "https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/agentic_document_redaction"
+          git push --force hf main
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_USERNAME: ${{ secrets.HF_USERNAME }}
+          HF_EMAIL: ${{ secrets.HF_EMAIL }}

.github/workflows/sync_to_hf.yml ADDED Viewed

	@@ -0,0 +1,54 @@

+# Mirrors the dev branch to a Hugging Face Space as a single force-pushed commit.
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [dev]
+  workflow_dispatch:
+permissions:
+  contents: read
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 1      # Only get the latest state
+          lfs: true           # Download actual LFS files so they can be pushed
+      - name: Install Git LFS
+        run: git lfs install
+      - name: Recreate repo history (single-commit force push)
+        run: |
+          # 1. Capture the message BEFORE we delete the .git folder
+          COMMIT_MSG=$(git log -1 --pretty=%B)
+          echo "Syncing commit message: $COMMIT_MSG"
+          # 2. DELETE the .git folder.
+          # This turns the repo into a standard folder of files.
+          rm -rf .git
+          # 3. Re-initialize a brand new git repo
+          git init -b main
+          git config --global user.name "$HF_USERNAME"
+          git config --global user.email "$HF_EMAIL"
+          # 4. Re-install LFS (needs to be done after git init)
+          git lfs install
+          # 5. Add the remote
+          # The URL is quoted so the expanded secrets are never word-split or glob-expanded by the shell.
+          git remote add hf "https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/${HF_REPO_ID}"
+          # 6. Add all files
+          # Since this is a fresh init, Git sees EVERY file as "New"
+          git add .
+          # 7. Commit and Force Push
+          git commit -m "Sync: $COMMIT_MSG"
+          git push --force hf main
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_USERNAME: ${{ secrets.HF_USERNAME }}
+          HF_EMAIL: ${{ secrets.HF_EMAIL }}
+          HF_REPO_ID: ${{ secrets.HF_REPO_ID }}

.github/workflows/sync_to_hf_zero_gpu.yml ADDED Viewed

	@@ -0,0 +1,59 @@

+# Mirrors the dev branch to the Zero GPU Hugging Face Space as a single force-pushed commit,
+# after patching README.md front matter for that Space.
+name: Sync to Hugging Face hub Zero GPU
+on:
+  push:
+    branches: [dev]
+  workflow_dispatch:
+permissions:
+  contents: read
+jobs:
+  sync-to-hub-zero-gpu:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 1      # Only get the latest state
+          lfs: true           # Download actual LFS files so they can be pushed
+      - name: Install Git LFS
+        run: git lfs install
+      # HF Spaces read Space config from README.md front matter. The repo README
+      # targets GitHub (e.g. docker); patch only this CI checkout before HF push.
+      - name: Apply HF Zero GPU Space README front matter
+        run: python3 tools/apply_hf_zero_gpu_readme_frontmatter.py
+      - name: Recreate repo history (single-commit force push)
+        run: |
+          # 1. Capture the message BEFORE we delete the .git folder
+          COMMIT_MSG=$(git log -1 --pretty=%B)
+          echo "Syncing commit message: $COMMIT_MSG"
+          # 2. DELETE the .git folder.
+          # This turns the repo into a standard folder of files.
+          rm -rf .git
+          # 3. Re-initialize a brand new git repo
+          git init -b main
+          git config --global user.name "$HF_USERNAME"
+          git config --global user.email "$HF_EMAIL"
+          # 4. Re-install LFS (needs to be done after git init)
+          git lfs install
+          # 5. Add the remote
+          # The URL is quoted so the expanded secrets are never word-split or glob-expanded by the shell.
+          git remote add hf "https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/${HF_REPO_ID_ZERO_GPU}"
+          # 6. Add all files
+          # Since this is a fresh init, Git sees EVERY file as "New"
+          git add .
+          # 7. Commit and Force Push
+          git commit -m "Sync: $COMMIT_MSG"
+          git push --force hf main
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_USERNAME: ${{ secrets.HF_USERNAME }}
+          HF_EMAIL: ${{ secrets.HF_EMAIL }}
+          HF_REPO_ID_ZERO_GPU: ${{ secrets.HF_REPO_ID_ZERO_GPU }}

.gitignore ADDED Viewed

	@@ -0,0 +1,62 @@

+*.url
+*.ipynb
+*.pyc
+*.qmd
+_quarto.yml
+quarto_site/*
+src/*
+redaction_deps/*
+.venv/*
+examples/*
+processing/*
+input/*
+output/*
+tools/__pycache__/*
+old_code/*
+tesseract/*
+poppler/*
+build/*
+dist/*
+build_deps/*
+logs/*
+usage/*
+feedback/*
+# Ignore the contents of config/ rather than the directory itself: git cannot
+# re-include a file whose parent directory is excluded, and the example env
+# templates below must stay tracked.
+config/*
+!config/pi_agent.env.example
+!config/docker_app_config.env.example
+!config/app_config.env.example
+workspace/*
+user_guide/*
+_extensions/*
+doc_redaction.egg-info/*
+.venv_pypi_test/*
+cdk/config/*
+cdk/cdk.out/*
+cdk/archive/*
+tld/*
+tmp/*
+docs/*
+.pi/*
+cdk.out/*
+cdk.json
+cdk.context.json
+precheck.context.json
+.quarto/*
+/.quarto/
+/_site/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*
+model_cache/*
+sanitized_file/*
+src/doc_redaction.egg-info/*
+docker_compose/*
+**/*.quarto_ipynb
+skills/example_prompts/*
+.pi/sessions/
+agent-redact/pi/agent/sessions/

AGENTS.md ADDED Viewed

	@@ -0,0 +1,113 @@

+# AGENTS.md
+Context for AI coding agents working on **doc_redaction** (PII redaction for PDFs, images, Word, and tabular files). Human-oriented docs: [README.md](README.md). User guide: [doc_redaction user guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
+## Project overview
+- **Stack**: Python 3.10+, Gradio UI ([app.py](app.py)), optional FastAPI when `RUN_FASTAPI` is enabled, AWS/LLM integrations via [tools/config.py](tools/config.py) and env files under `config/`.
+- **License**: AGPL-3.0-only (see [pyproject.toml](pyproject.toml)). Respect license terms when adding dependencies.
+- **Accuracy**: Outputs are not guaranteed complete; downstream use should assume **human review** of redacted material.
+## Cursor skills: redaction workflow (optional)
+For agents operating the deployed app (Gradio Client, review CSV, `/review_apply`), these repo-local playbooks are a suggested ladder:
+0. **[`skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md`](skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md)** — copy-paste user task prompt (Pass 1 default, Pass 2 gated); **user redaction requirements go at the end of the prompt**.
+1. **[`skills/doc-redaction-app/SKILL.md`](skills/doc-redaction-app/SKILL.md)** — first-pass redaction (`/doc_redact` / `/redact_document`) and downloading artifacts.
+2. **[`skills/doc-redact-page-review/SKILL.md`](skills/doc-redact-page-review/SKILL.md)** — after outputs exist: **parallel per-page** child agents, merge into one full-document `*_review_file.csv`, **single** `/review_apply` from the parent.
+3. **[`skills/doc-redaction-modifications/SKILL.md`](skills/doc-redaction-modifications/SKILL.md)** — CSV mechanics, `preview_redaction_boxes`, `/review_apply` patterns, verification, VLM and PyMuPDF fallbacks (single-thread edits and the **technical** reference for page-review children).
+## Setup
+1. **System**: Install **Tesseract** and **Poppler** (required for OCR/PDF). See [README.md](README.md) (Windows/Linux sections).
+2. **Python**: Create a venv, then install the project (e.g. `pip install -e ".[dev]"` or follow README).
+3. **Configuration**: Copy or edit environment/config as described in README / `config/` (e.g. `app_config.env`). Do not commit secrets.
+## Run locally
+- Gradio/FastAPI entrypoint is [app.py](app.py). With FastAPI enabled, typical pattern is `uvicorn app:app --host 0.0.0.0 --port 7860` (exact host/port from your config).
+- OpenAPI docs: `/docs` when the FastAPI app is mounted.
+## Tests
+- Run from repo root: `pytest` (optional: `pytest test/`).
+- Fix failures related to your changes before opening a PR.
+## Line order (local OCR and simple text extraction)
+Multi-column layouts use shared logic in [`tools/ocr_reading_order.py`](tools/ocr_reading_order.py). Controlled by **`LOCAL_OCR_READING_ORDER`** (`column` default, `legacy` for previous top-left behaviour).
+### Local OCR (Paddle/Tesseract)
+Word boxes are merged into line-level CSV rows in [`combine_ocr_results`](tools/custom_image_analyser_engine.py).
+- **`column`**: detect text columns, assign line numbers down each column left-to-right; full-width lines (headers) first. Stops cross-column merging that produced wide erroneous lines on multi-column PDFs. **Auto-fallback**: the page is treated as single-column unless a *consecutive cluster* of gutter rows (y-gap between adjacent rows ≤ `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, default `0.06` of page height) has ≥ `OCR_COLUMN_MIN_GUTTER_ROWS` (default `3`) rows **and** the cluster's topmost row is above the footer zone (`OCR_COLUMN_FOOTER_ZONE_FRACTION`, default `0.75`). This prevents isolated header bands (logo | title, 1 gutter row), signature-only blocks at the page bottom (cluster starts at y ≥ 0.75), or the combination of both, from forcing column mode on the single-column body text between them.
+- **`PADDLE_PRESERVE_LINE_BOXES=True`** or **`CONVERT_LINE_TO_WORD_LEVEL=False`** with Paddle: keep Paddle line boxes (skip word split + regrouping); line numbers still use column reading order.
+### Simple text extraction (PyMuPDF)
+[`redact_text_pdf`](tools/file_redaction.py) → [`process_page_to_structured_ocr_pymupdf`](tools/file_redaction.py) calls [`reorder_structured_text_lines`](tools/ocr_reading_order.py) after collecting lines, using **`page.mediabox`** width/height for full-span header detection.
+`reorder_structured_text_lines` now mirrors `build_line_groups` (local OCR route):
+1. **Column-aware sort** (`sort_reading_order` / `assign_layout_boxes` / `detect_column_split_xpoints`) — or legacy top-left for single-column pages.
+2. **Y-band grouping** (`group_into_lines`) — merges any same-row PyMuPDF lines that were emitted as separate objects (e.g. mixed-font spans) and splits horizontally-disparate boxes via `_finalize_line`.  *Column mode only.*
+3. **Secondary sub-column pass** (`_reorder_lines_column_major`) — ensures correct column-major order when sub-columns sit within a single macro-column.  *Column mode only.*
+4. When a group contains more than one box, constituent boxes are **merged** into a single `OCRResult` (union bbox, joined text, concatenated chars/words).
+In single-column / legacy mode only step 1 is applied; PyMuPDF lines are pre-formed so no merging is needed.
+### Tunables (both routes)
+`OCR_FULL_SPAN_WIDTH_RATIO`, `OCR_COLUMN_GAP_MIN_FRACTION`, `OCR_COLUMN_GUTTER_MIN_FRACTION`, `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` (default `0.015` — fine-grained gutter scan in `assign_layout_boxes`; lower = detects narrower sub-column boundaries), `OCR_COLUMN_MIN_GUTTER_ROWS`, `OCR_COLUMN_MAX_BOX_HEIGHT_RATIO`, `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, `OCR_COLUMN_FOOTER_ZONE_FRACTION`, `OCR_LINE_SPLIT_GAP_FRACTION` (default 0.025 — horizontal gap fraction that forces a line split; must be below the narrowest column gutter, ~0.030 for two-page spreads; also used as the gap threshold for the secondary sub-column sort in `build_line_groups`), `OCR_LINE_Y_THRESHOLD_FRACTION` (default 0.013 — row-alignment tolerance as a fraction of page height; reduced from 0.015 to correctly separate tightly-set 10 pt body text whose row spacing is ~0.014), `OCR_LINE_Y_THRESHOLD_MIN_PX`.
+**Sub-column ordering** (`build_line_groups`): after the primary word-level column sort, a second pass (`_reorder_lines_column_major`) clusters the produced line groups by their leftmost x-position using `OCR_LINE_SPLIT_GAP_FRACTION` as the gap threshold. This ensures that adjacent narrow sub-columns whose word-level centre gap is below `column_gap_threshold` (e.g. two columns on a spread where each page is already one macro-column) are still output in left-to-right column-major order rather than interleaved by y-position.
+**Fine-grained gutter-based column assignment** (`assign_layout_boxes`): before falling back to centre-gap clustering, `detect_column_split_xpoints` scans the page for structural gutters at the finer `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` threshold (default 0.015). Each qualifying gutter cluster produces a `(split_x, y_min)` pair — the split point is only applied to boxes whose `top ≥ y_min`, preventing a narrow sub-column gutter (visible only in the lower two-column section) from mis-splitting a full-width introductory paragraph that sits above it. This correctly separates narrow adjacent columns (e.g. 1.9 % gutter on a two-page spread) without fragmenting full-width headings or paragraphs.
+Changing line order affects PII page text, duplicate-page detection, and review CSV line indices on multi-column documents; re-review after upgrading.
+## Agentic / programmatic access (two surfaces)
+### 1. FastAPI Agent API (recommended for LLM agents: small JSON bodies)
+When `RUN_FASTAPI` is true, routes are mounted under **`/agent`** ([agent_routes.py](agent_routes.py)).
+- **Catalog**: `GET /agent/operations` — maps each Gradio `api_name` to an HTTP path and notes whether the route is implemented via CLI or returns HTTP 501 for Gradio-only flows.
+- **Implemented POST routes** (CLI- or [tools/simplified_api.py](tools/simplified_api.py)-backed where noted):
+  `redact_document`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_pdfs`, `combine_review_csvs`, `export_review_redaction_overlay`, `export_review_page_ocr_visualisation`, `apply_review_redactions`, **`verify_redaction_coverage`** (Pass 1 QA: `must_redact` / `must_not_redact` regex lists, optional `redacted_pdf_path`, optional `auto_prune_suspicious` + `pruned_output_path`; returns `pass_strict`, `pass_with_cleanup`, `pages_flagged_for_vlm`, `pages_needing_csv_cleanup`), **`word_level_ocr_text_search`** (headless word OCR search with optional review-box overlap flags).
+**Optional post-redaction Pass 1 QA (main app / CLI):** When `POST_REDACT_PASS1_QA=True` in [`tools/config.py`](tools/config.py) (or `config/app_config.env`), initial redaction emits `*_coverage_report.json` beside the review CSV and optionally `*_review_file_pruned.csv` (sibling, when `POST_REDACT_PASS1_AUTO_PRUNE=True`). Uses deny/allow lists and/or `POST_REDACT_PASS1_MUST_REDACT_PATH` / `POST_REDACT_PASS1_MUST_NOT_REDACT_PATH`. CLI overrides: `--post-redact-pass1-qa`, `--post-redact-pass1-auto-prune`. This is pre-review-apply sanity QA only — agent Pass 1 (policy edits + `/review_apply`) remains separate.
+  Note: on Gradio ([app.py](app.py)), the Review-tab visual exports use `api_name` **`page_redaction_review_image`** and **`page_ocr_review_image`**; the **`/agent`** routes above keep the explicit `export_review_*` names for the same operations.
+- **Gradio-only stubs** (501 + JSON hint): `load_and_prepare_documents_or_data`.
+- **Auth**: If `AGENT_API_KEY` is set in the environment, send header `X-Agent-API-Key` with that value.
+- **Paths**: Inputs must resolve to files under the repo root, `INPUT_FOLDER`, or `OUTPUT_FOLDER` (see router validation).
+Implementation uses **`cli_redact.main(direct_mode_args=...)`** where a CLI task exists (same behaviour as [cli_redact.py](cli_redact.py)); `apply_review_redactions` calls [tools/simplified_api.py](tools/simplified_api.py) instead.
+### 2. Gradio Client API (e.g. Hugging Face Spaces)
+For remote Spaces or any Gradio deployment exposing the HTTP API:
+- **Schema**: `GET https://<host>/gradio_api/info`
+- **Call**: `POST https://<host>/gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order matches the named endpoint’s component list).
+- **Poll**: `GET https://<host>/gradio_api/call/{api_name}/{event_id}`
+- **Hugging Face**: `Authorization: Bearer $HF_TOKEN`
+Named `api_name` values in this app include: `redact_document`, `load_and_prepare_documents_or_data`, `apply_review_redactions`, **`doc_redact`** (simple `gr.api`: one PDF/image + optional OCR/PII knobs; returns `(output_paths, message)`; `api_name='/doc_redact'`; parameters include `document_file`, `redact_entities`, `output_dir`, `ocr_method`, `pii_method`, `allow_list`, `deny_list`, `page_min`, `page_max`, **`handwrite_signature_checkbox`** — AWS Textract extraction options such as `Extract handwriting` / `Extract signatures`), **`review_apply`** (simple `gr.api`: PDF + `*_review_file.csv`; returns `(output_paths, message)`; `api_name='/review_apply'`), **`preview_boxes`** (simple `gr.api`: PDF + `*_review_file.csv`; renders proposed boxes onto the original PDF and returns `(zip_path, message)` — use to verify coordinates *before* calling `review_apply`, no redaction applied; `api_name='/preview_boxes'`), **`pdf_summarise`** (simple `gr.api`: PDF + optional summarisation/OCR knobs; returns `(output_paths, status_message, summary_text)`; `api_name='/pdf_summarise'`), **`tabular_redact`** (simple `gr.api`: one tabular file (CSV/XLSX/Parquet/DOCX) + optional knobs; returns `(output_paths, message)`; `api_name='/tabular_redact'`), **`page_redaction_review_image`** (short review overlay export; `api_name='/page_redaction_review_image'`), **`page_ocr_review_image`** (short OCR visualisation export; `api_name='/page_ocr_review_image'`), `word_level_ocr_text_search`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_csvs`, `combine_review_pdfs`. The matching **`POST /agent`** names for those two visual exports are `export_review_redaction_overlay` and `export_review_page_ocr_visualisation` (§1). 
Many endpoints require **many positional arguments** (full Gradio state); prefer the short `gr.api` routes above or **`POST /agent/apply_review_redactions`** where applicable instead of building the full `data` array from `/gradio_api/info`.
+## CLI parity
+For scripting and tests, `python cli_redact.py` with flags is authoritative; programmatic merges use `get_cli_default_args_dict()` in [cli_redact.py](cli_redact.py).
+## Security and data handling
+- Do not commit API keys, tokens, or customer data.
+- Treat paths as untrusted outside validated roots (see [tools/secure_path_utils.py](tools/secure_path_utils.py)).
+- Optional `instruction` / LLM fields must not be passed into shell or unconstrained config keys.
+## Conventions for PRs
+- Keep changes focused; avoid drive-by refactors.
+- Match existing naming and patterns in [app.py](app.py) and [tools/](tools/).
+- Update tests when behaviour changes; run `pytest` before merge.

Dockerfile ADDED Viewed

	@@ -0,0 +1,232 @@

+# Stage 1: Build dependencies and download models
+# All Python packages in this stage are installed with --target=/install so that
+# later stages can copy that directory instead of carrying the build toolchain.
+FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS builder
+# Install system dependencies
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+        g++ \
+        make \
+        cmake \
+        unzip \
+        libcurl4-openssl-dev \
+        git \
+    && pip install --upgrade pip \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /src
+COPY requirements_lightweight.txt .
+RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
+# Optional extras are selected with build args; each ARG is mirrored into ENV and
+# compared against the literal string "True" in the RUN conditions below.
+ARG INSTALL_GRADIO_MCP=False
+ENV INSTALL_GRADIO_MCP=${INSTALL_GRADIO_MCP}
+RUN if [ "$INSTALL_GRADIO_MCP" = "True" ]; then \
+    pip install --verbose --no-cache-dir --force-reinstall --target=/install "gradio[mcp]<=6.10.0"; \
+fi
+# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. Note that GPU-enabled PaddleOCR is unlikely to work in the same environment as a GPU-enabled version of PyTorch, so it is recommended to install PaddleOCR as a CPU-only version if you want to use GPU-enabled PyTorch.
+ARG INSTALL_PADDLEOCR=False
+ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
+ARG PADDLE_GPU_ENABLED=False
+ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
+RUN if [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "False" ]; then \
+    pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
+    pip install --verbose --no-cache-dir --target=/install "paddlepaddle<=3.2.1" && \
+    pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
+elif [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
+    pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
+    pip install --verbose --no-cache-dir --target=/install "paddlepaddle-gpu<=3.2.1" --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ && \
+    pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
+fi
+ARG INSTALL_VLM=False
+ENV INSTALL_VLM=${INSTALL_VLM}
+ARG TORCH_GPU_ENABLED=False
+ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
+# Optionally install VLM/LLM packages if the INSTALL_VLM environment variable is set to True.
+# The CPU branch pins +cpu wheels from the PyTorch CPU index; the GPU branch uses the cu129 index
+# and additionally installs optimum plus prebuilt flash-attn and GPTQModel wheels for cp312 / torch 2.8.
+RUN if [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "False" ]; then \
+    pip install --verbose --no-cache-dir --target=/install \
+    "torch==2.9.1+cpu" \
+    "torchvision==0.24.1+cpu" \
+    "transformers<=5.5.4" \
+    "accelerate<=1.13.0" \
+    "bitsandbytes<=0.49.2" \
+    "sentencepiece<=0.2.1" \
+    --extra-index-url https://download.pytorch.org/whl/cpu; \
+elif [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "True" ]; then \
+    pip install --verbose --no-cache-dir --target=/install "torch<=2.8.0" --index-url https://download.pytorch.org/whl/cu129 && \
+    pip install --verbose --no-cache-dir --target=/install "torchvision<=0.23.0" --index-url https://download.pytorch.org/whl/cu129 && \
+    pip install --verbose --no-cache-dir --target=/install \
+        "transformers<=5.5.4" \
+        "accelerate<=1.13.0" \
+        "bitsandbytes<=0.49.2" \
+        "sentencepiece<=0.2.1" && \
+    pip install --verbose --no-cache-dir --target=/install "optimum<=2.1.0" && \
+    pip install --verbose --no-cache-dir --target=/install  https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl && \
+    pip install --verbose --no-cache-dir --target=/install  https://github.com/ModelCloud/GPTQModel/releases/download/v5.8.0/gptqmodel-5.8.0+cu128torch2.8-cp312-cp312-linux_x86_64.whl; \
+fi
+# ===================================================================
+# Stage 2: A common base for both Lambda and Gradio
+# ===================================================================
+FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS base
+# MUST re-declare ARGs in every stage where they are used in RUN commands
+ARG TORCH_GPU_ENABLED=False
+ARG PADDLE_GPU_ENABLED=False
+ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
+ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
+# Runtime OS packages; libgomp1 is only added when either GPU flag is "True".
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    tesseract-ocr \
+    poppler-utils \
+    libgl1 \
+    libglib2.0-0 && \
+    if [ "$TORCH_GPU_ENABLED" = "True" ] || [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
+        apt-get install -y --no-install-recommends libgomp1; \
+    fi && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+# APP_HOME is set in its own ENV instruction so the ENV block below can reference it.
+ENV APP_HOME=/home/user
+# Set env variables for Gradio & other apps
+ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
+    MPLCONFIGDIR=/tmp/matplotlib_cache/ \
+    GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
+    GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
+    FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
+    ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
+    USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
+    CONFIG_FOLDER=$APP_HOME/app/config/ \
+    XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
+    TESSERACT_DATA_FOLDER=/usr/share/tessdata \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860 \
+    PATH=$APP_HOME/.local/bin:$PATH \
+    PYTHONPATH=$APP_HOME/app \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_ANALYTICS_ENABLED=False
+# Copy Python packages from the builder stage
+COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
+COPY --from=builder /install/bin /usr/local/bin/
+# Reinstall protobuf into the final site-packages. Builder uses multiple `pip install --target=/install`
+# passes; that can break the `google` namespace so `google.protobuf` is missing and Paddle fails at import.
+RUN pip install --no-cache-dir "protobuf<=7.34.0"
+# English pipeline is not a normal PyPI dependency; bundle it in the image so runtime works offline.
+# Placed before COPY app code so application changes do not invalidate this layer.
+RUN python -m spacy download en_core_web_lg
+# Copy your application code and entrypoint
+# NOTE(review): entrypoint.sh is copied explicitly in addition to `COPY .`; this looks redundant
+# unless .dockerignore excludes it — confirm before removing.
+COPY . ${APP_HOME}/app
+COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
+# Fix line endings and set execute permissions
+# (the sed expression strips a trailing carriage return from every line of the script)
+RUN sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
+    && chmod +x ${APP_HOME}/app/entrypoint.sh
+WORKDIR ${APP_HOME}/app
+# ===================================================================
+# FINAL Stage 3: The Lambda Image (runs as root for simplicity)
+# ===================================================================
+# No USER instruction is issued in this stage or in `base`, so the image keeps the default user.
+FROM base AS lambda
+# Set runtime ENV for Lambda mode
+ENV APP_MODE=lambda
+# Exec-form ENTRYPOINT: the CMD value below is passed to entrypoint.sh as its default argument.
+ENTRYPOINT ["/home/user/app/entrypoint.sh"]
+CMD ["lambda_entrypoint.lambda_handler"]
+# ===================================================================
+# FINAL Stage 4: The Gradio Image (runs as a secure, non-root user)
+# ===================================================================
+FROM base AS gradio
+# Set runtime ENV for Gradio mode
+ENV APP_MODE=gradio
+# Create non-root user
+RUN useradd -m -u 1000 user
+# Create the base application directory and set its ownership
+RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
+# Create required sub-folders within the app directory and set their permissions
+# This ensures these specific directories are owned by 'user'
+RUN mkdir -p \
+    ${APP_HOME}/app/output \
+    ${APP_HOME}/app/input \
+    ${APP_HOME}/app/logs \
+    ${APP_HOME}/app/usage \
+    ${APP_HOME}/app/feedback \
+    ${APP_HOME}/app/config \
+    && chown user:user \
+    ${APP_HOME}/app/output \
+    ${APP_HOME}/app/input \
+    ${APP_HOME}/app/logs \
+    ${APP_HOME}/app/usage \
+    ${APP_HOME}/app/feedback \
+    ${APP_HOME}/app/config \
+    && chmod 755 \
+    ${APP_HOME}/app/output \
+    ${APP_HOME}/app/input \
+    ${APP_HOME}/app/logs \
+    ${APP_HOME}/app/usage \
+    ${APP_HOME}/app/feedback \
+    ${APP_HOME}/app/config
+# Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
+# Temp directories get mode 1777 (world-writable with the sticky bit); the XDG cache is private to 'user' (700).
+RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
+    && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
+    && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
+    && chmod 700 ${XDG_CACHE_HOME} \
+    && mkdir -p ${APP_HOME}/.paddlex \
+    && chown user:user ${APP_HOME}/.paddlex \
+    && chmod 755 ${APP_HOME}/.paddlex \
+    && mkdir -p ${APP_HOME}/.local/share/spacy/data \
+    && chown user:user ${APP_HOME}/.local/share/spacy/data \
+    && chmod 755 ${APP_HOME}/.local/share/spacy/data \
+    && mkdir -p /usr/share/tessdata \
+    && chown user:user /usr/share/tessdata \
+    && chmod 755 /usr/share/tessdata
+# Give 'user' ownership of everything under the home directory, including the
+# application code copied into /home/user/app by the base stage
+RUN chown -R user:user /home/user
+# Set permissions for Python executable
+RUN chmod 755 /usr/local/bin/python
+# Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
+VOLUME ["/tmp/matplotlib_cache"]
+VOLUME ["/tmp/gradio_tmp"]
+VOLUME ["/tmp/tld"]
+VOLUME ["/home/user/app/output"]
+VOLUME ["/home/user/app/input"]
+VOLUME ["/home/user/app/logs"]
+VOLUME ["/home/user/app/usage"]
+VOLUME ["/home/user/app/feedback"]
+VOLUME ["/home/user/app/config"]
+VOLUME ["/home/user/.paddlex"]
+VOLUME ["/home/user/.local/share/spacy/data"]
+VOLUME ["/usr/share/tessdata"]
+VOLUME ["/tmp"]
+VOLUME ["/var/tmp"]
+# Everything from here on, including the container process, runs as the non-root user.
+USER user
+EXPOSE $GRADIO_SERVER_PORT
+# Exec-form ENTRYPOINT: the CMD value below is passed to entrypoint.sh as its default arguments.
+ENTRYPOINT ["/home/user/app/entrypoint.sh"]
+CMD ["python", "app.py"]

Dockerfile.pi ADDED Viewed

	@@ -0,0 +1,40 @@

+# syntax=docker/dockerfile:1
+# Image for running the `pi` agent CLI: a Node 22 base with Python 3 and the
+# Python requirements from agent-redact/requirements_pi_agent.txt installed alongside.
+FROM node:22-bookworm-slim
+# Node/npm and apt settings
+ENV NODE_ENV=production
+ENV DEBIAN_FRONTEND=noninteractive
+ENV NPM_CONFIG_LOGLEVEL=warn
+# Python settings: unbuffered output, no .pyc files, and the workspace on the import path
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONPATH=/workspace/doc_redaction
+# System packages; the apt package lists are removed in the same layer
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    bash \
+    git \
+    curl \
+    ca-certificates \
+    procps \
+    python3 \
+    python3-pip \
+    python3-venv \
+    && rm -rf /var/lib/apt/lists/*
+# Install the agent package globally; --ignore-scripts skips npm lifecycle scripts
+RUN npm install -g --ignore-scripts @earendil-works/pi-coding-agent
+# Install the Python requirements into the system interpreter (no virtualenv is created here);
+# --break-system-packages lets pip install into an externally managed environment.
+COPY agent-redact/requirements_pi_agent.txt /tmp/requirements_pi_agent.txt
+RUN pip3 install --no-cache-dir --break-system-packages \
+    -r /tmp/requirements_pi_agent.txt \
+    && rm /tmp/requirements_pi_agent.txt
+# Create the agent session directory and the workspace, both owned by the `node` user
+RUN mkdir -p /home/node/.pi/agent/sessions /workspace/doc_redaction \
+    && chown -R node:node /home/node/.pi /workspace
+WORKDIR /workspace/doc_redaction
+# Run as the non-root `node` user from here on
+USER node
+# Build-time check that `pi` runs as the `node` user
+# NOTE(review): `pi` is presumably the executable provided by the npm package installed above — confirm.
+RUN pi --version
+# `pi` is the entrypoint; CMD is empty, so no default arguments are passed
+ENTRYPOINT ["pi"]
+CMD []

MANIFEST.in ADDED Viewed

	@@ -0,0 +1,4 @@

+# Extra non-Python files to include in the distribution.
+# PNG assets under doc_redaction/assets
+recursive-include doc_redaction/assets *.png
+# Everything under doc_redaction/example_data
+recursive-include doc_redaction/example_data *
+# Text files under intros
+recursive-include intros *.txt

README.md ADDED Viewed

	@@ -0,0 +1,344 @@

+---
+title: Document redaction
+emoji: 📝
+colorFrom: blue
+colorTo: yellow
+sdk: docker
+app_file: app.py
+pinned: true
+license: agpl-3.0
+short_description: OCR / redact PDF documents and tabular data
+---
+# Document redaction (doc_redaction)
+<a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
+---
+## 🚀 Quick Start - Installation and first run
+Follow these instructions to get the document redaction application running on your local machine.
+### 1. Package installation
+#### Option 1 - Recommended: Install from source repo
+Clone the repository and install in editable mode:
+```bash
+git clone https://github.com/seanpedrick-case/doc_redaction.git
+cd doc_redaction
+pip install -e .
+```
+##### Install extras (Paddle or Transformers/Torch VLM)
+To install with PaddleOCR:
+```bash
+pip install -e ".[paddle]"
+```
+Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
+```bash
+pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
+```
+If you want to run VLMs / LLMs with the transformers package:
+```bash
+pip install -e ".[vlm]"
+```
+**Note:** It is difficult to get paddlepaddle gpu working in an environment alongside torch. You may well need to reinstall the cpu version to ensure compatibility, and run paddlepaddle-gpu in a separate environment without torch installed. If you get errors related to .dll files following paddle gpu install, you may need to install the latest c++ redistributables. For Windows, you can find them [here](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170). To install the GPU version of Torch, run:
+```bash
+pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129
+pip install torchvision --index-url https://download.pytorch.org/whl/cu129
+```
+#### Option 2 - Install from PyPI
+Create a virtual environment (recommended) and install **doc_redaction**.
+```bash
+python -m venv venv
+# Windows:
+.\venv\Scripts\activate
+# macOS/Linux:
+source venv/bin/activate
+```
+The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
+```bash
+pip install doc_redaction
+```
+Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
+```bash
+pip install "doc_redaction[paddle]"
+```
+For running VLMs / LLMs with the transformers package:
+```bash
+pip install "doc_redaction[vlm]"
+```
+For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
+**Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
+```bash
+python -m app
+```
+**Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
+- It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
+- It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
+- The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
+In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
+#### Option 3 - Docker installation
+The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
+##### With Llama.cpp / vLLM inference server
+The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
+For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
+You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
+##### Without Llama.cpp / vLLM inference server
+If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can adjust the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED config variables to adjust for PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
+### 2. Install prerequisites: Tesseract and Poppler
+This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding. To run the Document Redaction app successfully, these tools need to be installed and either 1. added to PATH, or 2. be in a folder that is directly referenced in the config/app_config.env file with the variables TESSERACT_FOLDER and POPPLER_FOLDER (defined [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/tools/config.py) if you want to see the code). The instructions below will guide you through different ways to install these dependencies.
+---
+#### Automated dependency setup (recommended)
+If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
+You need the installer script available first, which means either:
+- **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
+- **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
+From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
+```bash
+python -m doc_redaction.install_deps
+```
+This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
+To just check whether your machine can already see the tools:
+```bash
+python -m doc_redaction.install_deps --verify-only
+```
+#### **On Windows**
+If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
+1.  **Install Tesseract OCR:**
+    *   Download the installer from the [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
+    *   Run the installer.
+    *   **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
+2.  **Install Poppler:**
+    *   Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
+    *   Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
+    *   You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
+        *   Search for "Edit the system environment variables" in the Windows Start Menu and open it.
+        *   Click the "Environment Variables..." button.
+        *   In the "System variables" section, find and select the `Path` variable, then click "Edit...".
+        *   Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
+        *   Click OK on all windows to save the changes.
+    To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
+---
+#### **On Linux (Debian/Ubuntu)**
+Open your terminal and run the following command to install Tesseract and Poppler:
+```bash
+sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
+```
+#### **On Linux (Fedora/CentOS/RHEL)**
+Open your terminal and use the `dnf` or `yum` package manager:
+```bash
+sudo dnf install -y tesseract poppler-utils
+```
+---
+### 3. Run the Application
+With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
+```bash
+python app.py
+```
+After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
+Open this URL in your web browser to use the document redaction tool.
+#### Command line interface
+For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321)
+If you installed from **PyPI**, use the installed console script:
+```bash
+cli_redact --help
+```
+From a **repository checkout**, you can also run:
+```bash
+python cli_redact.py --help
+```
+#### Python package commands
+For Python examples in using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
+---
+### 4. ⚙️ Configuration (Optional)
+You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the potential settings you can modify in the app_config.env file can be seen in tools/config.py, with explanations on the [documentation website](https://seanpedrick-case.github.io/doc_redaction/).
+To get started:
+1.  Copy `config/app_config.env.example` to `config/app_config.env`.
+2.  Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
+If you do not create this file, the application will run with default settings.
+#### Configuration Breakdown
+Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
+---
+#### **Local & General Settings (No AWS Required)**
+These settings are useful for all users, regardless of whether you are using AWS.
+*   `TESSERACT_FOLDER` / `POPPLER_FOLDER`
+    *   Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
+    *   Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
+    *   **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
+*   `TESSERACT_DATA_FOLDER`
+    *   If Tesseract runs but you see an error like `Error opening data file ./eng.traineddata` or `Tesseract couldn't load any languages`, this is usually because it can't find the `tessdata/` language files.
+    *   Set this to the folder that contains `eng.traineddata` (typically a `tessdata` directory).
+    *   **Examples (Windows):** `TESSERACT_DATA_FOLDER=C:/Program Files/Tesseract-OCR/tessdata`
+*   `SHOW_LANGUAGE_SELECTION=True`
+    *   Set to `True` to display a language selection dropdown in the UI for OCR processing.
+*   `DEFAULT_LOCAL_OCR_MODEL=tesseract`
+    *   Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. `tesseract` is the default, and is recommended. `hybrid-paddle` is a combination of the two - the first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. `paddle` will only return whole line text extraction, and so will only work for OCR, not redaction.
+*   `SESSION_OUTPUT_FOLDER=False`
+    *   If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
+*   `DISPLAY_FILE_NAMES_IN_LOGS=False`
+    *   For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
+---
+#### **AWS-Specific Settings**
+These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
+*   `RUN_AWS_FUNCTIONS=True`
+    *   **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
+*   **UI Options:**
+    *   `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
+    *   `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
+*   **Core AWS Configuration:**
+    *   `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
+    *   `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
+*   **AWS Logging:**
+    *   `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
+    *   `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
+*   **Advanced AWS Textract Features:**
+    *   `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
+    *   `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
+    *   `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
+*   **Cost Tracking (for internal accounting):**
+    *   `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
+    *   `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
+    *   `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
+    *   `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
+Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
+## For agents (API quickstart)
+If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
+- **Discover schema**: `GET /gradio_api/info`
+- **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
+- **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
+- **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
+- **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
+### Choose the correct route (prefer short `gr.api` endpoints)
+Fetch `/gradio_api/info` and then prefer the simplest route that exists:
+- **Apply edited review CSV to a PDF**: `/review_apply`
+- **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
+- **Summarise a PDF**: `/pdf_summarise`
+- **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
+If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
+### Common gotchas
+- **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
+- **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
+- **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
+### Optional: MCP server
+If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
+**Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
+To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions above).
+For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings).  AWS Comprehend gives better results at a small cost.
+Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
+NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.

README_PYPI.md ADDED Viewed

	@@ -0,0 +1,328 @@

+# Document redaction (doc_redaction)
+<a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
+---
+## 🚀 Quick Start - Installation and first run
+Follow these instructions to get the document redaction application running on your local machine.
+### 1. Package installation
+#### Option 1 - Recommended: Install from source repo
+Clone the repository and install in editable mode:
+```bash
+git clone https://github.com/seanpedrick-case/doc_redaction.git
+cd doc_redaction
+pip install -e .
+```
+##### Install extras (Paddle or Transformers/Torch VLM)
+To install with PaddleOCR:
+```bash
+pip install -e ".[paddle]"
+```
+Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
+```bash
+pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
+```
+If you want to run VLMs / LLMs with the transformers package:
+```bash
+pip install -e ".[vlm]"
+```
+**Note:** It is difficult to get paddlepaddle gpu working in an environment alongside torch. You may well need to reinstall the cpu version to ensure compatibility, and run paddlepaddle-gpu in a separate environment without torch installed. If you get errors related to .dll files following paddle gpu install, you may need to install the latest c++ redistributables. For Windows, you can find them [here](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170). To install the GPU version of Torch, run:
+```bash
+pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129
+pip install torchvision --index-url https://download.pytorch.org/whl/cu129
+```
+#### Option 2 - Install from PyPI
+Create a virtual environment (recommended) and install **doc_redaction**.
+```bash
+python -m venv venv
+# Windows:
+.\venv\Scripts\activate
+# macOS/Linux:
+source venv/bin/activate
+```
+The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
+```bash
+pip install doc_redaction
+```
+Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
+```bash
+pip install "doc_redaction[paddle]"
+```
+For running VLMs / LLMs with the transformers package:
+```bash
+pip install "doc_redaction[vlm]"
+```
+For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
+**Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
+```bash
+python -m app
+```
+**Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
+- It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
+- It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
+- The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
+In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
+#### Option 3 - Docker installation
+The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
+##### With Llama.cpp / vLLM inference server
+The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
+For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
+You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
+##### Without Llama.cpp / vLLM inference server
+If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can adjust the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED config variables to adjust for PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
+### 2. Install prerequisites: Tesseract and Poppler
+This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding.
+---
+#### Automated dependency setup (recommended)
+If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
+You need the installer script available first, which means either:
+- **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
+- **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
+From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
+```bash
+python -m doc_redaction.install_deps
+```
+This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
+To just check whether your machine can already see the tools:
+```bash
+python -m doc_redaction.install_deps --verify-only
+```
+#### **On Windows**
+If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
+1.  **Install Tesseract OCR:**
+    *   Download the installer from the [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
+    *   Run the installer.
+    *   **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
+2.  **Install Poppler:**
+    *   Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
+    *   Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
+    *   You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
+        *   Search for "Edit the system environment variables" in the Windows Start Menu and open it.
+        *   Click the "Environment Variables..." button.
+        *   In the "System variables" section, find and select the `Path` variable, then click "Edit...".
+        *   Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
+        *   Click OK on all windows to save the changes.
+    To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
+---
+#### **On Linux (Debian/Ubuntu)**
+Open your terminal and run the following command to install Tesseract and Poppler:
+```bash
+sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
+```
+#### **On Linux (Fedora/CentOS/RHEL)**
+Open your terminal and use the `dnf` or `yum` package manager:
+```bash
+sudo dnf install -y tesseract poppler-utils
+```
+---
+### 3. Run the Application
+With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
+```bash
+python app.py
+```
+After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
+Open this URL in your web browser to use the document redaction tool.
+#### Command line interface
+For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321).
+If you installed from **PyPI**, use the installed console script:
+```bash
+cli_redact --help
+```
+From a **repository checkout**, you can also run:
+```bash
+python cli_redact.py --help
+```
+#### Python package commands
+For examples of using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
+---
+### 4. ⚙️ Configuration (Optional)
+You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the settings you can modify in the `app_config.env` file can be found in `tools/config.py`, with explanations on the [documentation website](https://seanpedrick-case.github.io/doc_redaction/).
+To get started:
+1.  Copy `config/app_config.env.example` to `config/app_config.env`.
+2.  Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
+If you do not create this file, the application will run with default settings.
+#### Configuration Breakdown
+Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
+---
+#### **Local & General Settings (No AWS Required)**
+These settings are useful for all users, regardless of whether you are using AWS.
+*   `TESSERACT_FOLDER` / `POPPLER_FOLDER`
+    *   Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
+    *   Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
+    *   **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
+*   `SHOW_LANGUAGE_SELECTION=True`
+    *   Set to `True` to display a language selection dropdown in the UI for OCR processing.
+*   `DEFAULT_LOCAL_OCR_MODEL=tesseract`
+    *   Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. `tesseract` is the default and is recommended. `hybrid-paddle` combines the two: a first pass is done with Tesseract, then a second pass is done with PaddleOCR on words with low confidence. `paddle` only returns whole-line text extraction, so it will only work for OCR, not redaction.
+*   `SESSION_OUTPUT_FOLDER=False`
+    *   If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
+*   `DISPLAY_FILE_NAMES_IN_LOGS=False`
+    *   For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
+---
+#### **AWS-Specific Settings**
+These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
+*   `RUN_AWS_FUNCTIONS=True`
+    *   **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
+*   **UI Options:**
+    *   `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
+    *   `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
+*   **Core AWS Configuration:**
+    *   `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
+    *   `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
+*   **AWS Logging:**
+    *   `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
+    *   `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
+*   **Advanced AWS Textract Features:**
+    *   `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
+    *   `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
+    *   `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
+*   **Cost Tracking (for internal accounting):**
+    *   `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
+    *   `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
+    *   `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
+    *   `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
+Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
+## For agents (API quickstart)
+If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
+- **Discover schema**: `GET /gradio_api/info`
+- **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
+- **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
+- **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
+- **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
+### Choose the correct route (prefer short `gr.api` endpoints)
+Fetch `/gradio_api/info` and then prefer the simplest route that exists:
+- **Apply edited review CSV to a PDF**: `/review_apply`
+- **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
+- **Summarise a PDF**: `/pdf_summarise`
+- **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
+If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
+### Common gotchas
+- **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
+- **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
+- **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
+### Optional: MCP server
+If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
+**Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
+To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions below).
+For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings).  AWS Comprehend gives better results at a small cost.
+Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
+NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.

agent-redact/README.md ADDED Viewed

	@@ -0,0 +1,25 @@

+# Agent redaction (Pi)
+Pi-based agentic document redaction: local Docker orchestration and Hugging Face Space packaging.
+| Path | Purpose |
+|------|---------|
+| [`pi/`](pi/) | Gradio UI, Pi RPC client, remote redaction helpers, runtime config |
+| [`pi-agent/`](pi-agent/) | HF Space Dockerfile, sync script, and manifest |
+| [`requirements_pi_agent.txt`](requirements_pi_agent.txt) | Python deps for the Pi agent image |
+Per-user output isolation uses Gradio `session_hash` subfolders under `PI_WORKSPACE_DIR` (see `agent-redact/pi/session_workspace.py`). Enabled by default locally and on HF Spaces. Set `PI_SESSION_WORKSPACE=false` only if you want one shared workspace tree for all sessions.
+## Local Docker
+Use the `pi-agent` service in [`docker-compose_llama_agentic.yml`](../docker-compose_llama_agentic.yml) (profile `27b_36`). See [`pi/agent/README.md`](pi/agent/README.md).
+## Hugging Face Space
+Build from repo root:
+```bash
+docker build -f agent-redact/pi-agent/Dockerfile .
+```
+Sync to Space on pushes to `dev` via [`.github/workflows/sync-pi-agent-space.yml`](../.github/workflows/sync-pi-agent-space.yml).

agent-redact/pi-agent/.dockerignore ADDED Viewed

	@@ -0,0 +1,10 @@

+.git
+.github
+**/__pycache__
+**/*.pyc
+**/.pytest_cache
+**/node_modules
+workspace
+output
+input
+config/pi_agent.env

agent-redact/pi-agent/.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Example PDFs must be plain files in the Space repo (not Git LFS pointers).
2	+ *.pdf -filter -diff -merge

agent-redact/pi-agent/Dockerfile ADDED Viewed

	@@ -0,0 +1,70 @@

+# syntax=docker/dockerfile:1
+# Pi agent Gradio UI for Hugging Face Docker Space (remote doc_redaction backend).
+# Build from monorepo root: docker build -f agent-redact/pi-agent/Dockerfile .
+FROM node:22-bookworm-slim
+# Environment baked into the image: Node/npm and Python behaviour, Pi provider/model
+# defaults, the remote redaction backend URL, Gradio bind address/port, and the
+# workspace, upload and session paths.
+ENV NODE_ENV=production
+ENV DEBIAN_FRONTEND=noninteractive
+ENV NPM_CONFIG_LOGLEVEL=warn
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONPATH=/workspace/doc_redaction:/workspace/doc_redaction/agent-redact/pi
+ENV PI_DEPLOYMENT_PROFILE=hf-space
+ENV PI_DEFAULT_PROVIDER=google-gemini
+ENV PI_DEFAULT_MODEL=gemini-flash-lite-latest
+ENV DOC_REDACTION_GRADIO_URL=https://seanpedrickcase-document-redaction.hf.space
+ENV GRADIO_SERVER_NAME=0.0.0.0
+ENV GRADIO_SERVER_PORT=7860
+ENV PI_WORKSPACE_DIR=/home/user/app/workspace
+ENV PI_WORKDIR=/workspace/doc_redaction
+ENV PI_UPLOAD_ROOT=/tmp/gradio
+ENV PI_SESSION_DIR=/tmp/pi-sessions
+ENV PI_OFFLINE=1
+ENV PI_SKIP_VERSION_CHECK=1
+ENV PI_GRADIO_SHOW_EXAMPLES=true
+ENV HOME=/home/node
+# OS packages: shell, git, curl and CA certificates, procps, and Python 3 with pip/venv.
+# The apt lists are removed in the same layer to keep the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    bash \
+    git \
+    curl \
+    ca-certificates \
+    procps \
+    python3 \
+    python3-pip \
+    python3-venv \
+    && rm -rf /var/lib/apt/lists/*
+# Install the Pi CLI globally; --ignore-scripts skips npm lifecycle scripts.
+RUN npm install -g --ignore-scripts @earendil-works/pi-coding-agent
+# Install Python requirements into the system interpreter, then drop the temp copy.
+COPY agent-redact/requirements_pi_agent.txt /tmp/requirements_pi_agent.txt
+RUN pip3 install --no-cache-dir --break-system-packages \
+    -r /tmp/requirements_pi_agent.txt \
+    && rm /tmp/requirements_pi_agent.txt
+# Application sources, copied relative to the build context into PI_WORKDIR.
+WORKDIR /workspace/doc_redaction
+COPY agent-redact/pi agent-redact/pi
+COPY skills skills
+COPY tools tools
+COPY config config
+COPY intros intros
+COPY AGENTS.md AGENTS.md
+COPY doc_redaction/example_data doc_redaction/example_data
+# Fail the build if either example PDF is missing, or if the first one is still a
+# Git LFS pointer file rather than real PDF content.
+RUN test -f doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
+    && test -f doc_redaction/example_data/graduate-job-example-cover-letter.pdf \
+    && ! head -1 doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
+        | grep -q "^version https://git-lfs.github.com/spec/v1"
+# Create the Pi config, workspace, upload and session directories and give the
+# unprivileged node user ownership of them (and of /workspace).
+RUN mkdir -p /home/node/.pi/agent /home/user/app/workspace /tmp/gradio /tmp/pi-sessions \
+    && chown -R node:node /home/node/.pi /home/user/app /tmp/gradio /tmp/pi-sessions /workspace
+USER node
+# Build-time check that the pi CLI runs as the node user.
+RUN pi --version
+EXPOSE 7860
+# Run pi_agent_config.py first, then exec the Gradio app so it replaces the shell process.
+CMD ["bash", "-c", "python3 agent-redact/pi/pi_agent_config.py && exec python3 agent-redact/pi/gradio_app.py"]

agent-redact/pi-agent/README.md ADDED Viewed

	@@ -0,0 +1,45 @@

+---
+title: Agentic Document Redaction
+emoji: 🤖
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+app_port: 7860
+pinned: false
+license: agpl-3.0
+---
+# Pi agent — agentic document redaction
+Orchestrate document redaction with **[Pi](https://github.com/earendil-works/pi)** and **Google Gemini**. Heavy redaction runs on a separate **private [doc_redaction](https://huggingface.co/spaces/seanpedrickcase/document_redaction)** Hugging Face Space (simple text extraction + Local PII).
+## Before you start
+1. **Gemini API key** — paste in **Agent backend** → **Apply backend** (session-only; not stored on disk).
+2. **HF token** — Space admin should set `HF_TOKEN` under **Settings → Secrets** so this Space can call the private redaction backend. Users may optionally override per session in the UI.
+## Limitations
+- **No face or signature VLM** — text-layer PII only via Local spaCy/Presidio on the remote Space.
+- **No Pass 2 VLM** on this deployment.
+- **Ephemeral storage** — download deliverables from **Workspace output files** before the Space restarts.
+- **Human review** — outputs are not guaranteed complete; review redacted PDFs before release.
+## Defaults
+| Setting | Value |
+|---------|--------|
+| Pi LLM | Gemini (`gemini-flash-lite-latest` default) |
+| Redaction backend | `https://seanpedrickcase-document-redaction.hf.space` |
+| Text extraction | `Local model - selectable text` |
+| PII detection | `Local` |
+## Examples
+Two sample PDFs load in **Redaction task** → **Try an example** (same demos as the main doc_redaction app). Examples are **on by default**; set Space variable `PI_GRADIO_SHOW_EXAMPLES=false` to hide them. (`SHOW_PI_EXAMPLES` is also accepted.)
+If examples do not appear, the UI shows a short status message (usually missing PDFs in the image — rebuild after a successful sync with LFS materialization).
+## Development
+This Space is synced from the [doc_redaction monorepo](https://github.com/seanpedrick-case/doc_redaction) on pushes to **`dev`** (see `.github/workflows/sync-pi-agent-space.yml`). Space: [seanpedrickcase/agentic_document_redaction](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction).

agent-redact/pi-agent/sync-manifest.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+# Paths copied from the monorepo root into the flattened Pi agent HF Space repo.
+agent-redact/requirements_pi_agent.txt
+agent-redact/pi
+skills
+tools
+config/pi_agent.env.example
+intros/pi_intro.txt
+AGENTS.md
+doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+doc_redaction/example_data/graduate-job-example-cover-letter.pdf

agent-redact/pi-agent/sync_to_space.sh ADDED Viewed

	@@ -0,0 +1,42 @@

+#!/usr/bin/env bash
+# Flatten monorepo paths into a temp directory for the Pi agent HF Space repo.
+# Usage (from repo root):
+#   agent-redact/pi-agent/sync_to_space.sh /path/to/output-dir
+#
+# Copies the Space packaging files that sit next to this script, then every path
+# listed in sync-manifest.txt (relative to the monorepo root), keeping each path's
+# relative layout under the output directory. Exits non-zero if a manifest entry
+# is missing or if a copied *.pdf entry is still a Git LFS pointer.
+set -euo pipefail
+# Resolve the script directory once so every later path is absolute.
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+OUT="${1:?Output directory required}"
+MANIFEST="$SCRIPT_DIR/sync-manifest.txt"
+# Succeeds when "$1" is a regular file whose first line is the Git LFS pointer header.
+_is_lfs_pointer() {
+  [[ -f "$1" ]] && head -1 "$1" 2>/dev/null | grep -q "^version https://git-lfs.github.com/spec/v1"
+}
+# The output directory is recreated from scratch on every run.
+rm -rf "$OUT"
+mkdir -p "$OUT"
+cp "$SCRIPT_DIR/Dockerfile" "$OUT/Dockerfile"
+cp "$SCRIPT_DIR/README.md" "$OUT/README.md"
+cp "$SCRIPT_DIR/.dockerignore" "$OUT/.dockerignore"
+cp "$SCRIPT_DIR/.gitattributes" "$OUT/.gitattributes"
+# `|| [[ -n "$line" ]]` still processes a final manifest line that has no trailing newline.
+while IFS= read -r line || [[ -n "$line" ]]; do
+  # Drop a trailing comment, then trim surrounding whitespace (including a CR from
+  # CRLF line endings) with parameter expansion. Unlike `echo | xargs`, this does
+  # not re-parse quotes or backslashes and does not collapse spaces inside a path.
+  line="${line%%#*}"
+  line="${line#"${line%%[![:space:]]*}"}"
+  line="${line%"${line##*[![:space:]]}"}"
+  [[ -z "$line" ]] && continue
+  src="$ROOT/$line"
+  if [[ ! -e "$src" ]]; then
+    echo "Missing: $src" >&2
+    exit 1
+  fi
+  dest="$OUT/$line"
+  mkdir -p "$(dirname "$dest")"
+  cp -a "$src" "$dest"
+  # Only manifest entries that name a .pdf directly are checked for LFS pointers.
+  if [[ "$line" == *.pdf ]] && _is_lfs_pointer "$dest"; then
+    echo "Copied file is a Git LFS pointer, not a PDF: $line" >&2
+    echo "Run 'git lfs pull' in the monorepo before syncing." >&2
+    exit 1
+  fi
+done < "$MANIFEST"
+echo "Flattened Pi agent Space tree: $OUT"

agent-redact/pi/agent/README.md ADDED Viewed

	@@ -0,0 +1,183 @@

+# Pi agent config (Docker)
+Runtime Pi config is **generated at container start** by [`agent-redact/pi/pi_agent_config.py`](../pi_agent_config.py) into `~/.pi/agent/models.json` and `~/.pi/agent/settings.json`.
+Files in this folder (`settings.json`, `models.json`) are **templates/references** only — they are no longer bind-mounted into the container.
+## LLM backends (Pi orchestration)
+The Pi agent (chat + redaction orchestration) can use:
+| Provider key | Label | Pi API | Auth |
+|--------------|-------|--------|------|
+| `llama-cpp` | Local (llama-cpp) | `openai-completions` | None (local llama-inference) |
+| `google-gemini` | Gemini | `google-generative-ai` | `GEMINI_API_KEY` or `GOOGLE_API_KEY` |
+| `amazon-bedrock` | AWS Bedrock | `bedrock-converse-stream` | AWS SDK credentials (`AWS_ACCESS_KEY_ID`, etc.) |
+This is separate from doc_redaction **Pass 2 VLM** (`{VLM_BASE_URL}` in redaction prompts), which still targets local llama-inference by default.
+### Environment variables
+Copy [`config/pi_agent.env.example`](../../../config/pi_agent.env.example) to `config/pi_agent.env` (gitignored) or set on the host before `docker compose up`:
+| Variable | Purpose |
+|----------|---------|
+| `PI_DEFAULT_PROVIDER` | `llama-cpp` \| `google-gemini` \| `amazon-bedrock` |
+| `PI_DEFAULT_MODEL` | Model id within provider |
+| `PI_LLAMA_BASE_URL` | Local OpenAI-compatible URL (default `http://llama-inference:8080/v1`) |
+| `PI_LLAMA_MODEL_ID` | Local model id |
+| `GEMINI_API_KEY` / `GOOGLE_API_KEY` | Gemini API key |
+| `AWS_REGION` / `AWS_DEFAULT_REGION` | Bedrock region |
+| `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_SESSION_TOKEN` | Bedrock credentials (when not using SSO) |
+| `AWS_PROFILE` | Named profile for SSO / shared credentials file (**required for Pi Bedrock with SSO**) |
+| `PI_AWS_PROFILE` | Alternative to `AWS_PROFILE`; also used to auto-select profile when only `~/.aws` is mounted |
+| `RUN_AWS_FUNCTIONS` | When `True`, use the AWS default credential chain (SSO, profile, role) |
+| `PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS` | When `True` with `RUN_AWS_FUNCTIONS`, prefer SSO/chain over static env keys (default `True`, same as main app) |
+| `PI_MAX_PAGES` | Maximum PDF pages allowed per redaction upload (falls back to `MAX_PAGES` / `MAX_DOC_PAGES`, default `3000`) |
+| `PI_MAX_RETRIES` | Gemini quota / rate-limit retries for Pi auto-retry and Gradio backoff (default `5`; alias `PI_QUOTA_RETRY_ATTEMPTS`) |
+| `PI_QUOTA_RETRY_DELAY_S` | Seconds between Gradio quota retries (default `60`) |
+| `PI_COMPACTION_ENABLED` | Pi session auto-compaction in `settings.json` (`true` / `false`; unset uses template default, enabled) |
+| `PI_COMPACTION_RESERVE_TOKENS` | Optional compaction `reserveTokens` (default `32768` from template) |
+| `PI_COMPACTION_KEEP_RECENT_TOKENS` | Optional compaction `keepRecentTokens` (default `20000` from template) |
+### Usage logging (CSV / DynamoDB / S3)
+Each completed Pi agent run (chat message or redaction task) writes **one row** to the **same usage log schema** as the main redaction app (`USAGE_LOG_FILE_NAME`, `USAGE_LOGS_FOLDER`, `S3_USAGE_LOGS_FOLDER`, `USAGE_LOG_DYNAMODB_TABLE_NAME`). Key fields:
+| Log column | Pi agent value |
+|------------|----------------|
+| `task` | `agent` |
+| `llm_model_name` | Pi provider/model (e.g. `amazon-bedrock/anthropic.claude-sonnet-4-6`) |
+| `text_extraction_method` / `pii_detection_method` | From redaction task settings when applicable |
+| `actual_time_taken_number` | Wall-clock seconds for the Pi RPC turn |
+| `total_page_count` | Pages in scope for PDF redaction tasks |
+| `llm_total_input_tokens` / `llm_total_output_tokens` | Pi orchestration LLM usage for that turn (from Pi `get_session_stats` delta, or assistant `usage` in session JSONL). Includes cache read/write in the input column. **VLM/tokens from doc_redaction Pass 1 are not included** (those stay on the main app usage log when you run redaction there directly). |
+Toggle with `SAVE_LOGS_TO_CSV`, `SAVE_LOGS_TO_DYNAMODB`, and `RUN_AWS_FUNCTIONS` (required for S3 log upload). Access logs on session load use the main app access log paths separately.
+At startup, if only `GOOGLE_API_KEY` is set, it is mirrored to `GEMINI_API_KEY` for Pi.
+### Gradio UI
+Open **http://localhost:7862** → **Agent backend** accordion:
+- Select provider and model
+- Optionally enter Gemini / AWS credentials (**session-only** — not written to disk)
+- Click **Apply backend** — regenerates config, restarts the Pi RPC subprocess, and starts a new session
+Credential fields are cleared after apply.
+## Local model id
+After the llama.cpp service is healthy, confirm the model id:
+```bash
+curl http://localhost:8000/v1/models
+```
+If the returned `id` differs from `unsloth/Qwen3.6-27B-MTP-GGUF`, set `PI_LLAMA_MODEL_ID` in `config/pi_agent.env` or compose environment and restart `pi-agent`.
+## In-container URLs for task prompts
+When filling [`skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md`](../../../skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md) inside the Pi container, use:
+| Placeholder | In-container value |
+|-------------|-------------------|
+| `{GRADIO_URL}` | `http://redaction-app-llama:7860` |
+| `{VLM_BASE_URL}` | `http://llama-inference:8080` |
+| `{INPUT_PATH}` | `/home/user/app/workspace/{session_hash}/{FILE_NAME}` (when `PI_SESSION_WORKSPACE=true`) |
+| `{OUTPUT_BASE}` | `/home/user/app/workspace/{session_hash}/redact/{FILE_NAME}/` |
+Host-side examples (`host.docker.internal`, `localhost:7861`) do not apply inside the compose network.
+## Usage
+Start the stack (27B profile):
+```powershell
+docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 up -d --build
+```
+Interactive Pi TUI:
+```powershell
+docker compose -f docker-compose_llama_agentic.yml exec -it pi-agent pi
+```
+Gradio chat UI (browser):
+Open **http://localhost:7862**. Use the **Redaction task** panel to upload a document, enter bullet-point requirements, and click **Start redaction task**. Pi receives the filled prompt from [`skills/Example prompt partnership.txt`](../../../skills/Example%20prompt%20partnership.txt) (file copied to `/home/user/app/workspace/`). The full prompt appears in the chat; Pi’s reply streams in the chat panel.
+The UI also shows:
+- **Agent backend** — switch between local, Gemini, and Bedrock
+- **Chat** — streamed assistant text
+- **Activity** — agent/turn lifecycle, compaction, auto-retry, tool start/end
+- **Tool output** — live bash/read output from `tool_execution_update` / `tool_execution_end`
+- **Thinking** — optional stream (`PI_GRADIO_SHOW_THINKING=true`)
+- **Abort** — sends Pi RPC `abort` and cancels the in-flight Gradio handler
+- **Workspace output files** — browse and download redaction artifacts
+Optional env vars on `pi-agent`: `PI_GRADIO_SHOW_THINKING`, `PI_GRADIO_SHOW_TOOL_OUTPUT`, `PI_GRADIO_TOOL_OUTPUT_MAX`, `PI_GRADIO_ACTIVITY_MAX_LINES`.
+When a Pi run completes, the chat shows an **Agent finished** (or **Agent stopped**) line, a Gradio info toast appears, and the browser tab title flashes for ~15 seconds. Desktop notifications are shown when the browser has granted notification permission (requested on first click/keypress in the Pi UI).
+Run the UI locally (outside Docker):
+```powershell
+cd agent-redact/pi
+pip install -r ../requirements_pi_agent.txt
+# Pi orchestration subprocess (required for Apply backend / chat):
+npm install -g @earendil-works/pi-coding-agent
+python pi_agent_config.py
+python gradio_app.py
+```
+**Apply backend** starts `pi --mode rpc`. If you see `FileNotFoundError` / “Pi CLI not found”, install Node.js, run the `npm install` line above, and ensure `pi` (or `pi.cmd` on Windows) is on `PATH`. Optional: `PI_EXECUTABLE=C:\Users\you\AppData\Roaming\npm\pi.cmd` in `config/pi_agent.env`.
+RPC mode (automation, no Gradio):
+```powershell
+docker compose -f docker-compose_llama_agentic.yml exec -T pi-agent pi --mode rpc
+```
+Skills are synced from the repo `skills/` tree into **`{PI_WORKSPACE_DIR}/.pi/skills/`** on startup (read-only). Pi runs with `cwd` in the user’s session subfolder and `--no-skills` so it does not load skills from the git checkout. Use `/skill:doc-redaction-app` etc. Set `PI_SKILLS_RESYNC=true` to refresh copies from the repo.
+Sessions persist in the **`pi-agent-sessions`** Docker volume at **`~/.pi/agent/sessions/`** (Pi’s default session location inside the container). Override with `PI_SESSION_DIR` if needed.
+On **HF Space** (`PI_DEPLOYMENT_PROFILE=hf-space`), sessions go to **`/tmp/pi-sessions`** instead (ephemeral; lost on restart).
+## Python dependencies
+The Pi image installs [`requirements_pi_agent.txt`](../../requirements_pi_agent.txt) — Gradio UI + `gradio-client`, HTTP clients, CSV/PDF review helpers (`pandas`, `pymupdf`), and common utilities. It **does not** include spaCy, Presidio, or OCR; heavy redaction runs in `redaction-app-llama`.
+Rebuild after changing that file:
+```powershell
+docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 build pi-agent
+```
+## HF Space profile (remote redaction backend)
+Set `PI_DEPLOYMENT_PROFILE=hf-space` to run the Pi Gradio UI as a **Hugging Face Docker Space** that orchestrates with **Gemini only** and calls a **remote** doc_redaction Space over HTTPS.
+| Area | HF Space value |
+|------|----------------|
+| Pi LLM | Gemini only (`PI_DEFAULT_PROVIDER=google-gemini`) |
+| Redaction app | `DOC_REDACTION_GRADIO_URL` (default `https://seanpedrickcase-document-redaction.hf.space`) |
+| Auth to redaction | `HF_TOKEN` / `DOC_REDACTION_HF_TOKEN` (Space secret + optional UI override) |
+| Text extraction / PII | Locked to `Local model - selectable text` + `Local` |
+| VLM faces / signatures | Disabled |
+| Port | `7860` |
+| Pi session logs | `/tmp/pi-sessions` (`PI_SESSION_DIR`; ephemeral) |
+Package and Dockerfile: [`agent-redact/pi-agent/`](../../pi-agent/). Pushes to [agentic_document_redaction](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction) on **`dev`** branch via [`.github/workflows/sync-pi-agent-space.yml`](../../../.github/workflows/sync-pi-agent-space.yml) (GitHub secrets: `HF_TOKEN`, `HF_USERNAME`, `HF_EMAIL`).
+Local build test from monorepo root:
+```powershell
+docker build -f agent-redact/pi-agent/Dockerfile -t pi-agent-hf-space .
+docker run --rm -p 7860:7860 -e GEMINI_API_KEY=... -e HF_TOKEN=... pi-agent-hf-space
+```
+Pi uses `gradio_client` + `agent-redact/pi/remote_redaction.py` to upload/download from the remote Space; prompts include `{REMOTE_BACKEND_GUIDANCE}` (see [`redaction_prompt.py`](../redaction_prompt.py)).

agent-redact/pi/agent/models.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "providers": {
+    "llama-cpp": {
+      "baseUrl": "http://llama-inference:8080/v1",
+      "api": "openai-completions",
+      "apiKey": "llama-cpp",
+      "compat": {
+        "supportsDeveloperRole": false,
+        "supportsReasoningEffort": false,
+        "supportsUsageInStreaming": false,
+        "maxTokensField": "max_tokens"
+      },
+      "models": [
+        {
+          "id": "unsloth/Qwen3.6-27B-MTP-GGUF",
+          "name": "Qwen 3.6 27B (local)",
+          "reasoning": false,
+          "input": ["text", "image"],
+          "contextWindow": 114688,
+          "maxTokens": 32768,
+          "cost": {
+            "input": 0,
+            "output": 0,
+            "cacheRead": 0,
+            "cacheWrite": 0
+          }
+        }
+      ]
+    }
+  }
+}

agent-redact/pi/agent/settings.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "defaultProvider": "llama-cpp",
+  "defaultModel": "unsloth/Qwen3.6-27B-MTP-GGUF",
+  "defaultThinkingLevel": "off",
+  "hideThinkingBlock": true,
+  "compaction": {
+    "enabled": true,
+    "reserveTokens": 32768,
+    "keepRecentTokens": 20000
+  },
+  "branchSummary": {
+    "skipPrompt": true,
+    "reserveTokens": 32768
+  },
+  "retry": {
+    "enabled": true,
+    "maxRetries": 5,
+    "baseDelayMs": 2000,
+    "provider": {
+      "timeoutMs": 3600000,
+      "maxRetries": 5,
+      "maxRetryDelayMs": 60000
+    }
+  },
+  "enableSkillCommands": true,
+  "sessionDir": "sessions",
+  "steeringMode": "one-at-a-time",
+  "followUpMode": "one-at-a-time",
+  "terminal": {
+    "showTerminalProgress": false
+  }
+}

agent-redact/pi/bootstrap_pi_config.py ADDED Viewed

	@@ -0,0 +1,151 @@

+"""Pi agent process bootstrap (env file + workspace) before ``tools.config`` import."""
+from __future__ import annotations
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+_DOCKER_WORKSPACE = Path("/home/user/app/workspace")
+_DOCKER_UPLOAD_ROOT = Path("/tmp/gradio")
+_DOCKER_PI_WORKDIR = Path("/workspace/doc_redaction")
+_PARTNERSHIP_TEMPLATE = Path("skills") / "Example prompt partnership.txt"
+def _pi_running_in_container() -> bool:
+    """
+    True when the Pi process is inside Docker / HF Space, not local Windows dev.
+    Avoids treating ``C:\\home\\user\\app\\workspace`` (created by mistake on Windows)
+    as the compose mount.
+    """
+    if Path("/.dockerenv").is_file():
+        return True
+    return _DOCKER_PI_WORKDIR.is_dir() and _partnership_template_exists(
+        _DOCKER_PI_WORKDIR
+    )
+def ensure_pi_workspace_dir(repo_root: Path | None = None) -> str:
+    """
+    Resolve ``PI_WORKSPACE_DIR``, create it, and sync ``os.environ``.
+    - Explicit ``PI_WORKSPACE_DIR`` wins.
+    - Else use the Docker mount only when running in a container.
+    - Else ``{repo_root}/workspace`` (local Windows/macOS/Linux dev).
+    """
+    root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
+    raw = (os.environ.get("PI_WORKSPACE_DIR") or "").strip()
+    if raw:
+        path = Path(raw)
+    elif _pi_running_in_container() and _DOCKER_WORKSPACE.is_dir():
+        path = _DOCKER_WORKSPACE
+    else:
+        path = root / "workspace"
+    path.mkdir(parents=True, exist_ok=True)
+    resolved = str(path.resolve())
+    os.environ["PI_WORKSPACE_DIR"] = resolved
+    return resolved
+def ensure_pi_upload_root(repo_root: Path | None = None) -> str:
+    """
+    Resolve where Gradio stores ``gr.File`` uploads and sync ``os.environ``.
+    Must run before ``import gradio`` so ``GRADIO_TEMP_DIR`` matches validation
+    in ``redaction_prompt._resolve_and_validate_upload_path``.
+    - Explicit ``PI_UPLOAD_ROOT`` wins.
+    - Else ``GRADIO_TEMP_DIR`` if already set.
+    - Else Docker ``/tmp/gradio`` when that directory exists.
+    - Else ``{repo}/workspace/.gradio_uploads`` (local dev; stays inside the app tree
+      so ``tools.config.ensure_folder_within_app_directory`` accepts ``GRADIO_TEMP_DIR``).
+    """
+    root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
+    raw = (os.environ.get("PI_UPLOAD_ROOT") or "").strip()
+    if raw:
+        path = Path(raw)
+    else:
+        gradio_temp = (os.environ.get("GRADIO_TEMP_DIR") or "").strip()
+        if gradio_temp:
+            path = Path(gradio_temp)
+        elif _pi_running_in_container() and _DOCKER_UPLOAD_ROOT.is_dir():
+            path = _DOCKER_UPLOAD_ROOT
+        else:
+            path = root / "workspace" / ".gradio_uploads"
+    path.mkdir(parents=True, exist_ok=True)
+    resolved = str(path.resolve())
+    os.environ["PI_UPLOAD_ROOT"] = resolved
+    if not (os.environ.get("GRADIO_TEMP_DIR") or "").strip():
+        os.environ["GRADIO_TEMP_DIR"] = resolved
+    return resolved
+def _partnership_template_exists(repo: Path) -> bool:
+    return (repo / _PARTNERSHIP_TEMPLATE).is_file()
+def ensure_pi_workdir(repo_root: Path | None = None) -> str:
+    """
+    Resolve ``PI_WORKDIR`` (monorepo root for skills/ and Pi RPC cwd).
+    - Explicit ``PI_WORKDIR`` wins when the partnership prompt template exists there.
+    - Else use the checkout root (``agent-redact/pi`` → parents[2]).
+    - Docker images set ``PI_WORKDIR=/workspace/doc_redaction`` via env or ``start.sh``.
+    """
+    root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
+    raw = (os.environ.get("PI_WORKDIR") or "").strip()
+    if raw:
+        candidate = Path(raw)
+        if _partnership_template_exists(candidate):
+            resolved = str(candidate.resolve())
+            os.environ["PI_WORKDIR"] = resolved
+            return resolved
+    if _pi_running_in_container() and _partnership_template_exists(_DOCKER_PI_WORKDIR):
+        resolved = str(_DOCKER_PI_WORKDIR.resolve())
+        os.environ["PI_WORKDIR"] = resolved
+        return resolved
+    resolved = str(root)
+    os.environ["PI_WORKDIR"] = resolved
+    return resolved
+def pi_repo_root_path(repo_root: Path | None = None) -> Path:
+    """Return ``PI_WORKDIR`` as a :class:`~pathlib.Path` (calls :func:`ensure_pi_workdir`)."""
+    return Path(ensure_pi_workdir(repo_root))
+def load_pi_agent_env_file(config_path: str | Path | None = None) -> bool:
+    """
+    Load ``config/pi_agent.env`` into ``os.environ`` (does not override existing vars).
+    Must run before ``import pi_agent_config`` so module-level defaults see the file.
+    """
+    path = Path(config_path or os.environ.get("APP_CONFIG_PATH", "")).expanduser()
+    if not path.is_file():
+        return False
+    load_dotenv(path, override=False)
+    return True
+def ensure_pi_config_env(repo_root: Path | None = None) -> str:
+    """
+    Set process env so ``tools.config`` loads the Pi agent env file.
+    Must run before any ``from pi_agent_config import ...`` or ``tools.config`` import
+    that depends on Pi env vars. Safe to call multiple times; does not override
+    existing environment variables.
+    """
+    root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
+    os.environ.setdefault("APP_TYPE", "pi")
+    if not os.environ.get("APP_CONFIG_PATH", "").strip():
+        os.environ["APP_CONFIG_PATH"] = str(root / "config" / "pi_agent.env")
+    load_pi_agent_env_file()
+    ensure_pi_workdir(root)
+    ensure_pi_workspace_dir(root)
+    ensure_pi_upload_root(root)
+    from pi_workspace_skills import ensure_workspace_skills
+    ensure_workspace_skills()
+    return os.environ["APP_CONFIG_PATH"]

agent-redact/pi/gradio_app.py ADDED Viewed

	@@ -0,0 +1,1769 @@

+#!/usr/bin/env python3
+"""
+Gradio chat UI for Pi (RPC mode).
+Streams Pi RPC events into a chatbot, activity log, tool output panel, and
+optional thinking trace. Includes a redaction task panel driven by the
+partnership prompt template.
+"""
+from __future__ import annotations
+import os
+import sys
+import time
+from pathlib import Path
+from typing import Any
+from fastapi import FastAPI
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from bootstrap_pi_config import ensure_pi_config_env
+ensure_pi_config_env(_REPO_ROOT)
+import gradio as gr
+from output_files import (
+    collect_final_output_files,
+    gradio_allowed_paths,
+    refresh_workspace_output_files_stub,
+    refresh_workspace_panel,
+    workspace_files_download_fn,
+)
+from pi_agent_config import (
+    apply_session_credentials,
+    configure_aws_credentials,
+    credential_status_markdown,
+    default_model_for_provider,
+    gemini_api_key_configured,
+    get_default_provider,
+    is_hf_space_profile,
+    mirror_hf_token_from_env,
+    models_for_provider,
+    normalize_provider,
+    provider_choices,
+    provider_label,
+    resolved_default_model,
+    write_runtime_config,
+)
+from pi_examples import example_rows, examples_status_markdown
+from pi_rpc_client import (
+    PiRpcClient,
+    PiRpcError,
+    PiStreamEvent,
+    assistant_text_since_last_user,
+    default_client,
+    is_rate_limit_error,
+    last_assistant_turn_error,
+)
+from redaction_prompt import (
+    DEFAULT_OCR_METHOD,
+    DEFAULT_PII_METHOD,
+    OCR_METHOD_CHOICES,
+    PII_METHOD_CHOICES,
+    RedactionTaskSettings,
+    pages_to_process_count,
+    pdf_page_count,
+    prepare_redaction_task,
+)
+from session_logs import collect_session_log_download, persist_session_log
+# Before any ``tools.config`` import (e.g. session_workspace): compose may inject
+# empty AWS_REGION= which would freeze a blank region in tools.config.AWS_REGION.
+mirror_hf_token_from_env()
+configure_aws_credentials()
+from pi_session_usage import resolve_session_token_usage, usage_for_completed_turn
+from session_workspace import (
+    init_session_workspace,
+    prepare_session_workspace,
+    session_workspace_dir,
+    workspace_base_dir,
+    workspace_context_prefix,
+)
+from tools.aws_functions import export_outputs_to_s3, s3_outputs_upload_ready
+from tools.config import (
+    ACTIVITY_MAX_LINES,
+    EMPTY_SEND_WITH_FILE_HINT,
+    HOST_NAME,
+    PI_GRADIO_PORT,
+    PI_INTRO_TEXT,
+    PI_UI_HOST,
+    PI_UI_TITLE,
+    QUOTA_CONTINUE_PROMPT,
+    QUOTA_RETRY_ATTEMPTS,
+    QUOTA_RETRY_DELAY_S,
+    RUN_FASTAPI,
+    SAVE_OUTPUTS_TO_S3,
+    SHOW_THINKING,
+    SHOW_TOOL_OUTPUT,
+    THINKING_DISPLAY_MAX,
+    THINKING_PANEL_CSS,
+    TOOL_OUTPUT_MAX,
+)
+from tools.gradio_platform import (
+    create_fastapi_app,
+    log_agent_usage_event,
+    log_platform_access,
+    mount_or_launch,
+)
+# Evaluated once at import; gates the Gemini key requirement in _gemini_key_error.
+IS_HF_SPACE = is_hf_space_profile()
+# Use PI_GRADIO_PORT only — GRADIO_SERVER_PORT is the main app's default (7860) and is
+# written into os.environ during tools.config import, which would override 7862 here.
+PI_UI_PORT = PI_GRADIO_PORT
+# Values carried in the last chat output so the browser-side finish handler
+# (PI_AGENT_FINISH_NOTIFY_JS) knows what to announce. The empty string means
+# "nothing to announce"; the JS compares against the literal "aborted" / "error".
+AGENT_FINISH_SIGNAL_NONE = ""
+AGENT_FINISH_SIGNAL_FINISHED = "finished"
+AGENT_FINISH_SIGNAL_ABORTED = "aborted"
+AGENT_FINISH_SIGNAL_ERROR = "error"
+# Script snippet that requests browser notification permission at most once,
+# on the first click or key press, and only while the permission is still "default".
+# NOTE(review): presumably injected as page <head> HTML — the usage is outside this chunk.
+PI_AGENT_FINISH_HEAD_HTML = """
+<script>
+(function () {
+  function requestNotificationPermissionOnce() {
+    if (typeof Notification === "undefined") return;
+    if (Notification.permission !== "default") return;
+    try { Notification.requestPermission(); } catch (e) {}
+  }
+  document.addEventListener("click", requestNotificationPermissionOnce, { once: true });
+  document.addEventListener("keydown", requestNotificationPermissionOnce, { once: true });
+})();
+</script>
+"""
+# Browser-side handler kept as JavaScript source. It reads the finish signal from
+# the LAST output; when that is empty it returns the outputs untouched. Otherwise it
+# flashes the tab title once per second for 15 s, raises a desktop notification if
+# permission is granted (asking first when still undecided), and finally blanks the
+# signal so the same finish is not announced twice.
+PI_AGENT_FINISH_NOTIFY_JS = """
+async (...outputs) => {
+  const finishSignal = outputs[outputs.length - 1];
+  if (!finishSignal) {
+    return outputs;
+  }
+  const isAborted = finishSignal === "aborted";
+  const isError = finishSignal === "error";
+  const title = isAborted ? "Agent stopped" : (isError ? "Agent error" : "Agent finished");
+  const body = isAborted
+    ? "The Pi agent run was aborted."
+    : (isError
+      ? "The Pi agent run ended with an error."
+      : "The Pi agent has finished its task. Review the chat for results.");
+  const originalTitle = document.title;
+  let flashOn = true;
+  const flashInterval = setInterval(() => {
+    document.title = flashOn ? ("✓ " + title) : originalTitle;
+    flashOn = !flashOn;
+  }, 1000);
+  setTimeout(() => {
+    clearInterval(flashInterval);
+    document.title = originalTitle;
+  }, 15000);
+  if (typeof Notification !== "undefined") {
+    try {
+      if (Notification.permission === "granted") {
+        new Notification(title, { body: body, tag: "pi-agent-finish" });
+      } else if (Notification.permission === "default") {
+        const perm = await Notification.requestPermission();
+        if (perm === "granted") {
+          new Notification(title, { body: body, tag: "pi-agent-finish" });
+        }
+      }
+    } catch (e) {}
+  }
+  outputs[outputs.length - 1] = "";
+  return outputs;
+}
+"""
+# NOTE(review): placeholder; presumably assigned once the UI / ASGI app is built,
+# which happens outside this chunk.
+app = None
+def _agent_finish_chat_notice(*, aborted: bool = False, error: bool = False) -> str:
+    """
+    Build the markdown footer appended to the chat when a run ends.
+    ``aborted`` takes precedence over ``error``; with neither set the
+    "finished" wording is used. The text always starts with a horizontal rule.
+    """
+    if aborted:
+        detail = (
+            "**Agent stopped** — the run was aborted. You can send a follow-up message "
+            "or start a new task."
+        )
+    elif error:
+        detail = (
+            "**Agent stopped** — the run ended with an error. Review the activity log "
+            "and send a follow-up if needed."
+        )
+    else:
+        detail = (
+            "**Agent finished** — the task is complete. Review the outputs below or send "
+            "a follow-up message if you need changes."
+        )
+    return "---\n\n" + detail
+def _show_agent_finish_toast(*, aborted: bool = False, error: bool = False) -> None:
+    """Show a Gradio info toast for the run outcome; ``aborted`` wins over ``error``."""
+    if aborted:
+        toast = "Agent stopped (aborted)."
+    elif error:
+        toast = "Agent stopped with an error."
+    else:
+        toast = "Agent finished — task complete."
+    try:
+        gr.Info(toast, duration=8)
+    except Exception:
+        # Deliberate best effort: a toast that cannot be shown must not break the run.
+        pass
+def _agent_finish_signal_value(*, aborted: bool = False, error: bool = False) -> str:
+    """
+    Map the run outcome to the signal string consumed by the browser notifier.
+    ``aborted`` is checked before ``error`` so the result agrees with
+    ``_agent_finish_chat_notice`` and ``_show_agent_finish_toast``, which both give
+    ``aborted`` precedence. Otherwise a run flagged with both would show "aborted"
+    in the chat and toast but "error" in the browser notification.
+    """
+    if aborted:
+        return AGENT_FINISH_SIGNAL_ABORTED
+    if error:
+        return AGENT_FINISH_SIGNAL_ERROR
+    return AGENT_FINISH_SIGNAL_FINISHED
+def _notify_agent_finished(*, aborted: bool = False, error: bool = False) -> str:
+    """Raise the finish toast, then return the signal string for the browser-notify handler."""
+    _show_agent_finish_toast(aborted=aborted, error=error)
+    signal = _agent_finish_signal_value(aborted=aborted, error=error)
+    return signal
+def _append_agent_finish_notice(
+    history: list[dict[str, Any]],
+    completed_segments: list[str],
+    streaming_text: str,
+    *,
+    aborted: bool = False,
+    error: bool = False,
+) -> tuple[list[dict[str, Any]], list[str], str]:
+    note = _agent_finish_chat_notice(aborted=aborted, error=error)
+    completed_segments, streaming_text = _append_chat_segment(
+        completed_segments, streaming_text, note
+    )
+    if history and history[-1].get("role") == "assistant":
+        history[-1]["content"] = _assistant_display_text(
+            completed_segments, streaming_text
+        )
+    return history, completed_segments, streaming_text
+def _passthrough_chat_outputs(*outputs: Any) -> tuple[Any, ...]:
+    """Passthrough for ``.then(js=...)`` — Gradio forces ``queue=False`` when ``fn is None``."""
+    return outputs
+def _client_provider_model(client: PiRpcClient | None) -> tuple[str, str]:
+    if client is None:
+        return "", ""
+    try:
+        state = client.get_state()
+    except PiRpcError:
+        return "", ""
+    model = state.get("model") or {}
+    provider = str(model.get("provider") or state.get("provider") or "")
+    model_label = str(model.get("id") or model.get("name") or "")
+    return provider, model_label
+def _llm_model_label(client: PiRpcClient | None) -> str:
+    provider, model = _client_provider_model(client)
+    if provider and model:
+        return f"{provider}/{model}"
+    return model or provider
+def _after_pi_task(
+    *,
+    session_hash: str,
+    client: PiRpcClient | None,
+    s3_output_folder: str,
+    save_outputs_to_s3: bool,
+    document_name: str = "",
+    started_at: float | None = None,
+    base_file: str | None = None,
+    ocr_method: str = "",
+    pii_method: str = "",
+    total_page_count: int = 0,
+    vlm_model_name: str | None = None,
+    llm_input_tokens: int = 0,
+    llm_output_tokens: int = 0,
+) -> None:
+    duration = round(time.time() - started_at, 2) if started_at else ""
+    log_agent_usage_event(
+        session_hash=session_hash,
+        duration_seconds=duration,
+        document_name=document_name,
+        total_page_count=total_page_count,
+        ocr_method=ocr_method,
+        pii_method=pii_method,
+        llm_model_name=_llm_model_label(client),
+        vlm_model_name=vlm_model_name or os.environ.get("PI_VLM_MODEL", ""),
+        llm_input_tokens=llm_input_tokens,
+        llm_output_tokens=llm_output_tokens,
+        task="agent",
+    )
+    persist_session_log(client, session_hash=session_hash)
+    file_paths = collect_final_output_files(session_hash)
+    if (
+        file_paths
+        and s3_output_folder
+        and s3_outputs_upload_ready(save_outputs_to_s3=save_outputs_to_s3)
+    ):
+        export_outputs_to_s3(
+            file_paths,
+            s3_output_folder,
+            save_outputs_to_s3,
+            base_file,
+        )
+def _export_workspace_outputs(
+    session_hash: str,
+    s3_output_folder: str,
+    save_outputs_to_s3: bool,
+    base_file: str | None = None,
+) -> None:
+    file_paths = collect_final_output_files(session_hash)
+    if (
+        file_paths
+        and s3_output_folder
+        and s3_outputs_upload_ready(save_outputs_to_s3=save_outputs_to_s3)
+    ):
+        export_outputs_to_s3(
+            file_paths,
+            s3_output_folder,
+            save_outputs_to_s3,
+            base_file,
+        )
+def _clone_history(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    return [{"role": item["role"], "content": item["content"]} for item in history]
+def _truncate_thinking(text: str, limit: int = THINKING_DISPLAY_MAX) -> str:
+    if len(text) <= limit:
+        return text
+    hidden = len(text) - limit
+    return f"… [{hidden:,} earlier chars hidden]\n\n{text[-limit:]}"
+def _assistant_display_text(completed_segments: list[str], current: str) -> str:
+    parts = [segment.strip() for segment in completed_segments if segment.strip()]
+    if current.strip():
+        parts.append(current.strip())
+    return "\n\n".join(parts)
+def _finalize_assistant_chat(
+    client: PiRpcClient,
+    history: list[dict[str, Any]],
+    *,
+    completed_segments: list[str],
+    streaming_text: str,
+    activity: list[str],
+) -> None:
+    """Fill an empty assistant bubble after tool-only Gemini turns."""
+    if not history or history[-1].get("role") != "assistant":
+        return
+    if _assistant_display_text(completed_segments, streaming_text).strip():
+        history[-1]["content"] = _assistant_display_text(
+            completed_segments, streaming_text
+        )
+        return
+    if history[-1].get("content", "").strip():
+        return
+    try:
+        fallback = assistant_text_since_last_user(client.get_messages())
+    except PiRpcError:
+        fallback = ""
+    if fallback.strip():
+        history[-1]["content"] = fallback
+        return
+    if activity:
+        history[-1]["content"] = (
+            "_This run completed using tools only (no assistant prose was streamed). "
+            "See **Thinking log** for step-by-step activity._"
+        )
+def _gemini_key_error() -> str | None:
+    if IS_HF_SPACE and not gemini_api_key_configured():
+        return (
+            "**Gemini API key required.** Paste your key in **Agent backend** and click "
+            "**Apply backend** before chatting or starting a redaction task."
+        )
+    return None
+def _ensure_client(
+    client: PiRpcClient | None,
+    session_hash: str = "",
+) -> PiRpcClient:
+    key_error = _gemini_key_error()
+    if key_error:
+        raise PiRpcError(key_error)
+    if isinstance(client, PiRpcClient) and client.running:
+        return client
+    client = default_client(session_hash or None)
+    client.start()
+    provider = normalize_provider(get_default_provider())
+    model = resolved_default_model(provider)
+    try:
+        client.set_model(provider, model)
+    except PiRpcError:
+        pass
+    return client
+def _coerce_client(client: Any) -> PiRpcClient | None:
+    return client if isinstance(client, PiRpcClient) else None
+def _truncate(text: str, limit: int = TOOL_OUTPUT_MAX) -> str:
+    if len(text) <= limit:
+        return text
+    return text[: limit - 40] + f"\n\n… [{len(text) - limit + 40} chars truncated]"
+def _format_activity(lines: list[str]) -> str:
+    if not lines:
+        return "_No activity yet._"
+    return "\n".join(f"- {line}" for line in lines[-ACTIVITY_MAX_LINES:])
+def _append_activity(lines: list[str], text: str) -> list[str]:
+    text = text.strip()
+    if text:
+        lines.append(text)
+    return lines
+def _append_chat_segment(
+    completed_segments: list[str],
+    streaming_text: str,
+    segment: str,
+) -> tuple[list[str], str]:
+    """Append a new visible chat segment (tool line or prose), preserving prior segments."""
+    segment = segment.strip()
+    if not segment:
+        return completed_segments, streaming_text
+    if streaming_text.strip():
+        completed_segments = completed_segments + [streaming_text.strip()]
+        streaming_text = ""
+    if not completed_segments or completed_segments[-1] != segment:
+        completed_segments = completed_segments + [segment]
+    return completed_segments, streaming_text
+def _apply_event(
+    event: PiStreamEvent,
+    *,
+    history: list[dict[str, Any]],
+    activity: list[str],
+    thinking: str,
+    tool_output: str,
+    tool_heading: str,
+    completed_segments: list[str],
+    streaming_text: str,
+) -> tuple[list[dict[str, Any]], list[str], str, str, str, list[str], str]:
+    if event.kind == "text_snapshot":
+        if event.text.strip().startswith("**") and ":" in event.text.split("\n", 1)[0]:
+            completed_segments, streaming_text = _append_chat_segment(
+                completed_segments, streaming_text, event.text
+            )
+        else:
+            streaming_text = event.text
+        history[-1]["content"] = _assistant_display_text(
+            completed_segments, streaming_text
+        )
+    elif event.kind == "text_delta":
+        streaming_text += event.text
+        history[-1]["content"] = _assistant_display_text(
+            completed_segments, streaming_text
+        )
+    elif event.kind == "thinking_snapshot":
+        if SHOW_THINKING:
+            thinking = event.text
+    elif event.kind == "thinking_delta":
+        if SHOW_THINKING:
+            thinking += event.text
+    elif event.kind == "status":
+        activity = _append_activity(activity, event.text)
+    elif event.kind == "turn_end":
+        activity = _append_activity(activity, event.text)
+    elif event.kind == "tool_start":
+        if streaming_text.strip():
+            completed_segments.append(streaming_text.strip())
+            streaming_text = ""
+        label = event.tool_name or "tool"
+        detail = event.text or label
+        tool_line = f"**{label}:** {detail}" if detail != label else f"**{label}**"
+        completed_segments, streaming_text = _append_chat_segment(
+            completed_segments, streaming_text, tool_line
+        )
+        history[-1]["content"] = _assistant_display_text(
+            completed_segments, streaming_text
+        )
+        activity = _append_activity(activity, f"**Tool start:** `{label}` — {detail}")
+        tool_heading = f"### {label}\n{detail}\n\n```\n"
+        tool_output = ""
+    elif event.kind in {"tool_update", "tool_end"} and SHOW_TOOL_OUTPUT:
+        if event.tool_output is not None:
+            tool_output = _truncate(event.tool_output)
+        if event.kind == "tool_end":
+            status = "failed" if event.is_error else "completed"
+            activity = _append_activity(
+                activity,
+                f"**Tool {status}:** `{event.tool_name or 'tool'}`",
+            )
+    elif event.kind == "error":
+        activity = _append_activity(activity, f"**Error:** {event.text}")
+        history[-1]["content"] = _assistant_display_text(
+            completed_segments,
+            streaming_text,
+        )
+        history[-1]["content"] += f"\n\n**Error:** {event.text}"
+    elif event.kind == "done":
+        if streaming_text.strip():
+            completed_segments.append(streaming_text)
+            streaming_text = ""
+        aborted = event.text.strip().lower().startswith("agent aborted")
+        history, completed_segments, streaming_text = _append_agent_finish_notice(
+            history,
+            completed_segments,
+            streaming_text,
+            aborted=aborted,
+        )
+        activity = _append_activity(activity, event.text)
+    return (
+        history,
+        activity,
+        thinking,
+        tool_output,
+        tool_heading,
+        completed_segments,
+        streaming_text,
+    )
+def _format_tool_panel(heading: str, body: str) -> str:
+    if not heading and not body:
+        return ""
+    if heading.endswith("```\n") and body:
+        return f"{heading}{body}\n```"
+    if heading and not body:
+        return heading.rstrip("`") + "…`\n```" if heading.endswith("```\n") else heading
+    return heading + body
+def _pi_agent_model_label(client: PiRpcClient | None) -> str:
+    """Active Pi orchestration model, or configured defaults before Apply backend."""
+    if client is not None and client.running:
+        try:
+            state = client.get_state()
+            model = state.get("model") or {}
+            provider = str(model.get("provider") or state.get("provider") or "")
+            model_label = str(model.get("id") or model.get("name") or "")
+            if provider and model_label:
+                return f"{provider_label(provider)} / {model_label}"
+            return model_label or provider or "—"
+        except PiRpcError:
+            pass
+    provider = normalize_provider(get_default_provider())
+    model = resolved_default_model(provider)
+    return f"{provider_label(provider)} / {model} (default until backend applied)"
+def _agent_status_markdown(client: PiRpcClient | None = None) -> str:
+    """Redaction backend URL, Pi model, and credentials — shown at top of the UI."""
+    from redaction_prompt import doc_redaction_gradio_url
+    lines = [
+        f"**Redaction backend:** `{doc_redaction_gradio_url()}`",
+        f"**Pi agent model:** `{_pi_agent_model_label(client)}`",
+    ]
+    if client is None or not client.running:
+        lines.insert(0, "**Status:** Ready")
+        lines.append("")
+        lines.append(
+            "_Set `DOC_REDACTION_GRADIO_URL` in `config/pi_agent.env` if the doc_redaction "
+            "app is not at the URL above. Apply **Agent backend** to start Pi._"
+        )
+    else:
+        lines.insert(0, "**Status:** Pi agent connected")
+    lines.append("")
+    lines.append(credential_status_markdown())
+    return "  \n".join(lines)
+def _session_summary(client: PiRpcClient) -> str:
+    try:
+        state = client.get_state()
+    except PiRpcError as exc:
+        return f"{_agent_status_markdown(client)}  \n\n_Could not read Pi state: {exc}_"
+    session_file = state.get("sessionFile") or "—"
+    streaming = state.get("isStreaming")
+    compacting = state.get("isCompacting")
+    return (
+        f"{_agent_status_markdown(client)}  \n\n"
+        f"**Streaming:** `{streaming}` · **Compacting:** `{compacting}`  \n"
+        f"**Session log:** `{session_file}`"
+    )
+def _backend_model_choices_update(provider: str):
+    normalized = normalize_provider(provider)
+    models = models_for_provider(normalized)
+    return gr.update(choices=models, value=default_model_for_provider(normalized))
+def apply_backend(
+    provider: str,
+    model_id: str,
+    gemini_api_key: str,
+    hf_token: str,
+    aws_region: str,
+    aws_access_key_id: str,
+    aws_secret_access_key: str,
+    aws_session_token: str,
+    client: PiRpcClient | None,
+    session_hash: str,
+):
+    normalized = normalize_provider(provider)
+    model = (model_id or default_model_for_provider(normalized)).strip()
+    if model not in models_for_provider(normalized):
+        model = default_model_for_provider(normalized)
+    apply_session_credentials(
+        gemini_api_key=gemini_api_key or None,
+        hf_token=hf_token or None,
+        aws_region=aws_region or None,
+        aws_access_key_id=aws_access_key_id or None,
+        aws_secret_access_key=aws_secret_access_key or None,
+        aws_session_token=aws_session_token or None,
+    )
+    if hf_token and hf_token.strip():
+        os.environ["_HF_TOKEN_FROM_UI"] = "1"
+    write_runtime_config(default_provider=normalized, default_model=model)
+    existing = _coerce_client(client)
+    if existing is not None:
+        existing.close()
+    key_error = _gemini_key_error()
+    if key_error:
+        return (
+            None,
+            key_error,
+            gr.update(value=""),
+            gr.update(value=""),
+            gr.update(value=""),
+            gr.update(value=""),
+        )
+    rpc = default_client(session_hash or None)
+    try:
+        rpc.start()
+        rpc.set_model(normalized, model)
+        rpc.new_session()
+        summary = (
+            f"**Backend applied:** `{provider_label(normalized)}` / `{model}`  \n\n"
+            f"{_session_summary(rpc)}"
+        )
+    except (PiRpcError, FileNotFoundError, OSError) as exc:
+        rpc.close()
+        rpc = None
+        summary = f"**Backend error:** {exc}  \n\n{credential_status_markdown()}"
+    return (
+        rpc,
+        summary,
+        gr.update(value=""),
+        gr.update(value=""),
+        gr.update(value=""),
+        gr.update(value=""),
+    )
+def _init_session_ui(
+    request: gr.Request,
+) -> tuple[str, Any, str, list[str] | None, str]:
+    session_hash, explorer, status, s3_prefix = init_session_workspace(request)
+    log_platform_access(session_hash, HOST_NAME)
+    return (
+        session_hash,
+        explorer,
+        status,
+        collect_final_output_files(session_hash),
+        s3_prefix,
+    )
+def _chat_yield(
+    history: list[dict[str, Any]],
+    client: PiRpcClient,
+    activity: list[str],
+    thinking: str,
+    tool_heading: str,
+    tool_output: str,
+    *,
+    msg: str = "",
+    send_enabled: bool = True,
+    abort_enabled: bool = False,
+    redact_enabled: bool = True,
+    session_info: str | None = None,
+    session_hash: str = "",
+    refresh_final_files: bool = False,
+    agent_finish_signal: str = AGENT_FINISH_SIGNAL_NONE,
+):
+    final_files: list[str] | None | dict[str, Any]
+    session_log: str | None | dict[str, Any]
+    if refresh_final_files:
+        final_files = collect_final_output_files(session_hash)
+        session_log = collect_session_log_download(client)
+    else:
+        final_files = gr.update()
+        session_log = gr.update()
+    return (
+        _clone_history(history),
+        client,
+        msg,
+        _format_activity(activity),
+        _format_tool_panel(tool_heading, tool_output),
+        _truncate_thinking(thinking),
+        session_info if session_info is not None else _session_summary(client),
+        gr.update(interactive=send_enabled),
+        gr.update(interactive=abort_enabled),
+        gr.update(interactive=redact_enabled),
+        final_files,
+        session_log,
+        agent_finish_signal,
+    )
+def _run_pi_chat(
+    message: str,
+    history: list[dict[str, Any]] | None,
+    client: PiRpcClient | None,
+    *,
+    chat_user_message: str | None = None,
+    session_hash: str = "",
+    initial_session_info: str | None = None,
+    s3_output_folder: str = "",
+    save_outputs_to_s3: bool = False,
+    document_name: str = "",
+    base_file: str | None = None,
+    ocr_method: str = "",
+    pii_method: str = "",
+    total_page_count: int = 0,
+    vlm_model_name: str | None = None,
+    redact_file: str | None = None,
+):
+    if not message or not message.strip():
+        client = client if client and client.running else None
+        hint_activity = [EMPTY_SEND_WITH_FILE_HINT] if redact_file else []
+        if client:
+            yield _chat_yield(
+                history or [],
+                client,
+                hint_activity,
+                "",
+                "",
+                "",
+                session_hash=session_hash,
+            )
+        else:
+            activity_text = (
+                _format_activity(hint_activity)
+                if hint_activity
+                else "_No activity yet._"
+            )
+            yield (
+                history or [],
+                None,
+                "",
+                activity_text,
+                "",
+                "",
+                "_Ready._",
+                gr.update(interactive=True),
+                gr.update(interactive=False),
+                gr.update(interactive=True),
+                gr.update(),
+                gr.update(),
+                AGENT_FINISH_SIGNAL_NONE,
+            )
+        return
+    history = list(history or [])
+    client = _ensure_client(client, session_hash)
+    activity: list[str] = []
+    thinking = ""
+    tool_output = ""
+    tool_heading = ""
+    completed_segments: list[str] = []
+    streaming_text = ""
+    task_started_at = time.time()
+    usage_baseline = resolve_session_token_usage(client)
+    def _complete_pi_task() -> None:
+        usage = usage_for_completed_turn(client, usage_baseline)
+        _after_pi_task(
+            session_hash=session_hash,
+            client=client,
+            s3_output_folder=s3_output_folder,
+            save_outputs_to_s3=save_outputs_to_s3,
+            document_name=document_name,
+            started_at=task_started_at,
+            base_file=base_file,
+            ocr_method=ocr_method,
+            pii_method=pii_method,
+            total_page_count=total_page_count,
+            vlm_model_name=vlm_model_name,
+            llm_input_tokens=usage.llm_input_tokens,
+            llm_output_tokens=usage.llm_output_tokens,
+        )
+    history.append({"role": "user", "content": chat_user_message or message.strip()})
+    history.append({"role": "assistant", "content": ""})
+    activity = _append_activity(activity, "Prompt sent.")
+    if initial_session_info:
+        activity = _append_activity(
+            activity,
+            f"Using workspace `{session_workspace_dir(session_hash).as_posix()}/`.",
+        )
+    session_info = _session_summary(client)
+    if initial_session_info:
+        session_info = f"{initial_session_info}\n\n{session_info}"
+    yield _chat_yield(
+        history,
+        client,
+        activity,
+        thinking,
+        tool_heading,
+        tool_output,
+        send_enabled=False,
+        abort_enabled=True,
+        redact_enabled=False,
+        session_info=session_info,
+        session_hash=session_hash,
+    )
+    from pi_workspace_skills import workspace_boundary_prefix
+    pi_message = (
+        workspace_boundary_prefix(session_hash)
+        + workspace_context_prefix(session_hash)
+        + message.strip()
+    )
+    prompt_to_send = pi_message
+    quota_failures = 0
+    finish_aborted = False
+    try:
+        while True:
+            turn_error: str | None = None
+            try:
+                for event in client.prompt_events(prompt_to_send):
+                    if event.kind == "done":
+                        finish_aborted = (
+                            event.text.strip().lower().startswith("agent aborted")
+                        )
+                    (
+                        history,
+                        activity,
+                        thinking,
+                        tool_output,
+                        tool_heading,
+                        completed_segments,
+                        streaming_text,
+                    ) = _apply_event(
+                        event,
+                        history=history,
+                        activity=activity,
+                        thinking=thinking,
+                        tool_output=tool_output,
+                        tool_heading=tool_heading,
+                        completed_segments=completed_segments,
+                        streaming_text=streaming_text,
+                    )
+                    yield _chat_yield(
+                        history,
+                        client,
+                        activity,
+                        thinking,
+                        tool_heading,
+                        tool_output,
+                        send_enabled=False,
+                        abort_enabled=True,
+                        redact_enabled=False,
+                        session_info=session_info,
+                        session_hash=session_hash,
+                    )
+                turn_error = last_assistant_turn_error(client.get_messages())
+            except PiRpcError as exc:
+                if not is_rate_limit_error(str(exc)):
+                    raise
+                turn_error = str(exc)
+            if turn_error and is_rate_limit_error(turn_error):
+                quota_failures += 1
+                if quota_failures >= QUOTA_RETRY_ATTEMPTS:
+                    err_summary = turn_error[:500].replace("\n", " ")
+                    history[-1]["content"] = (
+                        f"**Gemini rate limit / quota:** stopped after "
+                        f"{QUOTA_RETRY_ATTEMPTS} consecutive attempts.\n\n"
+                        f"{err_summary}"
+                    )
+                    activity = _append_activity(
+                        activity,
+                        f"**Quota retries exhausted** ({QUOTA_RETRY_ATTEMPTS} attempts).",
+                    )
+                    history, completed_segments, streaming_text = (
+                        _append_agent_finish_notice(
+                            history,
+                            completed_segments,
+                            streaming_text,
+                            error=True,
+                        )
+                    )
+                    _complete_pi_task()
+                    finish_signal = _notify_agent_finished(error=True)
+                    yield _chat_yield(
+                        history,
+                        client,
+                        activity,
+                        thinking,
+                        tool_heading,
+                        tool_output,
+                        send_enabled=True,
+                        abort_enabled=False,
+                        redact_enabled=True,
+                        session_info=_session_summary(client),
+                        session_hash=session_hash,
+                        refresh_final_files=True,
+                        agent_finish_signal=finish_signal,
+                    )
+                    return
+                activity = _append_activity(
+                    activity,
+                    (
+                        f"Gemini rate limit — waiting {QUOTA_RETRY_DELAY_S}s before "
+                        f"retry {quota_failures}/{QUOTA_RETRY_ATTEMPTS}…"
+                    ),
+                )
+                yield _chat_yield(
+                    history,
+                    client,
+                    activity,
+                    thinking,
+                    tool_heading,
+                    tool_output,
+                    send_enabled=False,
+                    abort_enabled=True,
+                    redact_enabled=False,
+                    session_info=session_info,
+                    session_hash=session_hash,
+                )
+                time.sleep(QUOTA_RETRY_DELAY_S)
+                prompt_to_send = QUOTA_CONTINUE_PROMPT
+                history.append({"role": "assistant", "content": ""})
+                completed_segments = []
+                streaming_text = ""
+                continue
+            break
+    except PiRpcError as exc:
+        history[-1]["content"] = f"**Pi error:** {exc}"
+        activity = _append_activity(activity, f"**Pi error:** {exc}")
+        history, completed_segments, streaming_text = _append_agent_finish_notice(
+            history,
+            completed_segments,
+            streaming_text,
+            error=True,
+        )
+        _complete_pi_task()
+        finish_signal = _notify_agent_finished(error=True)
+        yield _chat_yield(
+            history,
+            client,
+            activity,
+            thinking,
+            tool_heading,
+            tool_output,
+            send_enabled=True,
+            abort_enabled=False,
+            redact_enabled=True,
+            session_info=_session_summary(client),
+            session_hash=session_hash,
+            refresh_final_files=True,
+            agent_finish_signal=finish_signal,
+        )
+        return
+    except Exception:
+        if client.abort_requested:
+            activity = _append_activity(activity, "**Aborted.**")
+            history, completed_segments, streaming_text = _append_agent_finish_notice(
+                history,
+                completed_segments,
+                streaming_text,
+                aborted=True,
+            )
+            _complete_pi_task()
+            finish_signal = _notify_agent_finished(aborted=True)
+            yield _chat_yield(
+                history,
+                client,
+                activity,
+                thinking,
+                tool_heading,
+                tool_output,
+                send_enabled=True,
+                abort_enabled=False,
+                redact_enabled=True,
+                session_info=_session_summary(client),
+                session_hash=session_hash,
+                refresh_final_files=True,
+                agent_finish_signal=finish_signal,
+            )
+            return
+        raise
+    _finalize_assistant_chat(
+        client,
+        history,
+        completed_segments=completed_segments,
+        streaming_text=streaming_text,
+        activity=activity,
+    )
+    _complete_pi_task()
+    finish_signal = _notify_agent_finished(aborted=finish_aborted)
+    yield _chat_yield(
+        history,
+        client,
+        activity,
+        thinking,
+        tool_heading,
+        tool_output,
+        send_enabled=True,
+        abort_enabled=False,
+        redact_enabled=True,
+        session_info=_session_summary(client),
+        session_hash=session_hash,
+        refresh_final_files=True,
+        agent_finish_signal=finish_signal,
+    )
+def chat_respond(
+    message: str,
+    history: list[dict[str, Any]] | None,
+    client: PiRpcClient | None,
+    session_hash: str,
+    s3_output_folder: str,
+    save_outputs_to_s3: bool,
+    redact_file: str | None,
+):
+    yield from _run_pi_chat(
+        message,
+        history,
+        client,
+        session_hash=session_hash,
+        s3_output_folder=s3_output_folder,
+        save_outputs_to_s3=save_outputs_to_s3,
+        redact_file=redact_file,
+    )
+def _redaction_page_count(upload_file: str | None, page_range: str) -> int:
+    if not upload_file or not str(upload_file).lower().endswith(".pdf"):
+        return 0
+    try:
+        total = pdf_page_count(upload_file)
+        return pages_to_process_count(page_range or "all", total)
+    except (ValueError, OSError):
+        return 0
+def prepare_redaction_session_ui(
+    session_hash: str,
+    request: gr.Request,
+) -> tuple[str, str]:
+    """Create session workspace folder before redaction runs (updates UI immediately)."""
+    effective, _workspace, status = prepare_session_workspace(session_hash, request)
+    return effective, status
+def submit_redaction_task(
+    upload_file: str | None,
+    user_instructions: str,
+    page_range: str,
+    ocr_method: str,
+    pii_method: str,
+    encourage_vlm_faces: bool,
+    encourage_vlm_signatures: bool,
+    history: list[dict[str, Any]] | None,
+    client: PiRpcClient | None,
+    session_hash: str,
+    s3_output_folder: str,
+    save_outputs_to_s3: bool,
+    request: gr.Request,
+):
+    session_hash, _workspace_path, workspace_status = prepare_session_workspace(
+        session_hash, request
+    )
+    settings = (
+        RedactionTaskSettings.hf_space_defaults()
+        if IS_HF_SPACE
+        else RedactionTaskSettings.from_ui(
+            ocr_method,
+            pii_method,
+            encourage_vlm_faces,
+            encourage_vlm_signatures,
+        )
+    )
+    try:
+        _file_name, prompt, renamed_from = prepare_redaction_task(
+            upload_file,
+            user_instructions,
+            page_range=page_range or "all",
+            settings=settings,
+            workspace_dir=_workspace_path,
+        )
+    except (ValueError, FileNotFoundError, OSError) as exc:
+        history = list(history or [])
+        history.append(
+            {"role": "user", "content": f"_Redaction task not started: {exc}_"}
+        )
+        client = (
+            _ensure_client(client, session_hash)
+            if client and client.running
+            else client
+        )
+        yield (
+            _clone_history(history),
+            client,
+            "",
+            _format_activity([f"**Redaction task error:** {exc}"]),
+            "",
+            "",
+            (
+                _session_summary(client)
+                if client and client.running
+                else _agent_status_markdown(client)
+            ),
+            gr.update(interactive=True),
+            gr.update(interactive=False),
+            gr.update(interactive=True),
+            gr.update(),
+            gr.update(),
+            AGENT_FINISH_SIGNAL_NONE,
+        )
+        return
+    page_count = _redaction_page_count(upload_file, page_range or "all")
+    chat_summary = (
+        f"**Redaction task:** `{_file_name}`  \n"
+        f"**Page range:** `{page_range or 'all'}`  \n"
+        f"**OCR / text extraction:** `{settings.ocr_method}`  \n"
+        f"**PII model:** `{settings.pii_method}`  \n"
+        f"**VLM faces guidance:** {'on' if settings.encourage_vlm_faces else 'off'}  \n"
+        f"**VLM signature guidance:** {'on' if settings.encourage_vlm_signatures else 'off'}\n\n"
+        f"{user_instructions.strip()}"
+    )
+    if renamed_from:
+        chat_summary = (
+            f"_Your uploaded file `{renamed_from}` was saved as `{_file_name}` for this "
+            f"task because the original name contained characters that are unsafe for "
+            f"file paths._\n\n{chat_summary}"
+        )
+    yield from _run_pi_chat(
+        prompt,
+        history,
+        client,
+        chat_user_message=chat_summary,
+        session_hash=session_hash,
+        initial_session_info=workspace_status,
+        s3_output_folder=s3_output_folder,
+        save_outputs_to_s3=save_outputs_to_s3,
+        document_name=_file_name,
+        base_file=upload_file,
+        ocr_method=settings.ocr_method,
+        pii_method=settings.pii_method,
+        total_page_count=page_count,
+        vlm_model_name=os.environ.get("PI_VLM_MODEL"),
+    )
+def abort_agent(client: PiRpcClient | None):
+    rpc = _coerce_client(client)
+    if rpc is not None and rpc.running:
+        try:
+            rpc.abort()
+        except (PiRpcError, OSError, ValueError):
+            pass
+    return (
+        gr.update(interactive=True),
+        gr.update(interactive=False),
+        gr.update(interactive=True),
+    )
+def new_chat(
+    _history,
+    client: PiRpcClient | None,
+    session_hash: str,
+):
+    if client is not None:
+        try:
+            client.new_session()
+        except PiRpcError:
+            client.close()
+            client = default_client(session_hash or None)
+            client.start()
+    else:
+        client = default_client(session_hash or None)
+        client.start()
+    return _chat_yield(
+        [],
+        client,
+        ["New session."],
+        "",
+        "",
+        "",
+        session_hash=session_hash,
+        refresh_final_files=True,
+    )
+def _startup_session_info() -> str:
+    if IS_HF_SPACE:
+        return (
+            "**Hugging Face Space profile** — Gemini orchestration with remote Document Redaction App "
+            "backend.  \n\n"
+            "1. Paste your **Gemini API key** (and optional **HF token** for a private "
+            "redaction Space).  \n"
+            "2. Click **Apply backend**.  \n\n"
+            f"{_agent_status_markdown(None)}"
+        )
+    return _agent_status_markdown(None)
+def build_ui():
+    hf_redaction_blurb = (
+        "Upload a document and add bullet-point requirements. Redaction runs on a **remote** "
+        "Redaction App Hugging Face Space.  \n"
+        "When ready, use **Start redaction task** under the chat panel to the right."
+        if IS_HF_SPACE
+        else (
+            "Upload a PDF (or other supported document). Add bullet-point instructions for redaction below. \n"
+            "When ready, use **Start redaction task** under the chat panel to the right."
+        )
+    )
+    backend_blurb = (
+        "Gemini powers the Pi agent on this Space. Paste your **Gemini API key** "
+        "(session-only, not stored on disk). Optionally override the **HF token** used "
+        "to reach the private redaction backend."
+        if IS_HF_SPACE
+        else (
+            "Choose which LLM powers the Pi agent (chat and redaction orchestration). "
+            "Credentials from the UI apply **for this container session only**; "
+            "defaults can be set via `config/pi_agent.env` or compose environment."
+        )
+    )
+    hf_locked_settings_md = (
+        f"**Locked defaults (HF Space):**  \n"
+        f"- Text extraction: `{DEFAULT_OCR_METHOD}`  \n"
+        f"- PII model: `{DEFAULT_PII_METHOD}`  \n"
+        f"- Face/signature VLM: unavailable"
+        if IS_HF_SPACE
+        else ""
+    )
+    with gr.Blocks(
+        title=PI_UI_TITLE,
+        fill_height=True,
+    ) as demo:
+        gr.Markdown(PI_INTRO_TEXT)
+        client_state = gr.State(None)
+        session_hash_state = gr.State("")
+        s3_output_folder_state = gr.State("")
+        save_outputs_to_s3_state = gr.State(SAVE_OUTPUTS_TO_S3)
+        with gr.Accordion("View session info", open=False):
+            session_info = gr.Markdown(_startup_session_info())
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=2):
+                with gr.Accordion("Redaction task", open=True):
+                    gr.Markdown(hf_redaction_blurb)
+                    pi_example_rows, pi_example_labels = example_rows()
+                    redact_file = gr.File(
+                        label="Document to redact",
+                        file_types=[
+                            ".pdf",
+                            ".png",
+                            ".jpg",
+                            ".jpeg",
+                            ".docx",
+                            ".csv",
+                            ".xlsx",
+                        ],
+                        type="filepath",
+                        render=False,
+                    )
+                    redact_instructions = gr.Textbox(
+                        label="Redaction requirements",
+                        placeholder=(
+                            "- Redact all personal names\n"
+                            "- Remove organisation addresses\n"
+                            "- Keep publication titles visible"
+                        ),
+                        lines=8,
+                        render=False,
+                    )
+                    page_range = gr.Textbox(
+                        label="Page range",
+                        value="all",
+                        placeholder="all or e.g. 1-56",
+                        render=False,
+                    )
+                    if IS_HF_SPACE:
+                        ocr_method = gr.State(DEFAULT_OCR_METHOD)
+                        pii_method = gr.State(DEFAULT_PII_METHOD)
+                        encourage_vlm_faces = gr.State(False)
+                        encourage_vlm_signatures = gr.State(False)
+                        settings_accordion = None
+                    else:
+                        settings_accordion = gr.Accordion(
+                            "Redaction settings (prompt defaults)",
+                            open=False,
+                            render=False,
+                        )
+                        with settings_accordion:
+                            gr.Markdown(
+                                "These values are injected into the task prompt under "
+                                "**Technical constraints** — they suggest defaults to Pi for "
+                                "`/doc_redact`, not hard-coded app settings."
+                            )
+                            ocr_method = gr.Dropdown(
+                                label="Default text extraction method",
+                                choices=list(OCR_METHOD_CHOICES),
+                                value=DEFAULT_OCR_METHOD,
+                                allow_custom_value=True,
+                            )
+                            pii_method = gr.Dropdown(
+                                label="Default PII identification model",
+                                choices=list(PII_METHOD_CHOICES),
+                                value=DEFAULT_PII_METHOD,
+                                allow_custom_value=True,
+                            )
+                            encourage_vlm_faces = gr.Checkbox(
+                                label="Encourage CUSTOM_VLM_FACES when user asks to redact faces",
+                                value=True,
+                            )
+                            encourage_vlm_signatures = gr.Checkbox(
+                                label=(
+                                    "Encourage CUSTOM_VLM_SIGNATURE when user asks "
+                                    "to redact signatures"
+                                ),
+                                value=True,
+                            )
+                    if pi_example_rows:
+                        gr.Markdown(
+                            "### Try an example\n"
+                            "Click a row to load the sample PDF and redaction instructions, "
+                            "then **Start redaction task** under the chat panel to the right."
+                        )
+                        gr.Examples(
+                            examples=pi_example_rows,
+                            inputs=[
+                                redact_file,
+                                redact_instructions,
+                                page_range,
+                                ocr_method,
+                                pii_method,
+                                encourage_vlm_faces,
+                                encourage_vlm_signatures,
+                            ],
+                            example_labels=pi_example_labels,
+                            examples_per_page=2,
+                            cache_examples=False,
+                        )
+                    else:
+                        gr.Markdown(examples_status_markdown())
+                    redact_file.render()
+                    redact_instructions.render()
+                    page_range.render()
+                    if IS_HF_SPACE:
+                        gr.Markdown(hf_locked_settings_md)
+                    elif settings_accordion is not None:
+                        settings_accordion.render()
+                    with gr.Accordion("Agent backend/API keys", open=IS_HF_SPACE):
+                        gr.Markdown(backend_blurb)
+                        backend_provider = gr.Radio(
+                            label="Provider",
+                            choices=[
+                                (provider_label(key), key) for key in provider_choices()
+                            ],
+                            value=get_default_provider(),
+                        )
+                        backend_model = gr.Dropdown(
+                            label="Model",
+                            choices=models_for_provider(get_default_provider()),
+                            value=default_model_for_provider(get_default_provider()),
+                            allow_custom_value=True,
+                        )
+                        gemini_api_key = gr.Textbox(
+                            label=(
+                                "Gemini API key (required on HF Space)"
+                                if IS_HF_SPACE
+                                else "Gemini API key (session override)"
+                            ),
+                            type="password",
+                            placeholder=(
+                                "Required — get a key from Google AI Studio"
+                                if IS_HF_SPACE
+                                else "Uses GEMINI_API_KEY / GOOGLE_API_KEY from env if empty"
+                            ),
+                        )
+                        hf_token = gr.Textbox(
+                            label="HF token for redaction Space (session override)",
+                            type="password",
+                            placeholder="Uses HF_TOKEN Space secret if empty",
+                            visible=IS_HF_SPACE,
+                        )
+                        with gr.Accordion("AWS credentials (optional)", open=False):
+                            aws_region = gr.Textbox(
+                                label="AWS region (session override)",
+                                placeholder="e.g. eu-west-2",
+                                visible=not IS_HF_SPACE,
+                            )
+                            aws_access_key_id = gr.Textbox(
+                                label="AWS access key ID (session override)",
+                                type="password",
+                                visible=not IS_HF_SPACE,
+                            )
+                            aws_secret_access_key = gr.Textbox(
+                                label="AWS secret access key (session override)",
+                                type="password",
+                                visible=not IS_HF_SPACE,
+                            )
+                            aws_session_token = gr.Textbox(
+                                label="AWS session token (optional)",
+                                type="password",
+                                visible=False,  # not IS_HF_SPACE,
+                            )
+                        apply_backend_btn = gr.Button(
+                            "Apply backend",
+                            variant="primary",
+                        )
+            with gr.Column(scale=3):
+                chatbot = gr.Chatbot(label="Task progress", height=480)
+                with gr.Row():
+                    start_redact_btn = gr.Button(
+                        "Start redaction task",
+                        variant="primary",
+                    )
+                    abort_btn = gr.Button("Abort", variant="stop", interactive=False)
+                clear = gr.Button("New session")
+                with gr.Accordion("Follow-up chat (optional)", open=False):
+                    msg = gr.Textbox(
+                        label="Message",
+                        placeholder=(
+                            "Optional message after a redaction task (e.g. fix page 3)"
+                        ),
+                        lines=3,
+                    )
+                    send = gr.Button("Send follow-up", variant="secondary")
+                with gr.Accordion("Thinking log", open=False):
+                    activity_log = gr.Markdown(
+                        value="_No activity yet._", max_height=480, height=480
+                    )
+                    tool_panel = gr.Markdown(value="", max_height=480, height=480)
+                    thinking_panel = gr.Textbox(
+                        label="Thinking (stream)",
+                        lines=12,
+                        max_lines=50,
+                        interactive=False,
+                        visible=SHOW_THINKING,
+                        elem_classes=["thinking-panel"],
+                        autoscroll=True,
+                    )
+        with gr.Accordion("Workspace output files", open=True):
+            workspace_session_info = gr.Markdown(
+                "_Loading your session workspace…_",
+            )
+            gr.Markdown(
+                "**Final outputs** will appear below. "
+                "Downloads below are available in your session's `output_final_download/` folder."
+                "Use the file explorer below to browse or download other workspace files."
+            )
+            workspace_output_download = gr.File(
+                label="Final deliverables (download)",
+                file_count="multiple",
+                file_types=[
+                    ".pdf",
+                    ".jpg",
+                    ".jpeg",
+                    ".png",
+                    ".csv",
+                    ".xlsx",
+                    ".xls",
+                    ".txt",
+                    ".doc",
+                    ".docx",
+                    ".json",
+                    ".zip",
+                ],
+                interactive=False,
+                height=200,
+            )
+            refresh_outputs_btn = gr.Button(
+                "Refresh workspace files",
+                variant="secondary",
+            )
+            workspace_output_explorer = gr.FileExplorer(
+                root_dir=str(workspace_base_dir()),
+                label="Browse session workspace",
+                file_count="multiple",
+                interactive=True,
+                max_height=400,
+            )
+        with gr.Accordion("Session log outputs", open=False):
+            gr.Markdown(
+                "Pi writes a **JSONL** transcript for the active agent session under "
+                "its `sessions/` directory. The file refreshes after each chat message "
+                "or redaction task completes."
+            )
+            session_log_download = gr.File(
+                label="Pi session log (JSONL)",
+                file_count="single",
+                file_types=[".jsonl"],
+                interactive=False,
+            )
+            agent_finish_signal = gr.State(AGENT_FINISH_SIGNAL_NONE)
+        chat_outputs = [
+            chatbot,
+            client_state,
+            msg,
+            activity_log,
+            tool_panel,
+            thinking_panel,
+            session_info,
+            send,
+            abort_btn,
+            start_redact_btn,
+            workspace_output_download,
+            session_log_download,
+            agent_finish_signal,
+        ]
+        run_chat_send = send.click(
+            chat_respond,
+            inputs=[
+                msg,
+                chatbot,
+                client_state,
+                session_hash_state,
+                s3_output_folder_state,
+                save_outputs_to_s3_state,
+                redact_file,
+            ],
+            outputs=chat_outputs,
+        )
+        run_chat_send.then(
+            _passthrough_chat_outputs,
+            outputs=chat_outputs,
+            js=PI_AGENT_FINISH_NOTIFY_JS,
+        )
+        run_chat_msg = msg.submit(
+            chat_respond,
+            inputs=[
+                msg,
+                chatbot,
+                client_state,
+                session_hash_state,
+                s3_output_folder_state,
+                save_outputs_to_s3_state,
+                redact_file,
+            ],
+            outputs=chat_outputs,
+        )
+        run_chat_msg.then(
+            _passthrough_chat_outputs,
+            outputs=chat_outputs,
+            js=PI_AGENT_FINISH_NOTIFY_JS,
+        )
+        run_redact_prepare = start_redact_btn.click(
+            prepare_redaction_session_ui,
+            inputs=[session_hash_state],
+            outputs=[session_hash_state, workspace_session_info],
+        )
+        run_redact_task = run_redact_prepare.then(
+            submit_redaction_task,
+            inputs=[
+                redact_file,
+                redact_instructions,
+                page_range,
+                ocr_method,
+                pii_method,
+                encourage_vlm_faces,
+                encourage_vlm_signatures,
+                chatbot,
+                client_state,
+                session_hash_state,
+                s3_output_folder_state,
+                save_outputs_to_s3_state,
+            ],
+            outputs=chat_outputs,
+        )
+        run_redact_task.then(
+            _passthrough_chat_outputs,
+            outputs=chat_outputs,
+            js=PI_AGENT_FINISH_NOTIFY_JS,
+        )
+        abort_btn.click(
+            abort_agent,
+            inputs=[client_state],
+            outputs=[send, abort_btn, start_redact_btn],
+            cancels=[run_chat_send, run_chat_msg, run_redact_task],
+            queue=False,
+        )
+        clear.click(
+            new_chat,
+            inputs=[chatbot, client_state, session_hash_state],
+            outputs=chat_outputs,
+        )
+        if not IS_HF_SPACE:
+            backend_provider.change(
+                _backend_model_choices_update,
+                inputs=[backend_provider],
+                outputs=[backend_model],
+            )
+        apply_backend_btn.click(
+            apply_backend,
+            inputs=[
+                backend_provider,
+                backend_model,
+                gemini_api_key,
+                hf_token,
+                aws_region,
+                aws_access_key_id,
+                aws_secret_access_key,
+                aws_session_token,
+                client_state,
+                session_hash_state,
+            ],
+            outputs=[
+                client_state,
+                session_info,
+                gemini_api_key,
+                hf_token,
+                aws_secret_access_key,
+                aws_session_token,
+            ],
+        )
+        refresh_outputs_btn.click(
+            fn=refresh_workspace_output_files_stub,
+            inputs=None,
+            outputs=workspace_output_explorer,
+        ).success(
+            fn=refresh_workspace_panel,
+            inputs=[session_hash_state],
+            outputs=[workspace_output_explorer, workspace_output_download],
+        ).success(
+            fn=_export_workspace_outputs,
+            inputs=[
+                session_hash_state,
+                s3_output_folder_state,
+                save_outputs_to_s3_state,
+            ],
+            outputs=None,
+        )
+        workspace_output_explorer.input(
+            fn=workspace_files_download_fn,
+            inputs=[workspace_output_explorer, session_hash_state],
+            outputs=workspace_output_download,
+        )
+        demo.load(
+            fn=_init_session_ui,
+            inputs=None,
+            outputs=[
+                session_hash_state,
+                workspace_output_explorer,
+                workspace_session_info,
+                workspace_output_download,
+                s3_output_folder_state,
+            ],
+        )
+    return demo
def launch_pi_ui() -> FastAPI | None:
    """
    Build the Pi agent UI and hand it to ``mount_or_launch``.

    The Blocks app returned by ``build_ui`` has its queue enabled with a
    default concurrency limit of 1. A FastAPI app is created and passed
    along only when ``RUN_FASTAPI`` is truthy; otherwise ``None`` is passed.

    Returns:
        Whatever ``mount_or_launch`` returns.
    """
    ui = build_ui()
    ui.queue(default_concurrency_limit=1)
    host_app = create_fastapi_app() if RUN_FASTAPI else None
    served_paths = gradio_allowed_paths()
    return mount_or_launch(
        ui,
        fastapi_app=host_app,
        allowed_paths=served_paths,
        css=THINKING_PANEL_CSS,
        head_extra=PI_AGENT_FINISH_HEAD_HTML,
        server_name=PI_UI_HOST,
        server_port=PI_UI_PORT,
    )
# Module-level ASGI application. It is only built when serving through
# FastAPI; uvicorn below imports it via the "gradio_app:app" import string.
app = launch_pi_ui() if RUN_FASTAPI else None

if __name__ == "__main__":
    if not RUN_FASTAPI:
        # Direct mode: mount_or_launch is responsible for starting the server.
        launch_pi_ui()
    else:
        import uvicorn

        uvicorn.run(
            "gradio_app:app",
            host=PI_UI_HOST,
            port=PI_UI_PORT,
            factory=False,
        )

agent-redact/pi/output_files.py ADDED Viewed

	@@ -0,0 +1,316 @@

+"""Browse and download files from the Pi agent shared workspace."""
+from __future__ import annotations
+import os
+import re
+import shutil
+from pathlib import Path
+from typing import Any
+import gradio as gr
+from bootstrap_pi_config import pi_repo_root_path
+from pi_examples import gradio_example_allowed_paths
+from session_logs import gradio_session_log_allowed_paths
+from session_workspace import (
+    sanitize_session_id,
+    session_workspace_dir,
+    workspace_base_dir,
+)
+REFRESH_STUB_DIR = Path(os.environ.get("PI_FILEEXPLORER_STUB_DIR", "/tmp"))
+# Folder names under ``.../review/`` where Pass 1 deliverables are saved (see partnership prompt).
+_DEFAULT_FINAL_OUTPUT_FOLDER_NAMES = ("output_review_final", "output_final")
+_DEFAULT_FINAL_DOWNLOAD_FOLDER = "output_final_download"
+_DEFAULT_GRADIO_PREFIX_MIN_LEN = 16
def final_output_folder_names() -> frozenset[str]:
    """
    Return the folder names that mark final deliverables.

    ``PI_FINAL_OUTPUT_FOLDER_NAMES`` is read as a comma-separated list; each
    entry is trimmed and blank entries are dropped. When the variable is unset
    or yields no names, the built-in defaults are returned.
    """
    configured = os.environ.get("PI_FINAL_OUTPUT_FOLDER_NAMES", "").strip()
    trimmed = (piece.strip() for piece in configured.split(","))
    parsed = frozenset(token for token in trimmed if token)
    if parsed:
        return parsed
    return frozenset(_DEFAULT_FINAL_OUTPUT_FOLDER_NAMES)
def _is_under_final_output_dir(relative_path: Path) -> bool:
    """
    Tell whether a workspace-relative path sits inside a final-output folder.

    True when some ``review`` path segment is immediately followed by one of
    the names from ``final_output_folder_names()``.
    """
    segments = relative_path.parts
    accepted = final_output_folder_names()
    # Walk adjacent (parent, child) segment pairs.
    return any(
        parent == "review" and child in accepted
        for parent, child in zip(segments, segments[1:])
    )
def final_download_folder_name() -> str:
    """
    Return the staging folder name used for downloads.

    Reads ``PI_FINAL_DOWNLOAD_FOLDER``; an unset, empty, or whitespace-only
    value falls back to the built-in default.
    """
    configured = os.environ.get(
        "PI_FINAL_DOWNLOAD_FOLDER", _DEFAULT_FINAL_DOWNLOAD_FOLDER
    )
    if configured:
        cleaned = configured.strip()
        if cleaned:
            return cleaned
    return _DEFAULT_FINAL_DOWNLOAD_FOLDER
def final_download_dir(session_hash: str | None = None) -> Path:
    """
    Return the staging folder used for ``gr.File`` downloads.

    With a non-blank ``session_hash`` the result is
    ``<workspace base>/<sanitized session id>/<download folder>``; without
    one it is ``<workspace base>/<download folder>``. The workspace base is
    resolved first; the directory itself is not created here.
    """
    workspace = workspace_base_dir().resolve()
    staging_name = final_download_folder_name()
    has_session = bool(session_hash) and bool(str(session_hash).strip())
    if has_session:
        return workspace / sanitize_session_id(str(session_hash)) / staging_name
    return workspace / staging_name
+def _remove_path(path: Path) -> None:
+    """Best-effort delete (handles read-only / OneDrive locks on Windows)."""
+    try:
+        if path.is_dir() and not path.is_symlink():
+            shutil.rmtree(path, ignore_errors=True)
+        else:
+            path.unlink(missing_ok=True)
+    except OSError:
+        if not path.exists():
+            return
+        try:
+            os.chmod(path, 0o666)
+            if path.is_dir() and not path.is_symlink():
+                shutil.rmtree(path, ignore_errors=True)
+            else:
+                path.unlink(missing_ok=True)
+        except OSError:
+            pass
+def _reset_download_dir(download_dir: Path) -> None:
+    """Clear staged downloads without removing the directory inode (safer on Windows)."""
+    download_dir.mkdir(parents=True, exist_ok=True)
+    for child in download_dir.iterdir():
+        _remove_path(child)
def _gradio_prefix_min_len() -> int:
    """
    Return the minimum prefix length treated as a Gradio cache id.

    Reads ``PI_GRADIO_FILENAME_PREFIX_MIN_LEN``. Values below 1 are raised to
    1; a value that is not an integer falls back to the built-in default.
    """
    fallback = _DEFAULT_GRADIO_PREFIX_MIN_LEN
    configured = os.environ.get("PI_GRADIO_FILENAME_PREFIX_MIN_LEN", str(fallback))
    try:
        parsed = int(configured)
    except ValueError:
        return fallback
    return max(1, parsed)
def strip_gradio_cache_prefix(filename: str) -> str:
    """
    Drop a leading ``{alphanumeric}_`` cache prefix from ``filename``.

    The prefix must be a run of ASCII letters/digits at least
    ``_gradio_prefix_min_len()`` characters long, followed by an underscore
    and a non-empty remainder. Names that do not match are returned
    unchanged.
    """
    min_len = _gradio_prefix_min_len()
    found = re.match(rf"^[A-Za-z0-9]{{{min_len},}}_(.+)$", filename)
    return found.group(1) if found else filename
+def _file_created_timestamp(path: Path) -> float:
+    stat = path.stat()
+    birth = getattr(stat, "st_birthtime", None)
+    if birth is not None and birth > 0:
+        return float(birth)
+    return float(stat.st_mtime)
def _collect_raw_final_output_files(
    session_hash: str | None = None,
) -> list[Path] | None:
    """
    Find deliverable files inside final-output folders of the workspace.

    The session workspace is searched recursively. A path is kept when it is
    a regular file with a file-like name, is not inside the download staging
    folder, lies under a ``review/<final output folder>`` pair, and still
    resolves to a location under the workspace root.

    Returns:
        The matching paths, or ``None`` when the root is not a directory,
        an ``OSError`` occurs while scanning, or nothing matches.
    """
    root = workspace_root_from(session_hash)
    if not root.is_dir():
        return None
    staging_name = final_download_folder_name()

    def _is_deliverable(candidate: Path) -> bool:
        if not candidate.is_file() or not _is_file_path(candidate.name):
            return False
        try:
            rel = candidate.relative_to(root)
        except ValueError:
            return False
        # Never re-collect files that were already staged for download.
        if staging_name in rel.parts:
            return False
        if not _is_under_final_output_dir(rel):
            return False
        # Reject entries whose resolved target falls outside the workspace.
        # NOTE(review): this presumes ``root`` is already fully resolved;
        # confirm for the per-session workspace path.
        try:
            candidate.resolve(strict=False).relative_to(root)
        except ValueError:
            return False
        return True

    try:
        found = [entry for entry in root.rglob("*") if _is_deliverable(entry)]
    except OSError:
        return None
    return found or None
def build_final_download_files(
    session_hash: str | None = None,
) -> list[str] | None:
    """
    Copy final deliverables into the per-session download staging folder.

    The staging folder is emptied, Gradio cache prefixes are stripped from
    the file names, and files sharing a cleaned name are deduplicated so the
    one with the latest ``_file_created_timestamp`` wins.

    Returns:
        Resolved paths of the staged copies, ordered by cleaned file name,
        or ``None`` when there is nothing to stage.
    """
    sources = _collect_raw_final_output_files(session_hash)
    if not sources:
        return None
    staging = final_download_dir(session_hash)
    _reset_download_dir(staging)

    # Oldest first, so a later (newer) file overwrites an earlier entry that
    # maps to the same cleaned name.
    winners: dict[str, Path] = {
        strip_gradio_cache_prefix(src.name): src
        for src in sorted(sources, key=_file_created_timestamp)
    }

    staged: list[str] = []
    for clean_name in sorted(winners):
        target = staging / clean_name
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(winners[clean_name], target)
        staged.append(str(target.resolve()))
    return staged or None
def collect_final_output_files(
    session_hash: str | None = None,
) -> list[str] | None:
    """
    Return the staged deliverables for download and S3 export.

    Thin alias for ``build_final_download_files``; the result is passed
    through unchanged.
    """
    deliverables = build_final_download_files(session_hash)
    return deliverables
def workspace_root_from(session_hash: str | None = None) -> Path:
    """
    Return the workspace directory for a session.

    A blank or missing ``session_hash`` yields the resolved workspace base
    directory; otherwise the trimmed hash is passed to
    ``session_workspace_dir``.
    """
    session_text = str(session_hash).strip() if session_hash else ""
    if session_text:
        return session_workspace_dir(session_text)
    return workspace_base_dir().resolve()
+def _is_file_path(path: str) -> bool:
+    if not path or not path.strip():
+        return False
+    name = Path(path.rstrip("/\\")).name
+    if not name or "." not in name:
+        return False
+    ext = name.rsplit(".", 1)[-1]
+    return bool(ext and len(ext) <= 10 and ext.isalnum())
+def _is_safe_workspace_relative_path(path: str) -> bool:
+    """Reject absolute paths and traversal segments before joining under workspace."""
+    if not path or not path.strip():
+        return False
+    candidate = Path(path.strip())
+    if candidate.is_absolute() or candidate.anchor:
+        return False
+    return all(part not in ("", ".", "..") for part in candidate.parts)
+def _resolve_under_workspace(
+    path: str,
+    *,
+    workspace_root: Path | None = None,
+) -> Path | None:
+    if not path or not path.strip():
+        return None
+    root = (workspace_root or workspace_base_dir()).resolve()
+    stripped = path.strip()
+    try:
+        user_path = Path(stripped)
+        if user_path.is_absolute():
+            # Gradio FileExplorer may return absolute paths already under root_dir.
+            resolved = user_path.resolve(strict=False)
+        elif _is_safe_workspace_relative_path(stripped):
+            resolved = root.joinpath(*user_path.parts).resolve(strict=False)
+        else:
+            return None
+        resolved.relative_to(root)
+    except (ValueError, OSError):
+        return None
+    return resolved if resolved.is_file() else None
def load_workspace_output_files(session_hash: str = ""):
    """
    Return a ``gr.FileExplorer`` rooted at the session workspace.

    The workspace directory is created first if it does not exist yet. An
    empty ``session_hash`` selects the workspace base directory.
    """
    workspace = workspace_root_from(session_hash or None)
    workspace.mkdir(parents=True, exist_ok=True)
    explorer_root = str(workspace)
    return gr.FileExplorer(root_dir=explorer_root)
def refresh_workspace_output_files_stub():
    """
    Return a ``gr.FileExplorer`` rooted at the resolved stub directory.

    The refresh button wiring calls this first and ``refresh_workspace_panel``
    afterwards, so the explorer is pointed at ``REFRESH_STUB_DIR`` before it
    is pointed back at the workspace.
    """
    stub_root = REFRESH_STUB_DIR.resolve()
    return gr.FileExplorer(root_dir=str(stub_root))
def gradio_allowed_paths() -> list[str]:
    """
    List the paths Gradio may serve via ``gr.File``, without duplicates.

    Contains the resolved workspace base, Pi repo root, refresh stub
    directory and ``/tmp``, followed by the example and session-log paths
    reported by their modules (added as given, not re-resolved). Roots that
    fail to resolve are skipped.
    """
    allowed: list[str] = []

    def _remember(entry: str) -> None:
        if entry not in allowed:
            allowed.append(entry)

    roots = (
        workspace_base_dir(),
        str(pi_repo_root_path()),
        REFRESH_STUB_DIR,
        "/tmp",
    )
    for root in roots:
        try:
            resolved = str(Path(root).resolve())
        except OSError:
            continue
        _remember(resolved)
    for extra in gradio_example_allowed_paths():
        _remember(extra)
    for extra in gradio_session_log_allowed_paths():
        _remember(extra)
    return allowed
def refresh_workspace_panel(
    session_hash: str = "",
) -> tuple[Any, list[str] | None]:
    """
    Rebuild both parts of the workspace panel for a session.

    Returns:
        A pair of the file explorer rooted at the session workspace and the
        staged final deliverables (``None`` when there are none).
    """
    explorer = load_workspace_output_files(session_hash)
    deliverables = collect_final_output_files(session_hash)
    return explorer, deliverables
def workspace_files_download_fn(
    selected: list[str] | None,
    session_hash: str = "",
) -> list[str] | None:
    """
    Turn a file-explorer selection into downloadable file paths.

    Entries that do not look like files, or that do not resolve to an
    existing file under the session workspace, are dropped.

    Returns:
        The resolved file paths as strings, or ``None`` when the selection is
        empty or nothing survives filtering.
    """
    if not selected:
        return None
    root = workspace_root_from(session_hash or None)
    resolved_paths = (
        _resolve_under_workspace(item, workspace_root=root)
        for item in selected
        if _is_file_path(item)
    )
    downloads = [str(found) for found in resolved_paths if found is not None]
    return downloads or None

agent-redact/pi/pi_agent_config.py ADDED Viewed

	@@ -0,0 +1,715 @@

+"""Generate Pi agent models.json and settings.json at runtime."""
+from __future__ import annotations
+import json
+import os
+from pathlib import Path
+from typing import Any
def resolve_agent_dir() -> Path:
    """
    Return the Pi agent config directory.

    Uses ``PI_CODING_AGENT_DIR`` when set, otherwise ``~/.pi/agent``. The
    environment is read on every call.
    """
    fallback = Path.home() / ".pi" / "agent"
    configured = os.environ.get("PI_CODING_AGENT_DIR", fallback)
    return Path(configured)
# Import-time snapshot kept for backwards compatibility. Prefer calling
# resolve_agent_dir() when PI_CODING_AGENT_DIR may change after import.
AGENT_DIR = resolve_agent_dir()

# Bundled templates live in the "agent" folder next to this module.
TEMPLATE_DIR = Path(__file__).resolve().parent.joinpath("agent")
SETTINGS_TEMPLATE = TEMPLATE_DIR.joinpath("settings.json")

# Values of PI_DEPLOYMENT_PROFILE referenced in this module.
DEPLOYMENT_LOCAL = "local-docker"
DEPLOYMENT_HF_SPACE = "hf-space"

# Import-time snapshot of the deployment profile (trimmed, lower-cased).
DEPLOYMENT_PROFILE = (
    os.getenv("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL).strip().lower()
)
def pi_max_retries() -> int:
    """
    Max retries for Pi auto-retry and Gradio quota backoff.

    ``PI_QUOTA_RETRY_ATTEMPTS`` takes precedence over ``PI_MAX_RETRIES``; the
    first of the two that holds a valid integer wins. Blank or non-numeric
    values are skipped instead of raising, so a malformed variable cannot
    break config generation or hide a valid value in the other one.

    Returns:
        The configured retry count, clamped to be non-negative; 5 when
        neither variable provides a usable value.
    """
    for env_name in ("PI_QUOTA_RETRY_ATTEMPTS", "PI_MAX_RETRIES"):
        candidate = (os.environ.get(env_name) or "").strip()
        if not candidate:
            continue
        try:
            return max(0, int(candidate))
        except ValueError:
            continue
    return 5
def _apply_retry_settings(
    settings: dict[str, Any],
    *,
    provider: str,
) -> None:
    """
    Store the ``retry`` block in ``settings`` (in place).

    For the Gemini provider, or under the HF Space profile, the base and
    maximum delays come from ``PI_GEMINI_RETRY_BASE_DELAY_MS`` (default
    60000) and ``PI_GEMINI_RETRY_MAX_DELAY_MS`` (default 90000). Otherwise
    2000 ms and 60000 ms are used. The retry count comes from
    ``pi_max_retries()``.
    """
    retries = pi_max_retries()
    if provider == PROVIDER_GEMINI or is_hf_space_profile():
        base_ms = int(os.environ.get("PI_GEMINI_RETRY_BASE_DELAY_MS", "60000"))
        cap_ms = int(os.environ.get("PI_GEMINI_RETRY_MAX_DELAY_MS", "90000"))
    else:
        base_ms, cap_ms = 2000, 60000
    provider_block = {
        "timeoutMs": 3600000,
        "maxRetries": retries,
        "maxRetryDelayMs": cap_ms,
    }
    settings["retry"] = {
        "enabled": True,
        "maxRetries": retries,
        "baseDelayMs": base_ms,
        "provider": provider_block,
    }
# Provider ids as used for keys in the generated models.json / settings.json.
PROVIDER_LLAMA = "llama-cpp"
PROVIDER_GEMINI = "google-gemini"
PROVIDER_BEDROCK = "amazon-bedrock"

# Human-readable label for each provider id.
PROVIDER_LABELS: dict[str, str] = dict(
    (
        (PROVIDER_LLAMA, "Local (llama-cpp)"),
        (PROVIDER_GEMINI, "Gemini"),
        (PROVIDER_BEDROCK, "AWS Bedrock"),
    )
)
def is_hf_space_profile() -> bool:
    """
    Report whether ``PI_DEPLOYMENT_PROFILE`` selects the HF Space profile.

    The variable is read on every call, trimmed and lower-cased before the
    comparison; when unset the local profile is assumed.
    """
    current = os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL)
    return current.strip().lower() == DEPLOYMENT_HF_SPACE
# Local llama-cpp endpoint and model limits (each overridable via env,
# read once at import time).
LLAMA_BASE_URL = os.getenv("PI_LLAMA_BASE_URL", "http://llama-inference:8080/v1")
LLAMA_MODEL_ID = os.getenv("PI_LLAMA_MODEL_ID", "unsloth/Qwen3.6-27B-MTP-GGUF")
LLAMA_CONTEXT = int(os.getenv("PI_LLAMA_CONTEXT_WINDOW", "114688"))
LLAMA_MAX_TOKENS = int(os.getenv("PI_LLAMA_MAX_TOKENS", "32768"))

# Model tables. Each row is (model id, display name, context window, reasoning).
GEMINI_MODELS: tuple[tuple[str, str, int, bool], ...] = (
    ("gemini-flash-lite-latest", "Gemini Flash Lite", 1048576, False),
    ("gemini-flash-latest", "Gemini Flash", 1048576, True),
    ("gemini-pro-latest", "Gemini Pro", 1048576, True),
)
BEDROCK_MODELS: tuple[tuple[str, str, int, bool], ...] = (
    ("anthropic.claude-3-haiku-20240307-v1:0", "Claude 3 Haiku (Bedrock)", 200000, False),
    ("anthropic.claude-3-7-sonnet-20250219-v1:0", "Claude 3.7 Sonnet (Bedrock)", 200000, True),
    ("anthropic.claude-sonnet-4-5-20250929-v1:0", "Claude Sonnet 4.5 (Bedrock)", 200000, True),
    ("anthropic.claude-sonnet-4-6", "Claude Sonnet 4.6 (Bedrock)", 200000, True),
    ("amazon.nova-micro-v1:0", "Amazon Nova Micro (Bedrock)", 128000, False),
    ("amazon.nova-lite-v1:0", "Amazon Nova Lite (Bedrock)", 300000, False),
    ("amazon.nova-pro-v1:0", "Amazon Nova Pro (Bedrock)", 300000, False),
)
# Model ids offered per provider, in table order.
PROVIDER_MODELS: dict[str, list[str]] = {
    PROVIDER_LLAMA: [LLAMA_MODEL_ID],
    PROVIDER_GEMINI: [row[0] for row in GEMINI_MODELS],
    PROVIDER_BEDROCK: [row[0] for row in BEDROCK_MODELS],
}

# Built-in default model id for each provider.
DEFAULT_MODEL_BY_PROVIDER: dict[str, str] = {
    PROVIDER_LLAMA: LLAMA_MODEL_ID,
    # First Gemini table row: Gemini Flash Lite.
    PROVIDER_GEMINI: GEMINI_MODELS[0][0],
    PROVIDER_BEDROCK: "anthropic.claude-sonnet-4-6",
}
def get_default_provider() -> str:
    """
    Return the current default Pi provider id.

    The HF Space profile always yields Gemini. Otherwise
    ``PI_DEFAULT_PROVIDER`` is read on each call and used when it names a
    known provider; anything else falls back to the local llama provider.
    """
    if is_hf_space_profile():
        return PROVIDER_GEMINI
    requested = (os.environ.get("PI_DEFAULT_PROVIDER") or PROVIDER_LLAMA).strip()
    return requested if requested in PROVIDER_MODELS else PROVIDER_LLAMA
# Import-time snapshots of the default provider and model.
DEFAULT_PROVIDER = get_default_provider()
_env_default_model = (os.environ.get("PI_DEFAULT_MODEL") or "").strip()
# PI_DEFAULT_MODEL wins when non-blank; otherwise use the built-in default
# for the snapshot provider.
DEFAULT_MODEL = (
    _env_default_model
    if _env_default_model
    else DEFAULT_MODEL_BY_PROVIDER.get(DEFAULT_PROVIDER, LLAMA_MODEL_ID)
)
def resolved_default_model(provider: str, *, override: str | None = None) -> str:
    """
    Choose the default model id for ``provider``.

    Candidates are tried in order and must be listed for the provider:
    the explicit ``override``, then ``PI_DEFAULT_MODEL`` (falling back to the
    import-time ``DEFAULT_MODEL`` when the variable is empty). If neither
    applies, the built-in per-provider default is returned.
    """
    available = PROVIDER_MODELS.get(provider, [])
    if override and override in available:
        return override
    configured = (os.environ.get("PI_DEFAULT_MODEL") or DEFAULT_MODEL or "").strip()
    if configured and configured in available:
        return configured
    return DEFAULT_MODEL_BY_PROVIDER.get(provider, LLAMA_MODEL_ID)
+def _zero_cost() -> dict[str, int]:
+    return {"input": 0, "output": 0, "cacheRead": 0, "cacheWrite": 0}
def _model_entry(
    model_id: str,
    name: str,
    *,
    context_window: int,
    max_tokens: int,
    reasoning: bool,
    image_input: bool = True,
) -> dict[str, Any]:
    """
    Build one model record for a provider's ``models`` list.

    ``input`` is ``["text", "image"]`` unless ``image_input`` is false, in
    which case it is ``["text"]``. ``cost`` is always the zero-cost mapping.
    """
    modalities = ["text"]
    if image_input:
        modalities.append("image")
    entry: dict[str, Any] = {
        "id": model_id,
        "name": name,
        "reasoning": reasoning,
        "input": modalities,
        "contextWindow": context_window,
        "maxTokens": max_tokens,
        "cost": _zero_cost(),
    }
    return entry
def _llama_provider() -> dict[str, Any]:
    """
    Build the provider record for the local llama-cpp endpoint.

    It exposes a single model taken from the ``LLAMA_*`` module constants.
    """
    local_model = _model_entry(
        LLAMA_MODEL_ID,
        "Qwen 3.6 27B (local)",
        context_window=LLAMA_CONTEXT,
        max_tokens=LLAMA_MAX_TOKENS,
        reasoning=False,
    )
    compat = {
        "supportsDeveloperRole": False,
        "supportsReasoningEffort": False,
        "supportsUsageInStreaming": False,
        "maxTokensField": "max_tokens",
    }
    return {
        "baseUrl": LLAMA_BASE_URL,
        "api": "openai-completions",
        "apiKey": "llama-cpp",
        "compat": compat,
        "models": [local_model],
    }
def _gemini_provider() -> dict[str, Any]:
    """
    Build the Gemini provider record.

    One model entry is produced per ``GEMINI_MODELS`` row, each with
    ``maxTokens`` fixed at 8192.
    """
    catalogue = []
    for model_id, label, window, can_reason in GEMINI_MODELS:
        catalogue.append(
            _model_entry(
                model_id,
                label,
                context_window=window,
                max_tokens=8192,
                reasoning=can_reason,
            )
        )
    return {
        "baseUrl": "https://generativelanguage.googleapis.com/v1beta",
        "api": "google-generative-ai",
        # NOTE(review): presumably the name of the env var Pi reads the key
        # from, not a literal key; confirm against Pi's models.json format.
        "apiKey": "GEMINI_API_KEY",
        "models": catalogue,
    }
+def _bedrock_region() -> str:
+    return (
+        os.environ.get("AWS_REGION")
+        or os.environ.get("AWS_DEFAULT_REGION")
+        or "eu-west-2"
+    )
+_AWS_CREDENTIAL_ENV_KEYS: tuple[str, ...] = (
+    "AWS_ACCESS_KEY_ID",
+    "AWS_SECRET_ACCESS_KEY",
+    "AWS_SESSION_TOKEN",
+    "AWS_ACCESS_KEY",
+    "AWS_SECRET_KEY",
+)
+_AWS_PROFILE_ENV_KEYS: tuple[str, ...] = ("AWS_PROFILE", "PI_AWS_PROFILE")
+def _env_flag(name: str, *, default: bool = False) -> bool:
+    raw = os.environ.get(name)
+    if raw is None:
+        return default
+    return raw.strip().lower() in {"1", "true", "yes", "on"}
+def _strip_empty_env_vars(names: tuple[str, ...]) -> None:
+    for name in names:
+        if not (os.environ.get(name) or "").strip():
+            os.environ.pop(name, None)
+def _mirror_legacy_aws_key_env_vars() -> None:
+    if not (os.environ.get("AWS_ACCESS_KEY_ID") or "").strip():
+        legacy = (os.environ.get("AWS_ACCESS_KEY") or "").strip()
+        if legacy:
+            os.environ["AWS_ACCESS_KEY_ID"] = legacy
+    if not (os.environ.get("AWS_SECRET_ACCESS_KEY") or "").strip():
+        legacy = (os.environ.get("AWS_SECRET_KEY") or "").strip()
+        if legacy:
+            os.environ["AWS_SECRET_ACCESS_KEY"] = legacy
+def _has_explicit_aws_access_keys() -> bool:
+    access = (
+        os.environ.get("AWS_ACCESS_KEY_ID") or os.environ.get("AWS_ACCESS_KEY") or ""
+    ).strip()
+    secret = (
+        os.environ.get("AWS_SECRET_ACCESS_KEY")
+        or os.environ.get("AWS_SECRET_KEY")
+        or ""
+    ).strip()
+    return bool(access and secret)
+def _aws_config_path() -> Path | None:
+    explicit = (os.environ.get("AWS_CONFIG_FILE") or "").strip()
+    if explicit:
+        path = Path(explicit).expanduser()
+        return path if path.is_file() else None
+    home = Path(os.environ.get("HOME", "/home/node"))
+    path = home / ".aws" / "config"
+    return path if path.is_file() else None
+def _discover_aws_profile_from_config() -> str | None:
+    """Return an AWS profile name for Pi/Bedrock when only ~/.aws is mounted."""
+    explicit = (os.environ.get("PI_AWS_PROFILE") or "").strip()
+    if not explicit:
+        explicit = (os.environ.get("AWS_PROFILE") or "").strip()
+    if explicit:
+        return explicit
+    path = _aws_config_path()
+    if not path:
+        return None
+    current_profile: str | None = None
+    sso_profiles: list[str] = []
+    all_profiles: list[str] = []
+    for raw_line in path.read_text(encoding="utf-8").splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#") or line.startswith(";"):
+            continue
+        if line == "[default]":
+            current_profile = "default"
+            all_profiles.append("default")
+            continue
+        if line.startswith("[profile ") and line.endswith("]"):
+            current_profile = line[len("[profile ") : -1].strip()
+            if current_profile:
+                all_profiles.append(current_profile)
+            continue
+        if current_profile and line.startswith("sso_session"):
+            sso_profiles.append(current_profile)
+    if sso_profiles:
+        return sso_profiles[0]
+    if "default" in all_profiles:
+        return "default"
+    return all_profiles[0] if all_profiles else None
def _region_from_aws_config(profile: str | None = None) -> str | None:
    """
    Look up the region configured for a profile in the AWS config file.

    Args:
        profile: Profile to inspect; when empty, the profile chosen by
            ``_discover_aws_profile_from_config()`` is used.

    Returns:
        The first non-empty value of a line starting with ``region`` inside
        the profile's section, or ``None`` when the config file, the profile,
        or such a line is missing.
    """
    config = _aws_config_path()
    if not config:
        return None
    wanted = (profile or _discover_aws_profile_from_config() or "").strip()
    if not wanted:
        return None

    active: str | None = None
    for raw in config.read_text(encoding="utf-8").splitlines():
        text = raw.strip()
        # Skip blank lines and "#" / ";" comments.
        if not text or text.startswith(("#", ";")):
            continue
        if text == "[default]":
            active = "default"
        elif text.startswith("[profile ") and text.endswith("]"):
            active = text[len("[profile ") : -1].strip()
        elif active == wanted and text.startswith("region"):
            found = text.partition("=")[2].strip()
            if found:
                return found
    return None
def _ensure_aws_region_env() -> None:
    """
    Make sure ``AWS_REGION`` and ``AWS_DEFAULT_REGION`` hold the same region.

    Blank values are removed first. The region is then taken from the
    environment, else from the AWS config for the ``AWS_PROFILE`` profile,
    else from ``_bedrock_region()``, and written to both variables.
    """
    _strip_empty_env_vars(("AWS_REGION", "AWS_DEFAULT_REGION"))
    env = os.environ
    resolved = (env.get("AWS_REGION") or env.get("AWS_DEFAULT_REGION") or "").strip()
    if not resolved:
        active_profile = (env.get("AWS_PROFILE") or "").strip()
        resolved = (_region_from_aws_config(active_profile) or "").strip()
    if not resolved:
        resolved = _bedrock_region()
    for env_name in ("AWS_REGION", "AWS_DEFAULT_REGION"):
        env[env_name] = resolved
+def _pi_bedrock_auth_visible() -> bool:
+    """True when Pi's amazon-bedrock provider would detect configured auth."""
+    if (os.environ.get("AWS_PROFILE") or "").strip():
+        return True
+    if _has_explicit_aws_access_keys():
+        return True
+    if (os.environ.get("AWS_BEARER_TOKEN_BEDROCK") or "").strip():
+        return True
+    return False
def _ensure_pi_bedrock_auth_env() -> None:
    """
    Set ``AWS_PROFILE`` from the AWS config when no auth is visible in env.

    Pi checks env vars (not ``~/.aws`` alone) before Bedrock is usable, so
    when no profile, keys or bearer token are set, a discovered profile name
    is exported. Nothing changes when auth is already visible or no profile
    can be discovered.
    """
    if not _pi_bedrock_auth_visible():
        discovered = _discover_aws_profile_from_config()
        if discovered:
            os.environ["AWS_PROFILE"] = discovered
def configure_aws_credentials(
    *,
    session_access_key_id: str | None = None,
    session_secret_access_key: str | None = None,
    session_session_token: str | None = None,
) -> None:
    """
    Prepare the process environment for Pi's Bedrock provider.

    Steps, in order:

    1. Blank credential/profile variables are removed and legacy key names
       are mirrored onto the standard ones.
    2. When both a session access key id and secret are supplied, they are
       written to the env (the session token is set or cleared to match),
       the region is ensured, and the function returns.
    3. Otherwise, with ``RUN_AWS_FUNCTIONS`` enabled, static credential
       variables are dropped and ``AWS_PROFILE`` is discovered from the AWS
       config if needed, so the default credential chain is used.
    4. ``PI_AWS_PROFILE`` is copied to ``AWS_PROFILE`` when the latter is
       blank, and the region is ensured.
    """
    _strip_empty_env_vars(_AWS_CREDENTIAL_ENV_KEYS)
    _strip_empty_env_vars(_AWS_PROFILE_ENV_KEYS)
    _mirror_legacy_aws_key_env_vars()

    ui_access = (session_access_key_id or "").strip()
    ui_secret = (session_secret_access_key or "").strip()
    if ui_access and ui_secret:
        # Explicit UI session keys always win.
        os.environ["AWS_ACCESS_KEY_ID"] = ui_access
        os.environ["AWS_SECRET_ACCESS_KEY"] = ui_secret
        ui_token = (session_session_token or "").strip()
        if ui_token:
            os.environ["AWS_SESSION_TOKEN"] = ui_token
        else:
            os.environ.pop("AWS_SESSION_TOKEN", None)
        _ensure_aws_region_env()
        return

    if _env_flag("RUN_AWS_FUNCTIONS"):
        # NOTE(review): PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS does not alter
        # this path; static env keys are dropped whenever RUN_AWS_FUNCTIONS is
        # on. Confirm that is intended.
        for key in _AWS_CREDENTIAL_ENV_KEYS:
            os.environ.pop(key, None)
        _ensure_pi_bedrock_auth_env()

    # Propagate PI_AWS_PROFILE when only that alias is set (e.g. pi_agent.env).
    alias_profile = (os.environ.get("PI_AWS_PROFILE") or "").strip()
    if alias_profile and not (os.environ.get("AWS_PROFILE") or "").strip():
        os.environ["AWS_PROFILE"] = alias_profile
    _ensure_aws_region_env()
def _aws_credential_status() -> str:
    """
    Describe which AWS credential source is currently visible.

    Checked in order: explicit access keys, ``AWS_PROFILE``, the Bedrock
    bearer token, a present AWS config file, the ``RUN_AWS_FUNCTIONS`` flag.
    Returns ``"missing"`` when none applies.
    """

    def _env_text(env_name: str) -> str:
        return (os.environ.get(env_name) or "").strip()

    if _has_explicit_aws_access_keys():
        return "access keys"
    active_profile = _env_text("AWS_PROFILE")
    if active_profile:
        return f"profile `{active_profile}`"
    if _env_text("AWS_BEARER_TOKEN_BEDROCK"):
        return "Bedrock bearer token"
    if _aws_config_path():
        return "SSO config mounted (profile not set)"
    if _env_flag("RUN_AWS_FUNCTIONS"):
        return "SSO/default chain (missing profile)"
    return "missing"
def _bedrock_provider() -> dict[str, Any]:
    """
    Build the Bedrock provider record.

    The runtime endpoint uses the region from ``_bedrock_region()``. One
    model entry is produced per ``BEDROCK_MODELS`` row, each with
    ``maxTokens`` fixed at 8192.
    """
    region = _bedrock_region()
    catalogue = []
    for model_id, label, window, can_reason in BEDROCK_MODELS:
        catalogue.append(
            _model_entry(
                model_id,
                label,
                context_window=window,
                max_tokens=8192,
                reasoning=can_reason,
            )
        )
    return {
        "baseUrl": f"https://bedrock-runtime.{region}.amazonaws.com",
        "api": "bedrock-converse-stream",
        "models": catalogue,
    }
def build_models_config() -> dict[str, Any]:
    """
    Build the ``models.json`` content.

    The HF Space profile exposes only the Gemini provider; every other
    profile exposes llama-cpp, Gemini and Bedrock.
    """
    if is_hf_space_profile():
        providers = {PROVIDER_GEMINI: _gemini_provider()}
    else:
        providers = {
            PROVIDER_LLAMA: _llama_provider(),
            PROVIDER_GEMINI: _gemini_provider(),
            PROVIDER_BEDROCK: _bedrock_provider(),
        }
    return {"providers": providers}
def _load_settings_template() -> dict[str, Any]:
    """
    Load the base settings.

    The bundled ``settings.json`` template is parsed when it exists;
    otherwise a built-in set of defaults is returned.
    """
    if not SETTINGS_TEMPLATE.is_file():
        return {
            "defaultThinkingLevel": "off",
            "hideThinkingBlock": True,
            "compaction": {
                "enabled": True,
                "reserveTokens": 32768,
                "keepRecentTokens": 20000,
            },
            "enableSkillCommands": True,
            "sessionDir": "sessions",
        }
    with SETTINGS_TEMPLATE.open(encoding="utf-8") as handle:
        return json.load(handle)
+def _apply_compaction_settings(settings: dict[str, Any]) -> None:
+    """
+    Merge Pi session auto-compaction from env into ``settings.json``.
+    ``PI_COMPACTION_ENABLED`` — when set, overrides the template ``compaction.enabled``
+    flag (``true`` / ``false``). When unset, the template default applies (enabled).
+    Optional tuning: ``PI_COMPACTION_RESERVE_TOKENS``, ``PI_COMPACTION_KEEP_RECENT_TOKENS``.
+    """
+    compaction = dict(
+        settings.get("compaction")
+        or {
+            "enabled": True,
+            "reserveTokens": 32768,
+            "keepRecentTokens": 20000,
+        }
+    )
+    if os.environ.get("PI_COMPACTION_ENABLED") is not None:
+        compaction["enabled"] = _env_flag("PI_COMPACTION_ENABLED")
+    reserve = (os.environ.get("PI_COMPACTION_RESERVE_TOKENS") or "").strip()
+    if reserve:
+        compaction["reserveTokens"] = int(reserve)
+    keep = (os.environ.get("PI_COMPACTION_KEEP_RECENT_TOKENS") or "").strip()
+    if keep:
+        compaction["keepRecentTokens"] = int(keep)
+    settings["compaction"] = compaction
def resolve_session_dir() -> str:
    """
    Return the Pi session directory setting.

    A non-blank ``PI_SESSION_DIR`` is returned trimmed. Otherwise the result
    is ``/tmp/pi-sessions`` under the HF Space profile and the relative name
    ``sessions`` elsewhere.
    """
    configured = os.environ.get("PI_SESSION_DIR", "").strip()
    if configured:
        return configured
    return "/tmp/pi-sessions" if is_hf_space_profile() else "sessions"
def ensure_session_dir(session_dir: str | None = None) -> Path:
    """
    Create the Pi session directory and return its resolved absolute path.

    Args:
        session_dir: Directory to use; defaults to ``resolve_session_dir()``.
            A relative value is taken relative to ``resolve_agent_dir()``.
    """
    location = Path((session_dir or resolve_session_dir()).strip())
    if not location.is_absolute():
        location = resolve_agent_dir() / location
    location = location.resolve()
    location.mkdir(parents=True, exist_ok=True)
    return location
def build_settings_config(
    *,
    default_provider: str | None = None,
    default_model: str | None = None,
) -> dict[str, Any]:
    """
    Build the ``settings.json`` content.

    Starting from the settings template, this fills in the default provider
    and model, compaction overrides, the absolute session directory (which is
    created), the retry block (HF Space profile or Gemini provider only), and
    the workspace skills directory.

    Args:
        default_provider: Provider id to use; unknown or missing values fall
            back to the profile's default provider.
        default_model: Preferred model id, used only when listed for the
            provider.
    """
    provider = default_provider or get_default_provider()
    if provider not in PROVIDER_MODELS:
        provider = PROVIDER_GEMINI if is_hf_space_profile() else PROVIDER_LLAMA
    model = resolved_default_model(provider, override=default_model)

    settings = _load_settings_template()
    settings.update({"defaultProvider": provider, "defaultModel": model})
    _apply_compaction_settings(settings)
    settings["sessionDir"] = ensure_session_dir(resolve_session_dir()).as_posix()
    if is_hf_space_profile() or provider == PROVIDER_GEMINI:
        _apply_retry_settings(settings, provider=provider)

    # Imported at call time rather than at module import.
    from pi_workspace_skills import ensure_workspace_skills, workspace_skills_dir

    ensure_workspace_skills()
    settings["skills"] = [workspace_skills_dir().as_posix()]
    return settings
+def write_runtime_config(
+    *,
+    agent_dir: Path | None = None,
+    default_provider: str | None = None,
+    default_model: str | None = None,
+) -> tuple[Path, Path]:
+    """Write models.json and settings.json; return their paths."""
+    target = Path(agent_dir or resolve_agent_dir())
+    target.mkdir(parents=True, exist_ok=True)
+    models_path = target / "models.json"
+    settings_path = target / "settings.json"
+    models_path.write_text(
+        json.dumps(build_models_config(), indent=2) + "\n",
+        encoding="utf-8",
+    )
+    settings_path.write_text(
+        json.dumps(
+            build_settings_config(
+                default_provider=default_provider,
+                default_model=default_model,
+            ),
+            indent=2,
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+    return models_path, settings_path
+def models_for_provider(provider: str) -> list[str]:
+    if is_hf_space_profile():
+        return list(PROVIDER_MODELS[PROVIDER_GEMINI])
+    return list(PROVIDER_MODELS.get(provider, PROVIDER_MODELS[PROVIDER_LLAMA]))
+def default_model_for_provider(provider: str) -> str:
+    return resolved_default_model(provider)
+def normalize_provider(provider: str) -> str:
+    label_map = {label.lower(): key for key, label in PROVIDER_LABELS.items()}
+    lowered = (provider or "").strip().lower()
+    if lowered in PROVIDER_MODELS:
+        return lowered
+    if lowered in label_map:
+        return label_map[lowered]
+    return PROVIDER_GEMINI if is_hf_space_profile() else PROVIDER_LLAMA
+def apply_session_credentials(
+    *,
+    gemini_api_key: str | None = None,
+    hf_token: str | None = None,
+    aws_region: str | None = None,
+    aws_access_key_id: str | None = None,
+    aws_secret_access_key: str | None = None,
+    aws_session_token: str | None = None,
+) -> None:
+    """Apply session-only credential overrides to os.environ."""
+    if gemini_api_key and gemini_api_key.strip():
+        os.environ["GEMINI_API_KEY"] = gemini_api_key.strip()
+    if hf_token and hf_token.strip():
+        token = hf_token.strip()
+        os.environ["HF_TOKEN"] = token
+        os.environ["DOC_REDACTION_HF_TOKEN"] = token
+    if aws_region and aws_region.strip():
+        os.environ["AWS_REGION"] = aws_region.strip()
+        os.environ["AWS_DEFAULT_REGION"] = aws_region.strip()
+    configure_aws_credentials(
+        session_access_key_id=aws_access_key_id,
+        session_secret_access_key=aws_secret_access_key,
+        session_session_token=aws_session_token,
+    )
+def mirror_hf_token_from_env() -> None:
+    """Mirror DOC_REDACTION_HF_TOKEN or Space secret HF_TOKEN for Pi subprocess."""
+    if os.environ.get("HF_TOKEN"):
+        return
+    doc_token = os.environ.get("DOC_REDACTION_HF_TOKEN", "").strip()
+    if doc_token:
+        os.environ["HF_TOKEN"] = doc_token
+def _hf_token_status() -> str:
+    if os.environ.get("HF_TOKEN"):
+        source = (
+            "UI session" if os.environ.get("_HF_TOKEN_FROM_UI") else "env/Space secret"
+        )
+        return f"set ({source})"
+    return "missing"
+def credential_status_markdown() -> str:
+    gemini = (
+        "set"
+        if os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
+        else "missing"
+    )
+    lines = [f"**Credentials:** Gemini `{gemini}`"]
+    if is_hf_space_profile():
+        lines.append(f"HF token (redaction backend) `{_hf_token_status()}`")
+    else:
+        region = _bedrock_region()
+        lines.append(f"AWS `{_aws_credential_status()}` · region `{region}`")
+    return " · ".join(lines)
+def provider_choices() -> list[str]:
+    if is_hf_space_profile():
+        return [PROVIDER_GEMINI]
+    return list(PROVIDER_LABELS.keys())
+def gemini_api_key_configured() -> bool:
+    return bool(os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY"))
+def provider_label(provider: str) -> str:
+    return PROVIDER_LABELS.get(provider, provider)
+if __name__ == "__main__":
+    configure_aws_credentials()
+    models_path, settings_path = write_runtime_config()
+    print(f"Wrote {models_path}")
+    print(f"Wrote {settings_path}")

agent-redact/pi/pi_examples.py ADDED Viewed

	@@ -0,0 +1,180 @@

+"""Pi agent Gradio examples aligned with the main app SHOW_EXAMPLES redaction demos."""
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from pi_agent_config import is_hf_space_profile
+from redaction_prompt import HF_DEFAULT_OCR
+def _show_examples_from_env() -> bool:
+    """True unless PI_GRADIO_SHOW_EXAMPLES or SHOW_PI_EXAMPLES is explicitly false."""
+    for key in ("PI_GRADIO_SHOW_EXAMPLES", "SHOW_PI_EXAMPLES"):
+        raw = os.environ.get(key)
+        if raw is None:
+            continue
+        lowered = raw.strip().lower()
+        if lowered in {"0", "false", "no"}:
+            return False
+        if lowered in {"1", "true", "yes"}:
+            return True
+    return True
+SHOW_PI_EXAMPLES = _show_examples_from_env()
+@dataclass(frozen=True)
+class PiRedactionExample:
+    """Immutable description of one bundled redaction demo shown in the Pi Gradio UI."""
+    # Short display name; returned in the labels list built by example_rows().
+    label: str
+    # File name looked up inside the example data directory.
+    file_name: str
+    # Free-text redaction instructions placed in the example row.
+    instructions: str
+    # OCR / text-extraction option label placed in the example row.
+    ocr_method: str
+    # PII detection option label placed in the example row.
+    pii_method: str = "Local"
+    # Flags copied into the example row; presumably they toggle VLM-based
+    # face / signature detection in the UI — confirm against the Gradio inputs.
+    encourage_vlm_faces: bool = False
+    encourage_vlm_signatures: bool = False
+    # Page selection placed in the example row; "all" is the default.
+    page_range: str = "all"
+def resolve_example_data_dir() -> Path | None:
+    """Locate bundled example PDFs (repo checkout, PyPI package, or Docker layout)."""
+    from bootstrap_pi_config import pi_repo_root_path
+    workdir = pi_repo_root_path()
+    repo_root = Path(__file__).resolve().parents[2]
+    candidates = [
+        workdir / "doc_redaction" / "example_data",
+        workdir / "example_data",
+        repo_root / "doc_redaction" / "example_data",
+        repo_root / "example_data",
+    ]
+    for candidate in candidates:
+        if candidate.is_dir():
+            return candidate.resolve()
+    return None
+def example_file_path(file_name: str) -> Path | None:
+    root = resolve_example_data_dir()
+    if root is None:
+        return None
+    path = (root / file_name).resolve()
+    try:
+        path.relative_to(root)
+    except ValueError:
+        return None
+    if not path.is_file():
+        return None
+    if _is_lfs_pointer(path):
+        return None
+    return path
+def _is_lfs_pointer(path: Path) -> bool:
+    try:
+        first_line = path.read_text(encoding="utf-8", errors="ignore").splitlines()[0]
+    except (OSError, IndexError):
+        return False
+    return first_line.startswith("version https://git-lfs.github.com/spec/v1")
+def _catalog() -> tuple[PiRedactionExample, ...]:
+    selectable_text_ocr = (
+        HF_DEFAULT_OCR if is_hf_space_profile() else "Local model - selectable text"
+    )
+    # local_ocr = (
+    #     HF_DEFAULT_OCR
+    #     if is_hf_space_profile()
+    #     else "Local OCR model - PDFs without selectable text"
+    # )
+    return (
+        PiRedactionExample(
+            label="Emails to a professor",
+            file_name="example_of_emails_sent_to_a_professor_before_applying.pdf",
+            ocr_method=selectable_text_ocr,
+            pii_method="Local",
+            instructions=(
+                "- Any redaction box related to Dr Kornbluth should be removed\n"
+                "- References to Dr Hyde, or Dr Hyde's lab should be redacted. Also any references to Lauren, or Lauren Lilley\n"
+                "- All mentions of Universities and their names should be redacted\n"
+            ),
+        ),
+        PiRedactionExample(
+            label="Graduate cover letter",
+            file_name="graduate-job-example-cover-letter.pdf",
+            ocr_method=selectable_text_ocr,
+            pii_method="Local",
+            instructions=(
+                "- Redact any names and titles, apart from Mr Wilson\n"
+                "- Redact any organisation names\n"
+                "- Redact any place names\n"
+            ),
+        ),
+    )
+def available_pi_examples() -> list[PiRedactionExample]:
+    if not SHOW_PI_EXAMPLES:
+        return []
+    available: list[PiRedactionExample] = []
+    for example in _catalog():
+        if example_file_path(example.file_name) is not None:
+            available.append(example)
+    return available
+def example_rows() -> tuple[list[list], list[str]]:
+    """Return (gr.Examples rows, labels) for available demos."""
+    rows: list[list] = []
+    labels: list[str] = []
+    for example in available_pi_examples():
+        path = example_file_path(example.file_name)
+        if path is None:
+            continue
+        rows.append(
+            [
+                str(path),
+                example.instructions,
+                example.page_range,
+                example.ocr_method,
+                example.pii_method,
+                example.encourage_vlm_faces,
+                example.encourage_vlm_signatures,
+            ]
+        )
+        labels.append(example.label)
+    return rows, labels
+def gradio_example_allowed_paths() -> list[str]:
+    root = resolve_example_data_dir()
+    if root is None:
+        return []
+    return [str(root)]
+def examples_status_markdown() -> str:
+    """Human-readable status for the UI when examples are missing or disabled."""
+    if not SHOW_PI_EXAMPLES:
+        return (
+            "_Examples are disabled. Set Space variable "
+            "`PI_GRADIO_SHOW_EXAMPLES=true` (or `SHOW_PI_EXAMPLES=true`) and restart._"
+        )
+    root = resolve_example_data_dir()
+    if root is None:
+        return (
+            "_Example PDFs not found — expected under "
+            "`doc_redaction/example_data/` in the Space image._"
+        )
+    available = available_pi_examples()
+    if not available:
+        return (
+            f"_Example PDFs not found under `{root}`. "
+            "Rebuild the Space after syncing example files from the monorepo._"
+        )
+    names = ", ".join(f"`{ex.file_name}`" for ex in available)
+    return f"_Examples loaded from `{root}`: {names}_"

agent-redact/pi/pi_rpc_client.py ADDED Viewed

	@@ -0,0 +1,649 @@

+"""Python client for Pi RPC mode (JSONL over stdin/stdout)."""
+from __future__ import annotations
+import json
+import os
+import shutil
+import subprocess
+import threading
+import uuid
+from collections.abc import Iterator
+from dataclasses import dataclass, field
+from typing import Any
+class PiRpcError(RuntimeError):
+    pass
+# Pi RPC is JSONL over pipes; always UTF-8 (Windows default locale is cp1252).
+_PI_SUBPROCESS_ENCODING = "utf-8"
+# Undecodable bytes are replaced rather than raising while reading the pipes.
+_PI_SUBPROCESS_ENCODING_ERRORS = "replace"
+# Guidance appended to the PiRpcError raised when the Pi CLI cannot be located.
+# The "  \n" endings are presumably Markdown hard line breaks for the UI.
+_PI_INSTALL_HINT = (
+    "Install the Pi coding agent CLI, then restart the Gradio app:  \n"
+    "`npm install -g @earendil-works/pi-coding-agent`  \n"
+    "On Windows, ensure Node.js/npm are on PATH (or set `PI_EXECUTABLE` to the "
+    "full path to `pi.cmd`, e.g. `%APPDATA%\\npm\\pi.cmd`).  \n"
+    "Docker users: run the Pi UI via `docker compose` (`pi-agent` service) instead "
+    "of `python gradio_app.py` on the host."
+)
+def resolve_pi_executable() -> str:
+    """Return a path to the ``pi`` RPC executable (raises ``PiRpcError`` if missing)."""
+    override = os.environ.get("PI_EXECUTABLE", "").strip()
+    if override:
+        if os.path.isfile(override) or shutil.which(override):
+            return override
+        raise PiRpcError(
+            f"PI_EXECUTABLE is set but not found: `{override}`  \n\n{_PI_INSTALL_HINT}"
+        )
+    for name in ("pi", "pi.cmd"):
+        found = shutil.which(name)
+        if found:
+            return found
+    raise PiRpcError(f"Pi CLI (`pi`) not found on PATH.  \n\n{_PI_INSTALL_HINT}")
+@dataclass
+class PiStreamEvent:
+    """Structured event from Pi RPC for UI layers."""
+    # Event category. Values produced in this module: "status", "turn_end",
+    # "text_delta", "text_snapshot", "thinking_delta", "thinking_snapshot",
+    # "tool_start", "tool_update", "tool_end", "error" and "done".
+    kind: str
+    # Human-readable payload: status line, text chunk/snapshot, or error message.
+    text: str = ""
+    # Tool identification, filled on tool_* events.
+    tool_name: str | None = None
+    tool_call_id: str | None = None
+    # Arguments of the tool call (tool_start only).
+    tool_args: dict[str, Any] | None = None
+    # Text extracted from the tool result (tool_update / tool_end).
+    tool_output: str | None = None
+    # True for error events and for tool_end events flagged as failed.
+    is_error: bool = False
+    # Extra event data; some status events carry the raw RPC event here.
+    meta: dict[str, Any] = field(default_factory=dict)
+def extract_tool_text(payload: dict[str, Any] | None) -> str:
+    if not payload:
+        return ""
+    content = payload.get("content")
+    if content is None and isinstance(payload.get("partialResult"), dict):
+        content = payload["partialResult"].get("content")
+    if content is None and isinstance(payload.get("result"), dict):
+        content = payload["result"].get("content")
+    if not isinstance(content, list):
+        return ""
+    parts: list[str] = []
+    for block in content:
+        if isinstance(block, dict) and block.get("type") == "text":
+            parts.append(str(block.get("text") or ""))
+    return "\n".join(parts).strip()
+def extract_assistant_display(message: dict[str, Any] | None) -> tuple[str, str]:
+    """Extract visible text and thinking from a partial assistant message."""
+    if not message or message.get("role") != "assistant":
+        return "", ""
+    content = message.get("content")
+    if isinstance(content, str):
+        return content, ""
+    if not isinstance(content, list):
+        return "", ""
+    texts: list[str] = []
+    thinkings: list[str] = []
+    for block in content:
+        if isinstance(block, str):
+            if block.strip():
+                texts.append(block)
+            continue
+        if not isinstance(block, dict):
+            continue
+        block_type = block.get("type")
+        if block_type in (None, "text", "output_text"):
+            text = block.get("text") or block.get("content") or ""
+            if text:
+                texts.append(str(text))
+        elif block_type in ("thinking", "reasoning", "thought"):
+            thought = (
+                block.get("thinking")
+                or block.get("text")
+                or block.get("reasoning")
+                or block.get("content")
+                or ""
+            )
+            if thought:
+                thinkings.append(str(thought))
+    return "".join(texts), "".join(thinkings)
+def assistant_chat_text(visible: str, thinking: str) -> str:
+    """Text to show in the main chat — visible answer, or thinking when Gemini sends only that."""
+    if visible.strip():
+        return visible
+    return thinking
+def _tool_lines_from_content(content: list[Any]) -> list[str]:
+    tool_lines: list[str] = []
+    for block in content:
+        if not isinstance(block, dict):
+            continue
+        block_type = block.get("type")
+        if block_type not in {"toolCall", "tool_use", "functionCall"}:
+            continue
+        name = str(block.get("name") or block.get("toolName") or "tool")
+        args = block.get("arguments") or block.get("input") or block.get("args")
+        if isinstance(args, str):
+            try:
+                args = json.loads(args)
+            except json.JSONDecodeError:
+                args = {"raw": args}
+        if not isinstance(args, dict):
+            args = {}
+        tool_lines.append(f"**{name}:** {format_tool_args(name, args)}")
+    return tool_lines
+def format_assistant_message_for_chat(message: dict[str, Any]) -> str:
+    """Render one assistant message for the chat UI (visible text or tool calls; no thinking)."""
+    visible, _thinking = extract_assistant_display(message)
+    if visible.strip():
+        return visible
+    content = message.get("content")
+    if not isinstance(content, list):
+        return ""
+    return "\n".join(_tool_lines_from_content(content))
+def chat_text_from_assistant_message(message: dict[str, Any] | None) -> str:
+    """Non-thinking chat text from a Pi/Gemini assistant message snapshot."""
+    if not message or message.get("role") != "assistant":
+        return ""
+    return format_assistant_message_for_chat(message)
+_RATE_LIMIT_MARKERS = (
+    "429",
+    "quota",
+    "rate limit",
+    "rate-limit",
+    "resource_exhausted",
+    "too many requests",
+)
+def is_rate_limit_error(text: str | None) -> bool:
+    """True when *text* looks like a provider quota or rate-limit failure."""
+    if not text:
+        return False
+    lowered = text.lower()
+    return any(marker in lowered for marker in _RATE_LIMIT_MARKERS)
+def last_assistant_turn_error(messages: list[dict[str, Any]]) -> str | None:
+    """Return the latest assistant error in the current user turn, if any."""
+    last_user = -1
+    for index, message in enumerate(messages):
+        if message.get("role") == "user":
+            last_user = index
+    turn_messages = messages[last_user + 1 :] if last_user >= 0 else messages
+    for message in reversed(turn_messages):
+        if message.get("role") != "assistant":
+            continue
+        error = message.get("errorMessage")
+        if error:
+            return str(error)
+        if message.get("stopReason") == "error":
+            visible, _ = extract_assistant_display(message)
+            if visible.strip():
+                return visible
+            return "assistant turn failed"
+    return None
+def assistant_text_since_last_user(messages: list[dict[str, Any]]) -> str:
+    """Combine assistant messages from the latest user turn."""
+    last_user = -1
+    for index, message in enumerate(messages):
+        if message.get("role") == "user":
+            last_user = index
+    turn_messages = messages[last_user + 1 :] if last_user >= 0 else messages
+    parts: list[str] = []
+    for message in turn_messages:
+        if message.get("role") != "assistant":
+            continue
+        part = format_assistant_message_for_chat(message)
+        if part.strip():
+            parts.append(part)
+    return "\n\n".join(parts)
+def partial_message_from_update(event: dict[str, Any]) -> dict[str, Any] | None:
+    delta = event.get("assistantMessageEvent") or {}
+    partial = delta.get("partial")
+    if isinstance(partial, dict):
+        return partial
+    message = event.get("message")
+    if isinstance(message, dict):
+        return message
+    return None
+def format_tool_args(tool_name: str | None, args: dict[str, Any] | None) -> str:
+    if not args:
+        return ""
+    name = (tool_name or "").lower()
+    if name == "bash" and args.get("command"):
+        cmd = str(args["command"]).replace("\n", " ↵ ")
+        return f"`{cmd[:240]}{'…' if len(cmd) > 240 else ''}`"
+    if name in {"read", "write", "edit"} and args.get("path"):
+        return f"`{args['path']}`"
+    compact = json.dumps(args, ensure_ascii=False)
+    if len(compact) > 280:
+        compact = compact[:277] + "…"
+    return compact
+class PiRpcClient:
+    """Drive a long-lived ``pi --mode rpc`` subprocess."""
+    def __init__(
+        self,
+        *,
+        cwd: str | None = None,
+        env: dict[str, str] | None = None,
+        pi_args: list[str] | None = None,
+    ) -> None:
+        self._cwd = cwd
+        self._env = env
+        self._pi_args = pi_args or []
+        self._proc: subprocess.Popen[str] | None = None
+        self._io_lock = threading.Lock()
+        self._abort_requested = False
+    @property
+    def running(self) -> bool:
+        return self._proc is not None and self._proc.poll() is None
+    def start(self) -> None:
+        if self.running:
+            return
+        command = [resolve_pi_executable(), "--mode", "rpc", *self._pi_args]
+        self._proc = subprocess.Popen(
+            command,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            encoding=_PI_SUBPROCESS_ENCODING,
+            errors=_PI_SUBPROCESS_ENCODING_ERRORS,
+            bufsize=1,
+            cwd=self._cwd,
+            env=self._env,
+        )
+    def close(self) -> None:
+        if not self._proc:
+            return
+        if self.running:
+            try:
+                self.abort()
+            except Exception:
+                pass
+            self._proc.terminate()
+            try:
+                self._proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                self._proc.kill()
+        self._proc = None
+    def _ensure_running(self) -> subprocess.Popen[str]:
+        if not self.running:
+            self.start()
+        assert self._proc is not None
+        return self._proc
+    def _read_line(self) -> dict[str, Any]:
+        proc = self._ensure_running()
+        assert proc.stdout is not None
+        with self._io_lock:
+            line = proc.stdout.readline()
+        if not line:
+            code = proc.poll()
+            err = ""
+            if proc.stderr is not None:
+                err = proc.stderr.read() or ""
+            raise PiRpcError(
+                f"Pi RPC process exited (code={code})."
+                + (f" stderr: {err[:500]}" if err else "")
+            )
+        line = line.rstrip("\r\n")
+        if not line:
+            return self._read_line()
+        return json.loads(line)
+    def _write_command(self, command: dict[str, Any]) -> None:
+        proc = self._ensure_running()
+        assert proc.stdin is not None
+        with self._io_lock:
+            proc.stdin.write(json.dumps(command) + "\n")
+            proc.stdin.flush()
+    def _send_command(
+        self,
+        command: dict[str, Any],
+        *,
+        wait_response: bool = True,
+    ) -> dict[str, Any] | None:
+        req_id = command.setdefault("id", str(uuid.uuid4()))
+        self._write_command(command)
+        if not wait_response:
+            return None
+        while True:
+            event = self._read_line()
+            if event.get("type") == "response" and event.get("id") == req_id:
+                if not event.get("success", False):
+                    error = (
+                        event.get("error") or event.get("message") or "command failed"
+                    )
+                    raise PiRpcError(str(error))
+                return event
+    def abort(self) -> None:
+        """Request abort without reading stdout (the active stream consumer drains events)."""
+        if not self.running:
+            return
+        self._abort_requested = True
+        try:
+            self._send_command({"type": "abort"}, wait_response=False)
+        except OSError:
+            pass
+    @property
+    def abort_requested(self) -> bool:
+        return self._abort_requested
+    def clear_abort(self) -> None:
+        self._abort_requested = False
+    def new_session(self) -> None:
+        self._send_command({"type": "new_session"})
+    def get_state(self) -> dict[str, Any]:
+        response = self._send_command({"type": "get_state"})
+        data = response.get("data") if response else {}
+        return data if isinstance(data, dict) else {}
+    def get_messages(self) -> list[dict[str, Any]]:
+        response = self._send_command({"type": "get_messages"})
+        data = response.get("data") if response else {}
+        messages = data.get("messages") if isinstance(data, dict) else []
+        return messages if isinstance(messages, list) else []
+    def get_session_stats(self) -> dict[str, Any]:
+        """Token usage and cost totals for the active session (Pi RPC ``get_session_stats``)."""
+        response = self._send_command({"type": "get_session_stats"})
+        data = response.get("data") if response else {}
+        return data if isinstance(data, dict) else {}
+    def set_model(self, provider: str, model_id: str) -> dict[str, Any]:
+        response = self._send_command(
+            {
+                "type": "set_model",
+                "provider": provider,
+                "modelId": model_id,
+            }
+        )
+        data = response.get("data") if response else {}
+        return data if isinstance(data, dict) else {}
+    def get_available_models(self) -> list[dict[str, Any]]:
+        response = self._send_command({"type": "get_available_models"})
+        data = response.get("data") if response else {}
+        models = data.get("models") if isinstance(data, dict) else []
+        return models if isinstance(models, list) else []
+    def restart(self) -> None:
+        self.close()
+        self.start()
+    def prompt_events(self, message: str) -> Iterator[PiStreamEvent]:
+        """Send a user message and yield structured events until ``agent_end``."""
+        self.clear_abort()
+        req_id = str(uuid.uuid4())
+        self._send_command(
+            {"id": req_id, "type": "prompt", "message": message},
+            wait_response=False,
+        )
+        while True:
+            event = self._read_line()
+            if event.get("type") == "response" and event.get("id") == req_id:
+                if not event.get("success", False):
+                    error = (
+                        event.get("error") or event.get("message") or "prompt rejected"
+                    )
+                    yield PiStreamEvent(kind="error", text=str(error), is_error=True)
+                    return
+                break
+        yield from self._iter_agent_events()
+    def _iter_agent_events(self) -> Iterator[PiStreamEvent]:
+        while True:
+            event = self._read_line()
+            event_type = event.get("type")
+            if event_type == "agent_start":
+                yield PiStreamEvent(kind="status", text="Agent started…")
+            elif event_type == "turn_start":
+                yield PiStreamEvent(kind="status", text="Turn started.")
+            elif event_type == "turn_end":
+                yield PiStreamEvent(kind="turn_end", text="Turn finished.")
+            elif event_type == "message_update":
+                yield from self._parse_message_update(event)
+            elif event_type == "tool_execution_start":
+                tool_name = event.get("toolName")
+                tool_args = (
+                    event.get("args") if isinstance(event.get("args"), dict) else {}
+                )
+                yield PiStreamEvent(
+                    kind="tool_start",
+                    tool_name=str(tool_name) if tool_name else "tool",
+                    tool_call_id=event.get("toolCallId"),
+                    tool_args=tool_args,
+                    text=format_tool_args(
+                        str(tool_name) if tool_name else None,
+                        tool_args,
+                    ),
+                )
+            elif event_type == "tool_execution_update":
+                output = extract_tool_text(event)
+                yield PiStreamEvent(
+                    kind="tool_update",
+                    tool_name=event.get("toolName"),
+                    tool_call_id=event.get("toolCallId"),
+                    tool_output=output,
+                )
+            elif event_type == "tool_execution_end":
+                result = (
+                    event.get("result") if isinstance(event.get("result"), dict) else {}
+                )
+                output = extract_tool_text(result)
+                yield PiStreamEvent(
+                    kind="tool_end",
+                    tool_name=event.get("toolName"),
+                    tool_call_id=event.get("toolCallId"),
+                    tool_output=output,
+                    is_error=bool(event.get("isError")),
+                )
+            elif event_type == "queue_update":
+                steering = event.get("steering") or []
+                follow_up = event.get("followUp") or []
+                if steering or follow_up:
+                    yield PiStreamEvent(
+                        kind="status",
+                        text="Queue updated.",
+                        meta={"steering": steering, "follow_up": follow_up},
+                    )
+            elif event_type == "compaction_start":
+                reason = event.get("reason") or "unknown"
+                yield PiStreamEvent(
+                    kind="status",
+                    text=f"Compaction started ({reason})…",
+                    meta={"reason": reason},
+                )
+            elif event_type == "compaction_end":
+                if event.get("aborted"):
+                    text = "Compaction aborted."
+                elif event.get("errorMessage"):
+                    text = f"Compaction failed: {event['errorMessage']}"
+                    yield PiStreamEvent(kind="error", text=text, is_error=True)
+                    continue
+                elif event.get("willRetry"):
+                    text = "Compaction complete — retrying prompt…"
+                else:
+                    tokens = (event.get("result") or {}).get("tokensBefore")
+                    text = (
+                        f"Compaction complete ({tokens:,} tokens before)."
+                        if isinstance(tokens, int)
+                        else "Compaction complete."
+                    )
+                yield PiStreamEvent(kind="status", text=text, meta=event)
+            elif event_type == "auto_retry_start":
+                attempt = event.get("attempt")
+                max_attempts = event.get("maxAttempts")
+                delay_ms = event.get("delayMs")
+                msg = event.get("errorMessage") or "transient error"
+                yield PiStreamEvent(
+                    kind="status",
+                    text=(
+                        f"Auto-retry {attempt}/{max_attempts} in {delay_ms}ms "
+                        f"({str(msg)[:120]})"
+                    ),
+                    meta=event,
+                )
+            elif event_type == "auto_retry_end":
+                if event.get("success"):
+                    yield PiStreamEvent(
+                        kind="status",
+                        text=f"Auto-retry succeeded on attempt {event.get('attempt')}.",
+                    )
+                else:
+                    yield PiStreamEvent(
+                        kind="error",
+                        text=f"Auto-retry failed: {event.get('finalError', 'unknown error')}",
+                        is_error=True,
+                    )
+            elif event_type == "extension_error":
+                yield PiStreamEvent(
+                    kind="error",
+                    text=str(event.get("error") or "extension error"),
+                    is_error=True,
+                )
+            elif event_type == "agent_end":
+                aborted = self._abort_requested
+                self.clear_abort()
+                yield PiStreamEvent(
+                    kind="done",
+                    text="Agent aborted." if aborted else "Agent finished.",
+                )
+                return
+    def _parse_message_update(self, event: dict[str, Any]) -> Iterator[PiStreamEvent]:
+        delta = event.get("assistantMessageEvent") or {}
+        delta_type = delta.get("type")
+        partial = partial_message_from_update(event)
+        if partial is not None:
+            visible, thinking = extract_assistant_display(partial)
+            if visible.strip():
+                yield PiStreamEvent(kind="text_snapshot", text=visible)
+            elif chat_text := chat_text_from_assistant_message(partial):
+                yield PiStreamEvent(kind="text_snapshot", text=chat_text)
+            if thinking.strip():
+                yield PiStreamEvent(kind="thinking_snapshot", text=thinking)
+        if delta_type == "text_delta":
+            chunk = delta.get("delta") or ""
+            if chunk:
+                yield PiStreamEvent(kind="text_delta", text=chunk)
+        elif delta_type == "thinking_delta":
+            chunk = delta.get("delta") or ""
+            if chunk:
+                yield PiStreamEvent(kind="thinking_delta", text=chunk)
+        elif delta_type == "toolcall_start":
+            tool_call = delta.get("toolCall") or {}
+            tool_name = tool_call.get("name") or delta.get("toolName") or "tool"
+            tool_args = tool_call.get("arguments")
+            if isinstance(tool_args, str):
+                try:
+                    tool_args = json.loads(tool_args)
+                except json.JSONDecodeError:
+                    tool_args = {"raw": tool_args}
+            if not isinstance(tool_args, dict):
+                tool_args = {}
+            detail = format_tool_args(str(tool_name), tool_args)
+            chat_line = f"**{tool_name}:** {detail}" if detail else f"**{tool_name}**"
+            yield PiStreamEvent(kind="text_snapshot", text=chat_line)
+        elif delta_type == "error":
+            yield PiStreamEvent(
+                kind="error",
+                text=str(
+                    delta.get("message") or delta.get("error") or "generation error"
+                ),
+                is_error=True,
+            )
+    def prompt_stream(
+        self, message: str, *, show_tool_status: bool = True
+    ) -> Iterator[str]:
+        """Backward-compatible text stream (assistant visible text + optional tool status)."""
+        for event in self.prompt_events(message):
+            if event.kind == "text_delta":
+                yield event.text
+            elif show_tool_status and event.kind == "tool_start":
+                yield f"\n\n_[Running {event.tool_name}…]_\n"
+            elif event.kind == "error":
+                yield f"\n\n**Error:** {event.text}\n"
+def default_client(session_hash: str | None = None) -> PiRpcClient:
+    from pi_agent_config import configure_aws_credentials
+    from pi_workspace_skills import ensure_workspace_skills, pi_rpc_args, pi_rpc_cwd
+    configure_aws_credentials()
+    ensure_workspace_skills()
+    env = os.environ.copy()
+    env.setdefault("HOME", os.path.expanduser("~"))
+    env.setdefault("PYTHONUTF8", "1")
+    env.setdefault("PYTHONIOENCODING", "utf-8")
+    from session_workspace import workspace_base_dir
+    env.setdefault("PI_WORKSPACE_DIR", str(workspace_base_dir()))
+    if not env.get("GEMINI_API_KEY") and env.get("GOOGLE_API_KEY"):
+        env["GEMINI_API_KEY"] = env["GOOGLE_API_KEY"]
+    if not env.get("HF_TOKEN") and env.get("DOC_REDACTION_HF_TOKEN"):
+        env["HF_TOKEN"] = env["DOC_REDACTION_HF_TOKEN"]
+    return PiRpcClient(
+        cwd=pi_rpc_cwd(session_hash),
+        env=env,
+        pi_args=pi_rpc_args(),
+    )

agent-redact/pi/pi_session_usage.py ADDED Viewed

	@@ -0,0 +1,185 @@

+"""Summarize Pi agent LLM token usage for usage-log CSV rows."""
+from __future__ import annotations
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+from pi_rpc_client import PiRpcClient, PiRpcError
+@dataclass(frozen=True)
+class TokenUsageTotals:
+    """Pi session usage (see Pi session-format ``Usage``)."""
+    input: int = 0
+    output: int = 0
+    cache_read: int = 0
+    cache_write: int = 0
+    @property
+    def llm_input_tokens(self) -> int:
+        """Input-side tokens for the main-app usage log (input + cache)."""
+        return self.input + self.cache_read + self.cache_write
+    @property
+    def llm_output_tokens(self) -> int:
+        return self.output
+def _int_field(raw: Any) -> int:
+    try:
+        return max(0, int(raw or 0))
+    except (TypeError, ValueError):
+        return 0
+def totals_from_usage_dict(usage: dict[str, Any] | None) -> TokenUsageTotals:
+    if not usage:
+        return TokenUsageTotals()
+    return TokenUsageTotals(
+        input=_int_field(usage.get("input")),
+        output=_int_field(usage.get("output")),
+        cache_read=_int_field(usage.get("cacheRead")),
+        cache_write=_int_field(usage.get("cacheWrite")),
+    )
+def totals_from_stats_payload(data: dict[str, Any] | None) -> TokenUsageTotals:
+    if not data:
+        return TokenUsageTotals()
+    tokens = data.get("tokens")
+    if isinstance(tokens, dict):
+        return totals_from_usage_dict(tokens)
+    return TokenUsageTotals()
+def subtract_usage(
+    after: TokenUsageTotals, before: TokenUsageTotals
+) -> TokenUsageTotals:
+    return TokenUsageTotals(
+        input=max(0, after.input - before.input),
+        output=max(0, after.output - before.output),
+        cache_read=max(0, after.cache_read - before.cache_read),
+        cache_write=max(0, after.cache_write - before.cache_write),
+    )
+def add_usage(left: TokenUsageTotals, right: TokenUsageTotals) -> TokenUsageTotals:
+    return TokenUsageTotals(
+        input=left.input + right.input,
+        output=left.output + right.output,
+        cache_read=left.cache_read + right.cache_read,
+        cache_write=left.cache_write + right.cache_write,
+    )
+def sum_usage_from_messages(
+    messages: list[dict[str, Any]],
+    *,
+    since_last_user: bool = False,
+) -> TokenUsageTotals:
+    """Sum ``usage`` on assistant messages (optional: only after the last user turn)."""
+    last_user = -1
+    if since_last_user:
+        for index, message in enumerate(messages):
+            if message.get("role") == "user":
+                last_user = index
+        messages = messages[last_user + 1 :] if last_user >= 0 else messages
+    total = TokenUsageTotals()
+    for message in messages:
+        if message.get("role") != "assistant":
+            continue
+        usage = message.get("usage")
+        if isinstance(usage, dict):
+            total = add_usage(total, totals_from_usage_dict(usage))
+    return total
+def sum_usage_from_jsonl(path: Path) -> TokenUsageTotals:
+    """Parse a Pi session JSONL file and sum assistant ``usage`` blocks."""
+    total = TokenUsageTotals()
+    try:
+        text = path.read_text(encoding="utf-8")
+    except OSError:
+        return total
+    for line in text.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        try:
+            entry = json.loads(stripped)
+        except json.JSONDecodeError:
+            continue
+        if entry.get("type") != "message":
+            continue
+        message = entry.get("message")
+        if not isinstance(message, dict) or message.get("role") != "assistant":
+            continue
+        usage = message.get("usage")
+        if isinstance(usage, dict):
+            total = add_usage(total, totals_from_usage_dict(usage))
+    return total
+def resolve_session_token_usage(client: PiRpcClient | None) -> TokenUsageTotals:
+    """
+    Best-effort session usage from Pi RPC ``get_session_stats``, live messages, or JSONL.
+    """
+    if client is None or not client.running:
+        return TokenUsageTotals()
+    try:
+        stats = client.get_session_stats()
+        totals = totals_from_stats_payload(stats)
+        if totals.input or totals.output or totals.cache_read or totals.cache_write:
+            return totals
+    except PiRpcError:
+        pass
+    try:
+        messages = client.get_messages()
+        totals = sum_usage_from_messages(messages)
+        if totals.input or totals.output or totals.cache_read or totals.cache_write:
+            return totals
+    except PiRpcError:
+        pass
+    from session_logs import pi_session_file_from_client
+    session_file = pi_session_file_from_client(client)
+    if session_file is not None:
+        return sum_usage_from_jsonl(session_file)
+    return TokenUsageTotals()
+def usage_for_completed_turn(
+    client: PiRpcClient | None,
+    baseline: TokenUsageTotals | None,
+) -> TokenUsageTotals:
+    """
+    Tokens consumed by the prompt that just finished.
+    Prefers delta from *baseline* (captured before ``prompt_events``). Falls back to
+    summing assistant ``usage`` since the last user message, then whole-session totals.
+    """
+    if client is None or not client.running:
+        return TokenUsageTotals()
+    current = resolve_session_token_usage(client)
+    if baseline is not None:
+        delta = subtract_usage(current, baseline)
+        if delta.input or delta.output or delta.cache_read or delta.cache_write:
+            return delta
+    try:
+        turn = sum_usage_from_messages(client.get_messages(), since_last_user=True)
+        if turn.input or turn.output or turn.cache_read or turn.cache_write:
+            return turn
+    except PiRpcError:
+        pass
+    return current

agent-redact/pi/pi_workspace_skills.py ADDED Viewed

	@@ -0,0 +1,182 @@

+"""Sync doc_redaction skills into the Pi workspace and constrain Pi RPC to that tree."""
+from __future__ import annotations
+import os
+import shutil
+import stat
+from pathlib import Path
+from bootstrap_pi_config import pi_repo_root_path
+def workspace_base_dir() -> Path:
+    from session_workspace import workspace_base_dir as _base
+    return _base()
+def workspace_pi_dir() -> Path:
+    return workspace_base_dir() / ".pi"
+def workspace_skills_dir() -> Path:
+    return workspace_pi_dir() / "skills"
+def repo_skills_dir() -> Path:
+    return pi_repo_root_path() / "skills"
+def _env_flag(name: str) -> bool:
+    return os.environ.get(name, "").strip().lower() in {"1", "true", "yes", "on"}
+def _should_resync(dest: Path, src: Path) -> bool:
+    if _env_flag("PI_SKILLS_RESYNC"):
+        return True
+    if not dest.is_dir():
+        return True
+    if not any(dest.iterdir()):
+        return True
+    try:
+        return src.stat().st_mtime > dest.stat().st_mtime
+    except OSError:
+        return True
+def _copy_tree_item(src: Path, dest: Path) -> None:
+    if src.is_dir():
+        if dest.exists():
+            for child in src.iterdir():
+                _copy_tree_item(child, dest / child.name)
+        else:
+            shutil.copytree(src, dest, copy_function=shutil.copy2)
+        return
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(src, dest)
+def _make_readonly(path: Path) -> None:
+    if _env_flag("PI_SKILLS_WRITABLE"):
+        return
+    try:
+        if path.is_dir():
+            for root, dirs, files in os.walk(path):
+                root_path = Path(root)
+                for name in files:
+                    file_path = root_path / name
+                    mode = file_path.stat().st_mode
+                    file_path.chmod(
+                        mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH
+                    )
+                for name in dirs:
+                    dir_path = root_path / name
+                    mode = dir_path.stat().st_mode
+                    dir_path.chmod(mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
+            mode = path.stat().st_mode
+            path.chmod(mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
+        else:
+            mode = path.stat().st_mode
+            path.chmod(mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
+    except OSError:
+        pass
+def write_workspace_pi_settings() -> Path:
+    """
+    Project Pi settings under ``{workspace}/.pi/settings.json``.
+    Paths in that file resolve relative to ``{workspace}/.pi/`` per Pi docs.
+    """
+    pi_dir = workspace_pi_dir()
+    pi_dir.mkdir(parents=True, exist_ok=True)
+    settings_path = pi_dir / "settings.json"
+    payload = {
+        "skills": ["skills"],
+        "extensions": [],
+        "packages": [],
+        "enableSkillCommands": True,
+    }
+    import json
+    settings_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
+    return settings_path
+def sync_repo_skills_to_workspace(*, force: bool = False) -> Path:
+    """
+    Copy ``{repo}/skills/`` → ``{workspace}/.pi/skills/`` (read-only for the agent).
+    Re-sync when the repo tree is newer or ``PI_SKILLS_RESYNC=true``.
+    """
+    src = repo_skills_dir()
+    dest = workspace_skills_dir()
+    workspace_pi_dir().mkdir(parents=True, exist_ok=True)
+    if not src.is_dir():
+        dest.mkdir(parents=True, exist_ok=True)
+        write_workspace_pi_settings()
+        return dest
+    if force or _should_resync(dest, src):
+        if dest.exists():
+            shutil.rmtree(dest, ignore_errors=True)
+        dest.mkdir(parents=True, exist_ok=True)
+        for item in sorted(src.iterdir()):
+            _copy_tree_item(item, dest / item.name)
+    _make_readonly(dest)
+    write_workspace_pi_settings()
+    os.environ["PI_WORKSPACE_SKILLS_DIR"] = str(dest.resolve())
+    return dest.resolve()
+def ensure_workspace_skills(*, force: bool = False) -> Path:
+    """Idempotent sync used at app startup and before Pi RPC starts."""
+    return sync_repo_skills_to_workspace(force=force)
+def partnership_template_in_workspace() -> Path | None:
+    path = workspace_skills_dir() / "Example prompt partnership.txt"
+    return path if path.is_file() else None
+def pi_rpc_cwd(session_hash: str | None = None) -> str:
+    """Subprocess cwd for ``pi --mode rpc`` (session subfolder when enabled)."""
+    from session_workspace import session_workspace_dir, session_workspace_enabled
+    base = workspace_base_dir()
+    if session_hash and session_hash.strip() and session_workspace_enabled():
+        return str(session_workspace_dir(session_hash))
+    return str(base)
+def pi_rpc_args() -> list[str]:
+    """Load only workspace skills; do not discover repo ``skills/`` via ancestors."""
+    skills_dir = ensure_workspace_skills()
+    return ["--no-skills", "--skill", str(skills_dir)]
+def workspace_boundary_prefix(session_hash: str | None = None) -> str:
+    """Extra prompt text: workspace root, skills path, and path rules."""
+    base = workspace_base_dir().as_posix().rstrip("/")
+    skills = workspace_skills_dir().as_posix()
+    from session_workspace import session_workspace_dir, session_workspace_enabled
+    if session_hash and session_hash.strip() and session_workspace_enabled():
+        root = session_workspace_dir(session_hash).as_posix().rstrip("/")
+        scope = f"your session folder `{root}/`"
+    else:
+        root = base
+        scope = f"the workspace `{base}/`"
+    return (
+        f"**Workspace boundary (mandatory):** work only under `{base}/`. "
+        f"Your active directory is {scope}. "
+        f"Do not read, write, or run shell commands targeting paths outside `{base}/` "
+        f"(including the git checkout and `agent-redact/`). "
+        f"**Skills (read-only):** doc_redaction skills are synced to `{skills}/`. "
+        f"Use `/skill:doc-redaction-app`, `/skill:doc-redact-page-review`, etc. "
+        f"Do not edit files under `{skills}/`.\n\n"
+    )

agent-redact/pi/redaction_prompt.py ADDED Viewed

	@@ -0,0 +1,556 @@

+"""Build Pi redaction task prompts from the partnership example template."""
+from __future__ import annotations
+import os
+import re
+import shutil
+from dataclasses import dataclass
+from pathlib import Path
+from pi_agent_config import is_hf_space_profile
+from session_workspace import workspace_base_dir
+def upload_root() -> Path:
+    """Gradio upload directory (created by ``bootstrap_pi_config.ensure_pi_upload_root``)."""
+    raw = (os.environ.get("PI_UPLOAD_ROOT") or "").strip()
+    if not raw:
+        from bootstrap_pi_config import ensure_pi_upload_root
+        raw = ensure_pi_upload_root(pi_repo_root())
+    path = Path(raw)
+    path.mkdir(parents=True, exist_ok=True)
+    return path.resolve()
+_SAFE_UPLOAD_FILENAME_MAX_BYTES = 255
+# Path separators, nulls, and characters unsafe on common filesystems — not general punctuation.
+_UNSAFE_UPLOAD_FILENAME_CHARS_RE = re.compile(r'[\x00-\x1f<>:"|?*\\/]')
+def _truncate_upload_filename(
+    name: str, *, max_bytes: int = _SAFE_UPLOAD_FILENAME_MAX_BYTES
+) -> str:
+    encoded = name.encode("utf-8")
+    if len(encoded) <= max_bytes:
+        return name
+    stem, suffix = os.path.splitext(name)
+    suffix_bytes = suffix.encode("utf-8")
+    max_stem_bytes = max(1, max_bytes - len(suffix_bytes))
+    while stem and len(stem.encode("utf-8")) > max_stem_bytes:
+        stem = stem[:-1]
+    if not stem:
+        stem = "file"
+    return stem + suffix
+def _split_upload_basename(name: str) -> tuple[str, str]:
+    """Split an upload basename into stem and extension (handles ``.pdf`` on Windows)."""
+    if re.fullmatch(r"\.[^./\\]+", name):
+        return "", name
+    path = Path(name)
+    return path.stem, path.suffix
+def _workspace_filename_from_upload(name: str) -> tuple[str, str, bool]:
+    """
+    Derive a workspace-safe basename, changing the name only when required for security.
+    Returns ``(original_basename, workspace_basename, renamed)``.
+    """
+    original = Path(name).name.strip()
+    if not original or original in {".", ".."}:
+        raise ValueError("Uploaded file has an invalid name.")
+    if "\x00" in original or "/" in original or "\\" in original:
+        raise ValueError("Uploaded file has an invalid name.")
+    stem, suffix = _split_upload_basename(original)
+    safe_stem = _UNSAFE_UPLOAD_FILENAME_CHARS_RE.sub("_", stem)
+    safe_suffix = _UNSAFE_UPLOAD_FILENAME_CHARS_RE.sub("_", suffix)
+    safe_stem = safe_stem.strip(". ")
+    if not safe_stem:
+        safe_stem = "file"
+    safe_name = _truncate_upload_filename(safe_stem + safe_suffix)
+    return original, safe_name, safe_name != original
+_PARTNERSHIP_TEMPLATE = Path("skills") / "Example prompt partnership.txt"
+def _workspace_root() -> Path:
+    return workspace_base_dir()
+def pi_repo_root() -> Path:
+    """Monorepo checkout root (skills/, config/). Set via :func:`bootstrap_pi_config.ensure_pi_workdir`."""
+    from bootstrap_pi_config import pi_repo_root_path
+    return pi_repo_root_path()
+def partnership_template_path() -> Path:
+    from pi_workspace_skills import partnership_template_in_workspace
+    synced = partnership_template_in_workspace()
+    if synced is not None:
+        return synced
+    return pi_repo_root() / _PARTNERSHIP_TEMPLATE
+# Defaults used on the Hugging Face Space profile (see ``_env_default`` and
+# ``RedactionTaskSettings.hf_space_defaults``).
+HF_DEFAULT_OCR = "Local model - selectable text"
+HF_DEFAULT_PII = "Local"
+HF_DEFAULT_GRADIO_URL = "https://seanpedrickcase-document-redaction.hf.space"
+# Used only when PI_DEFAULT_OCR_METHOD / PI_DEFAULT_PII_METHOD are unset (local-docker profile).
+_FALLBACK_LOCAL_OCR = "hybrid-paddle-inference-server"
+_FALLBACK_LOCAL_PII = "Local"
+def _env_default(key: str, *, hf_default: str, local_fallback: str) -> str:
+    """Resolve Pi redaction defaults from env (e.g. config/pi_agent.env) with profile fallbacks."""
+    explicit = (os.environ.get(key) or "").strip()
+    if explicit:
+        return explicit
+    if is_hf_space_profile():
+        return hf_default
+    return local_fallback
+# Resolved once at import time: env override first, then the profile-specific default.
+DEFAULT_OCR_METHOD = _env_default(
+    "PI_DEFAULT_OCR_METHOD",
+    hf_default=HF_DEFAULT_OCR,
+    local_fallback=_FALLBACK_LOCAL_OCR,
+)
+DEFAULT_PII_METHOD = _env_default(
+    "PI_DEFAULT_PII_METHOD",
+    hf_default=HF_DEFAULT_PII,
+    local_fallback=_FALLBACK_LOCAL_PII,
+)
+# OCR labels accepted by ``RedactionTaskSettings.from_ui``; any other value falls back
+# to ``DEFAULT_OCR_METHOD`` there.
+OCR_METHOD_CHOICES: tuple[str, ...] = (
+    "hybrid-paddle-inference-server",
+    "hybrid-paddle-vlm",
+    "Local model - selectable text",
+    "Local OCR",
+    "AWS Textract service - all PDF types",
+    "tesseract",
+    "paddle",
+    "hybrid-paddle",
+    "vlm",
+    "inference-server",
+)
+# PII labels accepted by ``RedactionTaskSettings.from_ui``; any other value falls back
+# to ``DEFAULT_PII_METHOD`` there.
+PII_METHOD_CHOICES: tuple[str, ...] = (
+    "Local",
+    "AWS Comprehend",
+    "LLM (AWS Bedrock)",
+    "Local inference server",
+    "Local transformers LLM",
+    "Only extract text (no redaction)",
+)
+_DEFAULT_MAX_PAGES = 3000
+def max_pages_limit() -> int:
+    """
+    Maximum PDF pages allowed for a Pi redaction task.
+    Resolution order: ``PI_MAX_PAGES`` → ``MAX_PAGES`` → ``MAX_DOC_PAGES`` → 3000.
+    """
+    for key in ("PI_MAX_PAGES", "MAX_PAGES", "MAX_DOC_PAGES"):
+        raw = (os.environ.get(key) or "").strip()
+        if raw:
+            value = int(raw)
+            if value < 1:
+                raise ValueError(f"{key} must be a positive integer.")
+            return value
+    return _DEFAULT_MAX_PAGES
+def pages_to_process_count(page_range: str, total_pages: int) -> int:
+    """Return how many pages ``page_range`` selects from a ``total_pages`` PDF."""
+    if total_pages < 1:
+        raise ValueError("PDF has no pages.")
+    text = (page_range or "all").strip().lower()
+    if not text or text == "all":
+        return total_pages
+    if "-" in text:
+        start_text, end_text = text.split("-", 1)
+        try:
+            start = int(start_text.strip())
+            end = int(end_text.strip())
+        except ValueError as exc:
+            raise ValueError(f"Invalid page range: {page_range!r}") from exc
+        if start < 1 or end < start:
+            raise ValueError(f"Invalid page range: {page_range!r}")
+        if end > total_pages:
+            raise ValueError(
+                f"Page range {page_range!r} exceeds document length "
+                f"({total_pages} pages)."
+            )
+        return end - start + 1
+    try:
+        page = int(text)
+    except ValueError as exc:
+        raise ValueError(f"Invalid page range: {page_range!r}") from exc
+    if page < 1 or page > total_pages:
+        raise ValueError(
+            f"Page {page} is out of range (document has {total_pages} pages)."
+        )
+    return 1
+def pdf_page_count(file_path: str | Path) -> int:
+    import pymupdf
+    path = Path(file_path)
+    with pymupdf.open(path) as doc:
+        return int(doc.page_count)
+def validate_pdf_page_limit(
+    file_path: str | Path,
+    *,
+    page_range: str = "all",
+    max_pages: int | None = None,
+) -> None:
+    """Reject PDFs whose selected page count exceeds ``max_pages_limit()``."""
+    path = Path(file_path)
+    if path.suffix.lower() != ".pdf":
+        return
+    limit = max_pages if max_pages is not None else max_pages_limit()
+    try:
+        total = pdf_page_count(path)
+    except Exception as exc:
+        raise ValueError(f"Could not read PDF page count for {path.name}.") from exc
+    count = pages_to_process_count(page_range, total)
+    if count > limit:
+        scope = page_range.strip() or "all"
+        raise ValueError(
+            f"Number of pages to process ({count}) exceeds the maximum allowed "
+            f"({limit}). Submit a smaller document or narrow the page range "
+            f"({scope!r})."
+        )
+@dataclass(frozen=True)
+class RedactionTaskSettings:
+    ocr_method: str = DEFAULT_OCR_METHOD
+    pii_method: str = DEFAULT_PII_METHOD
+    encourage_vlm_faces: bool = False if is_hf_space_profile() else True
+    encourage_vlm_signatures: bool = False if is_hf_space_profile() else True
+    @classmethod
+    def hf_space_defaults(cls) -> RedactionTaskSettings:
+        return cls(
+            ocr_method=HF_DEFAULT_OCR,
+            pii_method=HF_DEFAULT_PII,
+            encourage_vlm_faces=False,
+            encourage_vlm_signatures=False,
+        )
+    @classmethod
+    def from_ui(
+        cls,
+        ocr_method: str,
+        pii_method: str,
+        encourage_vlm_faces: bool,
+        encourage_vlm_signatures: bool,
+    ) -> RedactionTaskSettings:
+        ocr = (ocr_method or DEFAULT_OCR_METHOD).strip()
+        pii = (pii_method or DEFAULT_PII_METHOD).strip()
+        if ocr not in OCR_METHOD_CHOICES:
+            ocr = DEFAULT_OCR_METHOD
+        if pii not in PII_METHOD_CHOICES:
+            pii = DEFAULT_PII_METHOD
+        return cls(
+            ocr_method=ocr,
+            pii_method=pii,
+            encourage_vlm_faces=bool(encourage_vlm_faces),
+            encourage_vlm_signatures=bool(encourage_vlm_signatures),
+        )
+def doc_redaction_gradio_url() -> str:
+    """
+    Base URL of the doc_redaction Gradio app used for ``/doc_redact`` and review APIs.
+    Set ``DOC_REDACTION_GRADIO_URL`` in ``config/pi_agent.env`` (or the process environment).
+    Loaded via ``tools.config`` when the Pi app starts (default local: ``http://127.0.0.1:7860``).
+    """
+    from tools.config import DOC_REDACTION_GRADIO_URL
+    return str(DOC_REDACTION_GRADIO_URL).strip().rstrip("/")
+def _default_gradio_url() -> str:
+    """Back-compat alias for prompt template substitution."""
+    return doc_redaction_gradio_url()
+def _default_vlm_base_url() -> str:
+    return os.environ.get("PI_VLM_BASE_URL", "http://llama-inference:8080")
+def _default_vlm_model() -> str:
+    return os.environ.get("PI_VLM_MODEL", "unsloth/Qwen3.6-27B-MTP-GGUF")
+def load_template(path: Path | None = None) -> str:
+    template_file = path or partnership_template_path()
+    if not template_file.is_file():
+        raise FileNotFoundError(f"Prompt template not found: {template_file}")
+    return template_file.read_text(encoding="utf-8")
+def format_user_requirements(instructions: str) -> str:
+    lines: list[str] = []
+    for raw in instructions.strip().splitlines():
+        line = raw.strip()
+        if not line:
+            continue
+        if not line.startswith("-"):
+            line = f"- {line}"
+        lines.append(line)
+    return "\n".join(lines)
+def replace_user_requirements_section(template: str, instructions: str) -> str:
+    marker = "## User redaction requirements"
+    idx = template.find(marker)
+    formatted = format_user_requirements(instructions)
+    if idx == -1:
+        return f"{template.rstrip()}\n\n{marker} (authoritative for this task)\n\n{formatted}\n"
+    head = template[:idx]
+    return f"{head}{marker} (authoritative for this task)\n\n{formatted}\n"
+def _is_textract_ocr_method(ocr_method: str) -> bool:
+    lowered = ocr_method.casefold()
+    return "textract" in lowered or lowered in {"textract", "aws textract"}
+def build_vlm_faces_guidance(encourage: bool) -> str:
+    if is_hf_space_profile():
+        return (
+            "Pass 2 VLM and CUSTOM_VLM_FACES are not available on this deployment. "
+            "Do not pass CUSTOM_VLM_FACES or request face detection."
+        )
+    if encourage:
+        return (
+            "If the user asks to redact faces, then pass the entity CUSTOM_VLM_FACES "
+            "in the initial redaction entity selection"
+        )
+    return (
+        "Do not pass CUSTOM_VLM_FACES in the initial redaction entity list unless "
+        "the user explicitly asks to redact faces"
+    )
+def build_vlm_signature_guidance(encourage: bool, ocr_method: str) -> str:
+    if is_hf_space_profile():
+        return (
+            "Pass 2 VLM and CUSTOM_VLM_SIGNATURE are not available on this deployment. "
+            "Do not pass CUSTOM_VLM_SIGNATURE or request signature detection."
+        )
+    if encourage:
+        if _is_textract_ocr_method(ocr_method):
+            return (
+                "If the user asked to redact signatures, then pass the CUSTOM_VLM_SIGNATURE "
+                "entity in the initial redaction entity selection, unless the text extraction "
+                "option is AWS Textract, in which case the handwrite_signature_textbox parameter "
+                "for the doc_redact endpoint should include 'Extract signatures'"
+            )
+        return (
+            "If the user asked to redact signatures, then pass the CUSTOM_VLM_SIGNATURE "
+            "entity in the initial redaction entity selection"
+        )
+    return (
+        "Do not pass CUSTOM_VLM_SIGNATURE in the initial redaction entity list unless "
+        "the user explicitly asks to redact signatures"
+    )
+def build_remote_backend_guidance(
+    *,
+    gradio_url: str,
+    output_base: str,
+    workspace_root: str,
+) -> str:
+    if not is_hf_space_profile():
+        return ""
+    return (
+        f"- **Remote redaction backend:** the doc_redaction app runs at `{gradio_url}` "
+        "(private Hugging Face Space). Use **`gradio_client` only** — upload local files "
+        f"with `handle_file()` from `{workspace_root.rstrip('/')}/`. "
+        "**Do not** call `/agent/*` routes or use server-side paths from the redaction container.\n"
+        f"- Download all `/doc_redact` and `/review_apply` outputs via "
+        f"`{gradio_url.rstrip('/')}/gradio_api/file=…` with "
+        "`Authorization: Bearer $HF_TOKEN` into `{output_base}` (create subdirs as needed).\n"
+        "- Run **`verify_redaction_coverage`** locally on downloaded CSV/PDF paths in this "
+        "workspace (pandas/PyMuPDF), not via Agent API.\n"
+        "- **Pass 2 VLM is not available** — do not call a VLM endpoint or use "
+        "`CUSTOM_VLM_FACES` / `CUSTOM_VLM_SIGNATURE` entities.\n"
+        "- Helper module: `agent-redact/pi/remote_redaction.py` (`make_redaction_client`, "
+        "`download_gradio_files`)."
+    ).format(output_base=output_base.rstrip("/") + "/")
+def _resolve_and_validate_upload_path(upload_path: str | Path) -> Path:
+    if not isinstance(upload_path, (str, Path)):
+        raise ValueError("Uploaded file path has an invalid type.")
+    if not str(upload_path).strip():
+        raise ValueError("Uploaded file path is empty.")
+    root = upload_root()
+    raw_path = Path(upload_path).expanduser()
+    try:
+        source = raw_path.resolve(strict=True)
+    except FileNotFoundError as exc:
+        raise FileNotFoundError(f"Uploaded file not found: {raw_path}") from exc
+    try:
+        source.relative_to(root)
+    except ValueError as exc:
+        raise ValueError(
+            f"Uploaded file path resolves outside allowed upload root: {source}"
+        ) from exc
+    if not source.is_file():
+        raise FileNotFoundError(f"Uploaded file not found: {source}")
+    if source.is_symlink():
+        raise ValueError(f"Symlink uploads are not allowed: {source}")
+    return source
+def _resolve_and_validate_workspace_dir(workspace_dir: Path | None) -> Path:
+    if workspace_dir is not None and not isinstance(workspace_dir, Path):
+        raise ValueError("Workspace path has an invalid type.")
+    base_root = _workspace_root().resolve()
+    candidate = (
+        workspace_dir if workspace_dir is not None else _workspace_root()
+    ).resolve()
+    try:
+        candidate.relative_to(base_root)
+    except ValueError as exc:
+        raise ValueError(
+            f"Workspace path resolves outside allowed workspace root: {candidate}"
+        ) from exc
+    return candidate
+def copy_upload_to_workspace(
+    upload_path: str | Path,
+    *,
+    workspace_dir: Path | None = None,
+) -> tuple[Path, str | None]:
+    """
+    Copy upload into the session workspace.
+    Returns ``(destination_path, original_basename)`` where ``original_basename`` is
+    set only when the file was renamed for path safety.
+    """
+    source = _resolve_and_validate_upload_path(upload_path)
+    if not source.is_file():
+        raise FileNotFoundError(f"Uploaded file not found: {source}")
+    workspace_root = _resolve_and_validate_workspace_dir(workspace_dir)
+    workspace_root.mkdir(parents=True, exist_ok=True)
+    _original_name, safe_name, renamed = _workspace_filename_from_upload(source.name)
+    dest = (workspace_root / safe_name).resolve()
+    try:
+        dest.relative_to(workspace_root)
+    except ValueError as exc:
+        raise ValueError(f"Destination path is outside workspace: {dest}") from exc
+    if source != dest:
+        # copyfile only: copy2/copystat raises EPERM when overwriting on Docker Desktop bind mounts.
+        shutil.copyfile(source, dest)
+    return dest, (_original_name if renamed else None)
+def build_redaction_prompt(
+    file_name: str,
+    user_instructions: str,
+    *,
+    page_range: str = "all",
+    template: str | None = None,
+    settings: RedactionTaskSettings | None = None,
+    workspace_dir: Path | None = None,
+) -> str:
+    if not file_name.strip():
+        raise ValueError("A document file name is required.")
+    if not user_instructions.strip():
+        raise ValueError("Redaction requirements are required (use bullet points).")
+    task_settings = settings or RedactionTaskSettings()
+    workspace_root = (workspace_dir or _workspace_root()).resolve()
+    file_name = Path(file_name).name
+    input_path = f"{workspace_root.as_posix().rstrip('/')}/{file_name}"
+    output_base = f"{workspace_root.as_posix().rstrip('/')}/redact/{file_name}/"
+    text = template if template is not None else load_template()
+    remote_guidance = build_remote_backend_guidance(
+        gradio_url=_default_gradio_url(),
+        output_base=output_base,
+        workspace_root=workspace_root.as_posix(),
+    )
+    replacements = {
+        "{FILE_NAME}": file_name,
+        "{INPUT_PATH}": input_path,
+        "{OUTPUT_BASE}": output_base,
+        "{GRADIO_URL}": _default_gradio_url(),
+        "{PAGE_RANGE}": page_range.strip() or "all",
+        "{VLM_BASE_URL}": _default_vlm_base_url(),
+        "{VLM_MODEL}": _default_vlm_model(),
+        "{DEFAULT_OCR_METHOD}": task_settings.ocr_method,
+        "{DEFAULT_PII_METHOD}": task_settings.pii_method,
+        "{VLM_FACES_GUIDANCE}": build_vlm_faces_guidance(
+            task_settings.encourage_vlm_faces
+        ),
+        "{VLM_SIGNATURE_GUIDANCE}": build_vlm_signature_guidance(
+            task_settings.encourage_vlm_signatures,
+            task_settings.ocr_method,
+        ),
+    }
+    if remote_guidance:
+        replacements["{REMOTE_BACKEND_GUIDANCE}"] = remote_guidance
+    else:
+        text = text.replace("- {REMOTE_BACKEND_GUIDANCE}\n", "")
+    for key, value in replacements.items():
+        text = text.replace(key, value)
+    return replace_user_requirements_section(text, user_instructions)
+def prepare_redaction_task(
+    upload_path: str | Path | None,
+    user_instructions: str,
+    *,
+    page_range: str = "all",
+    settings: RedactionTaskSettings | None = None,
+    workspace_dir: Path | None = None,
+) -> tuple[str, str, str | None]:
+    """
+    Copy upload into workspace and return ``(file_name, full_prompt, renamed_from)``.
+    ``renamed_from`` is the original upload basename when it was adjusted for path
+    safety; otherwise ``None``.
+    """
+    if upload_path is None:
+        raise ValueError("Please upload a document.")
+    root = _resolve_and_validate_workspace_dir(workspace_dir)
+    validate_pdf_page_limit(upload_path, page_range=page_range)
+    dest, renamed_from = copy_upload_to_workspace(upload_path, workspace_dir=root)
+    prompt = build_redaction_prompt(
+        dest.name,
+        user_instructions,
+        page_range=page_range,
+        settings=settings,
+        workspace_dir=root,
+    )
+    return dest.name, prompt, renamed_from

agent-redact/pi/remote_redaction.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""Gradio client helpers for remote doc_redaction HF Space backends."""
+from __future__ import annotations
+import os
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+import httpx
+from gradio_client import Client
+# Default timeouts (seconds) handed to httpx.Timeout by httpx_timeout().
+DEFAULT_CONNECT_TIMEOUT = 120.0
+# The read budget is much larger than the connect budget.
+DEFAULT_READ_TIMEOUT = 1800.0
+def redaction_base_url() -> str:
+    from redaction_prompt import doc_redaction_gradio_url
+    return doc_redaction_gradio_url()
+def redaction_hf_token() -> str | None:
+    token = os.environ.get("HF_TOKEN") or os.environ.get("DOC_REDACTION_HF_TOKEN")
+    return token.strip() if token and token.strip() else None
+def httpx_timeout(
+    *,
+    connect: float = DEFAULT_CONNECT_TIMEOUT,
+    read: float = DEFAULT_READ_TIMEOUT,
+) -> httpx.Timeout:
+    return httpx.Timeout(connect=connect, read=read, write=connect, pool=connect)
+def make_redaction_client(
+    base_url: str | None = None,
+    hf_token: str | None = None,
+) -> Client:
+    """Return a gradio_client for the remote doc_redaction Space."""
+    url = (base_url or redaction_base_url()).rstrip("/")
+    token = hf_token if hf_token is not None else redaction_hf_token()
+    kwargs = {"httpx_kwargs": {"timeout": httpx_timeout()}}
+    if token:
+        return Client(url, hf_token=token, **kwargs)
+    return Client(url, **kwargs)
+def _collect_paths(value: Any, out: list[str]) -> None:
+    if isinstance(value, str) and value.startswith("/"):
+        out.append(value)
+    elif isinstance(value, dict):
+        path = value.get("path")
+        if isinstance(path, str) and path.startswith("/"):
+            out.append(path)
+        for item in value.values():
+            _collect_paths(item, out)
+    elif isinstance(value, (list, tuple)):
+        for item in value:
+            _collect_paths(item, out)
+def extract_server_paths(result: Any) -> list[str]:
+    """Walk a gradio_client predict result and collect server file paths."""
+    paths: list[str] = []
+    _collect_paths(result, paths)
+    seen: set[str] = set()
+    ordered: list[str] = []
+    for path in paths:
+        if path not in seen:
+            seen.add(path)
+            ordered.append(path)
+    return ordered
+def download_gradio_files(
+    paths: list[str],
+    dest_dir: str | Path,
+    *,
+    base_url: str | None = None,
+    hf_token: str | None = None,
+) -> list[Path]:
+    """Download server paths from a Gradio Space into dest_dir."""
+    url = (base_url or redaction_base_url()).rstrip("/")
+    token = hf_token if hf_token is not None else redaction_hf_token()
+    headers: dict[str, str] = {}
+    if token:
+        headers["Authorization"] = f"Bearer {token.strip()}"
+    dest = Path(dest_dir)
+    dest.mkdir(parents=True, exist_ok=True)
+    downloaded: list[Path] = []
+    with httpx.Client(timeout=httpx_timeout(), headers=headers) as http:
+        for path in paths:
+            if not isinstance(path, str) or not path.startswith("/"):
+                continue
+            file_url = f"{url}/gradio_api/file={quote(path, safe='')}"
+            local_path = dest / Path(path).name
+            response = http.get(file_url)
+            response.raise_for_status()
+            local_path.write_bytes(response.content)
+            downloaded.append(local_path)
+    return downloaded

agent-redact/pi/session_logs.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""Resolve Pi agent session JSONL logs for Gradio download and usage-log persistence."""
+from __future__ import annotations
+import shutil
+from pathlib import Path
+from pi_agent_config import ensure_session_dir
+from pi_rpc_client import PiRpcClient, PiRpcError
+from tools.aws_functions import upload_log_file_to_s3
+from tools.config import (
+    RUN_AWS_FUNCTIONS,
+    S3_USAGE_LOGS_FOLDER,
+    SAVE_LOGS_TO_CSV,
+    USAGE_LOGS_FOLDER,
+)
+def _session_dir_root() -> Path:
+    return ensure_session_dir()
+def pi_session_file_from_client(client: PiRpcClient | None) -> Path | None:
+    """Return the active Pi session JSONL path from RPC state, if readable."""
+    if client is None or not client.running:
+        return None
+    try:
+        state = client.get_state()
+    except PiRpcError:
+        return None
+    raw = state.get("sessionFile")
+    if not raw or str(raw).strip() in ("", "—"):
+        return None
+    path = Path(str(raw)).expanduser()
+    if not path.is_file():
+        return None
+    resolved = path.resolve(strict=False)
+    try:
+        resolved.relative_to(_session_dir_root())
+    except ValueError:
+        return None
+    return resolved
+def _usage_log_archive_name(source: Path, session_hash: str = "") -> str:
+    if session_hash and str(session_hash).strip():
+        return f"{str(session_hash).strip()}_{source.name}"
+    return source.name
+def copy_session_log_to_usage_folder(
+    source: Path,
+    *,
+    session_hash: str = "",
+) -> Path | None:
+    """Copy a Pi session JSONL into ``USAGE_LOGS_FOLDER`` (beside ``usage_log.csv``)."""
+    if not SAVE_LOGS_TO_CSV:
+        return None
+    usage_dir = Path(USAGE_LOGS_FOLDER)
+    usage_dir.mkdir(parents=True, exist_ok=True)
+    dest = usage_dir / _usage_log_archive_name(source, session_hash)
+    try:
+        shutil.copy2(source, dest)
+    except OSError:
+        return None
+    return dest.resolve()
+def collect_session_log_download(client: PiRpcClient | None) -> str | None:
+    """Path suitable for ``gr.File`` download, or ``None`` if no log yet."""
+    path = pi_session_file_from_client(client)
+    if path is None:
+        return None
+    return str(path)
+def persist_session_log(
+    client: PiRpcClient | None,
+    *,
+    session_hash: str = "",
+) -> Path | None:
+    """
+    Archive the active Pi session JSONL when local usage logging is enabled.
+    Copies into ``USAGE_LOGS_FOLDER`` when ``SAVE_LOGS_TO_CSV`` is true, then
+    uploads that copy to ``S3_USAGE_LOGS_FOLDER`` when ``RUN_AWS_FUNCTIONS`` is true.
+    """
+    if not SAVE_LOGS_TO_CSV:
+        return None
+    source = pi_session_file_from_client(client)
+    if source is None:
+        return None
+    archived = copy_session_log_to_usage_folder(source, session_hash=session_hash)
+    if archived is None:
+        return None
+    if RUN_AWS_FUNCTIONS:
+        upload_log_file_to_s3(str(archived), S3_USAGE_LOGS_FOLDER)
+    return archived
+def export_session_log_to_s3(client: PiRpcClient | None) -> None:
+    """Back-compat: persist session log (local archive + optional S3)."""
+    persist_session_log(client)
+def gradio_session_log_allowed_paths() -> list[str]:
+    """Directories Gradio must allow to serve Pi session JSONL files."""
+    paths: list[str] = []
+    try:
+        paths.append(str(_session_dir_root()))
+    except OSError:
+        pass
+    if SAVE_LOGS_TO_CSV:
+        try:
+            paths.append(str(Path(USAGE_LOGS_FOLDER).resolve()))
+        except OSError:
+            pass
+    return paths

agent-redact/pi/session_workspace.py ADDED Viewed

	@@ -0,0 +1,192 @@

+"""Per-session workspace paths for the Pi Gradio UI (mirrors main app session folders)."""
+from __future__ import annotations
+import os
+import re
+import sys
+from pathlib import Path
+import gradio as gr
+# Directory two levels above this file's folder.
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+# Put that directory on sys.path (once) so imports rooted there can be resolved.
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+# Matches runs of characters other than ASCII letters, digits and "_@.+-".
+_SESSION_ID_RE = re.compile(r"[^a-zA-Z0-9_@.+-]+")
+def workspace_base_dir() -> Path:
+    """Shared Pi workspace root (see ``bootstrap_pi_config.ensure_pi_workspace_dir``)."""
+    raw = (os.environ.get("PI_WORKSPACE_DIR") or "").strip()
+    if raw:
+        path = Path(raw)
+    else:
+        from bootstrap_pi_config import ensure_pi_workspace_dir
+        return Path(ensure_pi_workspace_dir(_REPO_ROOT))
+    path.mkdir(parents=True, exist_ok=True)
+    return path.resolve()
+def _session_output_folder_enabled() -> bool:
+    """Read at call time so ``pi_agent.env`` / dotenv apply before first use."""
+    raw = (os.environ.get("SESSION_OUTPUT_FOLDER") or "").strip().lower()
+    return raw in {"1", "true", "yes", "on"}
+def session_workspace_enabled() -> bool:
+    """
+    When true, each Gradio session uses ``{PI_WORKSPACE_DIR}/{session_hash}/``.
+    Controlled by ``PI_SESSION_WORKSPACE`` in ``config/pi_agent.env`` (default on when unset).
+    Set ``PI_SESSION_WORKSPACE=false`` for a single shared workspace root.
+    """
+    raw = os.environ.get("PI_SESSION_WORKSPACE", "").strip().lower()
+    if raw in {"0", "false", "no", "off"}:
+        return False
+    if raw in {"1", "true", "yes", "on"}:
+        return True
+    if _session_output_folder_enabled():
+        return True
+    return True
+def workspace_base_dir_resolved() -> Path:
+    """Current workspace root (never cached at import)."""
+    return workspace_base_dir()
+def sanitize_session_id(raw: str) -> str:
+    cleaned = _SESSION_ID_RE.sub("_", (raw or "").strip())[:128].strip("_")
+    return cleaned or "default"
+def resolve_session_hash(request: gr.Request | None) -> str:
+    """
+    Resolve Gradio session id for per-user workspace folders.
+    Prefers ``request.session_hash`` (local Pi UI). Falls back to the main app's
+    Cognito/OIDC resolver when a deployment header is configured.
+    """
+    if request is None:
+        return "default"
+    gradio_hash = getattr(request, "session_hash", None)
+    if gradio_hash is not None and str(gradio_hash).strip():
+        return sanitize_session_id(str(gradio_hash))
+    from tools.gradio_platform import resolve_session_identity
+    try:
+        identity = resolve_session_identity(request)
+    except ValueError:
+        return "default"
+    return sanitize_session_id(str(identity))
+def effective_session_hash(
+    session_hash: str,
+    request: gr.Request | None = None,
+) -> str:
+    """
+    Use ``session_hash_state`` when set; otherwise resolve from the active request.
+    Gradio ``demo.load`` may run before ``request.session_hash`` exists, so handlers
+    should pass ``request`` and call this on each event.
+    """
+    stored = (session_hash or "").strip()
+    if stored and stored != "default":
+        return sanitize_session_id(stored)
+    if request is not None:
+        resolved = resolve_session_hash(request)
+        if resolved and resolved != "default":
+            return resolved
+    if stored:
+        return sanitize_session_id(stored)
+    return "default"
+def session_workspace_status_markdown(session_hash: str) -> str:
+    """Markdown for the workspace panel."""
+    workspace = ensure_session_workspace(session_hash)
+    path = workspace.as_posix()
+    if session_workspace_enabled():
+        return (
+            f"**Session id:** `{session_hash}`  \n" f"**Your workspace:** `{path}/`  \n"
+        )
+    return f"**Workspace:** `{path}/`"
+def prepare_session_workspace(
+    session_hash: str,
+    request: gr.Request | None = None,
+) -> tuple[str, Path, str]:
+    """
+    Resolve session id, create ``{PI_WORKSPACE_DIR}/{hash}/``, return status text.
+    Call at the start of redaction (and on page load) so the folder always exists.
+    """
+    effective = effective_session_hash(session_hash, request)
+    workspace = ensure_session_workspace(effective)
+    return effective, workspace, session_workspace_status_markdown(effective)
+def session_s3_outputs_prefix(session_hash: str) -> str:
+    """Session-scoped S3 output prefix (shared env vars with main app)."""
+    from tools.gradio_platform import build_s3_outputs_prefix
+    return build_s3_outputs_prefix(
+        session_hash,
+        session_scoped=session_workspace_enabled(),
+    )
+def session_workspace_dir(session_hash: str) -> Path:
+    base = workspace_base_dir().resolve()
+    if not session_workspace_enabled():
+        return base
+    safe_id = sanitize_session_id(session_hash)
+    candidate = (base / safe_id).resolve()
+    try:
+        candidate.relative_to(base)
+    except ValueError:
+        return (base / "default").resolve()
+    return candidate
+def ensure_session_workspace(session_hash: str) -> Path:
+    workspace = session_workspace_dir(session_hash)
+    workspace.mkdir(parents=True, exist_ok=True)
+    return workspace
+def init_session_workspace(
+    request: gr.Request,
+) -> tuple[str, gr.FileExplorer, str, str]:
+    """
+    App-load handler: create the session subfolder and scope the file explorer.
+    Returns ``(session_hash, file_explorer_update, status_markdown, s3_output_prefix)``.
+    """
+    session_hash, workspace, status = prepare_session_workspace("", request)
+    s3_prefix = session_s3_outputs_prefix(session_hash)
+    return (
+        session_hash,
+        gr.FileExplorer(root_dir=workspace.as_posix()),
+        status,
+        s3_prefix,
+    )
+def workspace_context_prefix(session_hash: str) -> str:
+    """Prefix Pi prompts so the agent uses the session workspace."""
+    if not session_workspace_enabled() or not session_hash.strip():
+        return ""
+    root = session_workspace_dir(session_hash).as_posix().rstrip("/")
+    return (
+        f"**Session workspace (mandatory):** all uploads, downloads, and redaction "
+        f"artifacts for this user must live under `{root}/`. "
+        f"Use `{root}/redact/<document>/` for per-document output trees. "
+        f"Do not write to `{root}/output_final_download/` (UI-managed download copies only). "
+        f"Do not read or write other session folders under `{workspace_base_dir().as_posix()}/`.\n\n"
+    )

agent-redact/pi/start.sh ADDED Viewed

	@@ -0,0 +1,26 @@

+#!/usr/bin/env bash
+# Start Gradio Pi chat UI in the background; keep container alive for `docker compose exec pi-agent pi`.
+# Stop on command failure, on use of an unset variable, and on a failure anywhere in a pipeline.
+set -euo pipefail
+# Each export below keeps a value already present in the environment and only fills in a default.
+export HOME="${HOME:-/home/node}"
+export PI_WORKDIR="${PI_WORKDIR:-/workspace/doc_redaction}"
+# Make both the repo root and agent-redact/pi importable, ahead of any existing PYTHONPATH.
+export PYTHONPATH="${PI_WORKDIR}:${PI_WORKDIR}/agent-redact/pi:${PYTHONPATH:-}"
+cd "$PI_WORKDIR"
+export APP_TYPE="${APP_TYPE:-pi}"
+export APP_CONFIG_PATH="${APP_CONFIG_PATH:-$PI_WORKDIR/config/pi_agent.env}"
+# Create the workspace directory (PI_WORKSPACE_DIR, or /home/user/app/workspace when unset).
+mkdir -p "${PI_WORKSPACE_DIR:-/home/user/app/workspace}"
+# Run the Pi agent config script before any UI process starts.
+python3 agent-redact/pi/pi_agent_config.py
+# Only the exact string "True" selects uvicorn. `exec` replaces this shell, so nothing
+# after it (including the `wait` at the end) runs in that mode.
+if [ "${RUN_FASTAPI:-False}" = "True" ]; then
+  exec uvicorn gradio_app:app \
+    --app-dir agent-redact/pi \
+    --host "${GRADIO_SERVER_NAME:-0.0.0.0}" \
+    --port "${PI_GRADIO_PORT:-${GRADIO_SERVER_PORT:-7862}}"
+else
+  # Otherwise launch the Gradio app as a background job of this shell.
+  python3 agent-redact/pi/gradio_app.py &
+fi
+# Block until a background job exits, which keeps the script in the foreground.
+wait -n

agent-redact/requirements_pi_agent.txt ADDED Viewed

	@@ -0,0 +1,35 @@

+# Python stack for the pi-agent Docker image (orchestration + Pi Gradio UI).
+#
+# Excludes spaCy, Presidio, and OCR stacks — heavy redaction runs in redaction-app-llama.
+# Includes full Gradio for agent-redact/pi/gradio_app.py (chat frontend over Pi RPC mode).
+#
+# Version caps align with requirements_lightweight.txt where packages overlap.
+# --- Gradio UI + API client ---
+gradio>=6.9.0,<=6.10.0
+gradio-client>=2.0.0,<=2.4.0
+httpx<=0.28.1
+requests<=2.34.2
+starlette>=0.52.1
+# --- Config ---
+python-dotenv<=1.2.2
+# --- CSV / tabular review (skills, page-review merge) ---
+numpy<=2.4.4
+pandas<=2.3.3
+openpyxl<=3.1.5
+# --- PDF helpers (verify_redaction_coverage, preview scripts) ---
+pymupdf<=1.27.1
+# --- General utilities ---
+tabulate<=0.10.0
+rapidfuzz<=3.14.5
+defusedxml<=0.7.1
+# --- Shared platform features (logging, Cognito, S3 via tools/) ---
+boto3<=1.42.61
+bleach<=6.3.0
+fastapi>=0.115.0
+uvicorn>=0.34.0

agent_routes.py ADDED Viewed

	@@ -0,0 +1,1167 @@

+"""
+FastAPI routes for programmatic / agent callers.
+HTTP paths align with Gradio ``api_name`` values in app.py. See GET /agent/operations
+for the full map. Uses cli_redact.main(direct_mode_args=...) where a CLI task exists.
+"""
+from __future__ import annotations
+import hmac
+import io
+import os
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from fastapi import APIRouter, Depends, Header, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field, field_validator
+from tools.config import (
+    AWS_LLM_PII_OPTION,
+    AWS_PII_OPTION,
+    INFERENCE_SERVER_PII_OPTION,
+    INPUT_FOLDER,
+    LOCAL_OCR_MODEL_OPTIONS,
+    LOCAL_PII_OPTION,
+    LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+    OUTPUT_FOLDER,
+)
+from tools.secure_path_utils import validate_path_safety
+# Router holding the agent endpoints defined in this module.
+router = APIRouter(tags=["Agent"])
+# Directory containing this file; relative request paths are anchored here.
+REPO_ROOT = Path(__file__).resolve().parent
+# Upper bound, in characters, enforced on the ``instruction`` request field.
+_MAX_INSTRUCTION_LEN = 16_000
+# NOTE: Paths from request bodies are untrusted. Avoid Path.resolve() on untrusted
+# input (CodeQL py/path-injection); instead normalize via os.path and enforce
+# containment under trusted roots.
+# Mirrors app.py api_name values (Gradio).
+GRADIO_API_NAMES: tuple[str, ...] = (
+    "redact_document",
+    "load_and_prepare_documents_or_data",
+    "apply_review_redactions",
+    "review_apply",
+    "pdf_summarise",
+    "tabular_redact",
+    "word_level_ocr_text_search",
+    "redact_data",
+    "find_duplicate_pages",
+    "find_duplicate_tabular",
+    "summarise_document",
+    "combine_review_csvs",
+    "combine_review_pdfs",
+    "export_review_redaction_overlay",
+    "export_review_page_ocr_visualisation",
+    "verify_redaction_coverage",
+)
+def _allowed_path_roots() -> list[Path]:
+    # Return roots without resolving. These are trusted config values, but avoiding
+    # Path.resolve() keeps CodeQL happy and matches our "no resolve on untrusted"
+    # approach elsewhere.
+    roots = [REPO_ROOT]
+    for folder in (INPUT_FOLDER, OUTPUT_FOLDER):
+        if folder:
+            roots.append(Path(str(folder)))
+    return roots
+def _sanitize_untrusted_path_input(path_str: str) -> str:
+    """Basic raw-input validation before any path normalization."""
+    if not isinstance(path_str, str):
+        raise HTTPException(status_code=400, detail="Path must be a string.")
+    cleaned = path_str.strip()
+    if not cleaned:
+        raise HTTPException(status_code=400, detail="Path must not be empty.")
+    if "\x00" in cleaned:
+        raise HTTPException(status_code=400, detail="Path contains invalid null byte.")
+    return cleaned
+def _normalize_untrusted_path_to_abs(path_str: str) -> str:
+    """
+    Expand ~, then normalize to an absolute path.
+    Relative paths are interpreted relative to REPO_ROOT (matching prior behaviour).
+    """
+    safe_input = _sanitize_untrusted_path_input(path_str)
+    expanded = os.path.expanduser(safe_input)
+    if os.path.isabs(expanded):
+        return os.path.normpath(os.path.abspath(expanded))
+    return os.path.normpath(os.path.abspath(os.path.join(str(REPO_ROOT), expanded)))
+def _must_be_under_allowed_roots(candidate_abs: str, original: str) -> None:
+    """Enforce candidate is contained under repo, INPUT_FOLDER, or OUTPUT_FOLDER."""
+    candidate_real = os.path.realpath(str(candidate_abs))
+    allowed_roots = [
+        os.path.realpath(os.path.abspath(str(p))) for p in _allowed_path_roots()
+    ]
+    for root in allowed_roots:
+        try:
+            common = os.path.commonpath([candidate_real, root])
+        except ValueError:
+            # Different drive on Windows or invalid path mix
+            continue
+        if common == root:
+            return
+    raise HTTPException(
+        status_code=403,
+        detail="Path must be under the app repo, INPUT_FOLDER, or OUTPUT_FOLDER",
+    )
+def _path_must_be_allowed_file(path_str: str) -> str:
+    """Resolve path, ensure it is under an allowed root and exists as a file."""
+    candidate_abs = _normalize_untrusted_path_to_abs(path_str)
+    candidate_real = os.path.realpath(candidate_abs)
+    # Validate both "safe path" patterns and containment under trusted roots.
+    _must_be_under_allowed_roots(candidate_real, path_str)
+    ok = any(
+        validate_path_safety(candidate_real, base_path=str(root))
+        for root in _allowed_path_roots()
+    )
+    if not ok:
+        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")
+    try:
+        candidate_path = Path(candidate_real)
+        if not candidate_path.is_file():
+            raise HTTPException(
+                status_code=400, detail=f"Not a file or missing: {candidate_real}"
+            )
+    except OSError:
+        raise HTTPException(
+            status_code=400, detail=f"Not a file or missing: {candidate_real}"
+        )
+    return candidate_real
+def _path_must_be_allowed_directory(path_str: str, *, must_exist: bool = True) -> str:
+    """
+    Normalize and validate a directory path under allowed roots.
+    By default the directory must already exist; callers can opt out (e.g. output_dir
+    that will be created later by the CLI).
+    """
+    candidate_abs = _normalize_untrusted_path_to_abs(path_str)
+    candidate_real = os.path.realpath(candidate_abs)
+    _must_be_under_allowed_roots(candidate_real, path_str)
+    ok = any(
+        validate_path_safety(candidate_real, base_path=str(root))
+        for root in _allowed_path_roots()
+    )
+    if not ok:
+        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")
+    if must_exist:
+        try:
+            if not Path(candidate_real).is_dir():
+                raise HTTPException(
+                    status_code=400, detail=f"Not a directory: {candidate_real}"
+                )
+        except OSError:
+            raise HTTPException(
+                status_code=400, detail=f"Not a directory: {candidate_real}"
+            )
+    return candidate_real
+def _optional_agent_api_key(x_agent_api_key: Optional[str] = Header(None)) -> None:
+    expected = os.environ.get("AGENT_API_KEY", "").strip()
+    if not expected:
+        return
+    if not x_agent_api_key or x_agent_api_key.strip() != expected:
+        raise HTTPException(
+            status_code=401,
+            detail="Set header X-Agent-API-Key to match AGENT_API_KEY environment variable",
+        )
+# Request body for the redact endpoints; _merge_redact_direct_mode turns it into the
+# argument dict handed to cli_redact.main(direct_mode_args=...).
+class AgentRedactDocumentRequest(BaseModel):
+    """Parity with Gradio api_name ``redact_document``."""
+    # At least one path is required; each one is validated against the allowed roots
+    # before the CLI sees it.
+    input_files: list[str] = Field(
+        ...,
+        min_length=1,
+        description="Paths to input files (PDF, images, or tabular/Word for anonymisation)",
+    )
+    # Stored as ``custom_llm_instructions``; length is capped by _cap_instruction below.
+    instruction: Optional[str] = Field(
+        None,
+        description="Optional instructions for LLM-based PII detection (custom_llm_instructions)",
+    )
+    # Optional directories; when omitted the CLI defaults are kept.
+    output_dir: Optional[str] = None
+    input_dir: Optional[str] = None
+    ocr_method: Optional[str] = Field(
+        None,
+        description=(
+            "High-level OCR/text mode. Accepted values: 'Local OCR', "
+            "'AWS Textract', 'Local text'. To choose a specific local OCR engine "
+            "(e.g. paddle/tesseract/vlm), set "
+            "overrides.chosen_local_ocr_model."
+        ),
+    )
+    pii_detector: Optional[str] = Field(
+        None,
+        description=(
+            "PII detection method. Recommended configured labels: "
+            f"'{LOCAL_PII_OPTION}', '{AWS_PII_OPTION}', '{AWS_LLM_PII_OPTION}', "
+            f"'{INFERENCE_SERVER_PII_OPTION}', '{LOCAL_TRANSFORMERS_LLM_PII_OPTION}', "
+            "'None'."
+        ),
+    )
+    # Keys are checked against the CLI default-argument names when the request is merged.
+    overrides: Optional[dict[str, Any]] = Field(
+        None,
+        description=(
+            "Optional CLI flag overrides; keys must match argparse destination names. "
+            "For local OCR model selection, set 'chosen_local_ocr_model' "
+            f"(allowed models depend on deployment; configured options: {LOCAL_OCR_MODEL_OPTIONS})."
+        ),
+    )
+    # Example payload attached to the generated JSON schema.
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "input_files": [
+                        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
+                    ],
+                    "instruction": "Do not redact the university name.",
+                    "ocr_method": "Local OCR",
+                    "pii_detector": LOCAL_PII_OPTION,
+                    "overrides": {"chosen_local_ocr_model": "paddle"},
+                }
+            ]
+        }
+    }
+    # Rejects instructions longer than _MAX_INSTRUCTION_LEN characters; None passes through.
+    @field_validator("instruction")
+    @classmethod
+    def _cap_instruction(cls, v: Optional[str]) -> Optional[str]:
+        if v is None:
+            return v
+        if len(v) > _MAX_INSTRUCTION_LEN:
+            raise ValueError(f"instruction exceeds {_MAX_INSTRUCTION_LEN} characters")
+        return v
+# Adds no fields of its own; it exists so /redact_data has a distinct request model.
+class AgentRedactDataRequest(AgentRedactDocumentRequest):
+    """Parity with Gradio api_name ``redact_data``; same CLI task as redact_document."""
+# Response returned by the CLI-backed endpoints (built in _run_cli_main).
+class AgentTaskResponse(BaseModel):
+    # "completed" when the CLI call returned without raising.
+    status: str
+    # Which Gradio api_name the called endpoint mirrors.
+    gradio_api_name: str
+    # task / output_dir / input_dir echo the merged CLI arguments as strings.
+    task: str
+    output_dir: str
+    input_dir: str
+    message: str
+    # Tail of the captured console output (at most the last 8000 characters).
+    log_excerpt: Optional[str] = None
+    # NOTE(review): not populated by _run_cli_main in this file - confirm intended use.
+    output_paths: Optional[list[str]] = None
+# Request body for the redaction-coverage check. The two CSV paths are required;
+# everything else is optional tuning.
+class AgentVerifyRedactionRequest(BaseModel):
+    review_csv_path: str = Field(..., description="Path to *_review_file.csv")
+    ocr_words_csv_path: str = Field(
+        ..., description="Path to *_ocr_results_with_words_*.csv from the same run"
+    )
+    must_redact: Optional[List[str]] = Field(
+        None,
+        description="Regex patterns for terms that must be covered by review boxes.",
+    )
+    must_not_redact: Optional[List[str]] = Field(
+        None,
+        description="Regex patterns for terms that must not appear in review rows.",
+    )
+    redacted_pdf_path: Optional[str] = Field(
+        None, description="Optional applied *_redacted.pdf for text-layer leak checks."
+    )
+    # Must be at least 1 when given.
+    total_pages: Optional[int] = Field(None, ge=1)
+    # Constrained to the range 1..32, default 3.
+    min_word_length: int = Field(3, ge=1, le=32)
+    sample_pixels: bool = Field(
+        False,
+        description="Sample pixel darkness at box centres on redacted PDF (requires redacted_pdf_path).",
+    )
+    auto_prune_suspicious: bool = Field(
+        False,
+        description="Remove prunable suspicious short/OCR-fragment rows and write pruned CSV.",
+    )
+    pruned_output_path: Optional[str] = Field(
+        None,
+        description="Output path for pruned CSV when auto_prune_suspicious is true.",
+    )
+# Response model for the redaction-coverage check.
+class AgentVerifyRedactionResponse(BaseModel):
+    status: str
+    # Fixed default naming the Gradio api_name this response mirrors.
+    gradio_api_name: str = "verify_redaction_coverage"
+    # Three pass/fail flags; their exact criteria are defined by the code that fills them.
+    coverage_pass: bool
+    coverage_pass_strict: bool
+    coverage_pass_with_cleanup: bool
+    # Optional pruning outputs; None unless provided by the handler.
+    pruned_csv_path: Optional[str] = None
+    prune_log: Optional[Dict[str, Any]] = None
+    # Free-form report payload.
+    report: Dict[str, Any]
+# Request body for searching word-level OCR output.
+class AgentWordLevelOcrSearchRequest(BaseModel):
+    ocr_words_csv_path: str = Field(
+        ..., description="Path to *_ocr_results_with_words_*.csv"
+    )
+    # Between 3 and 500 characters.
+    search_text: str = Field(..., min_length=3, max_length=500)
+    # Constrained to 0.0..1.0; the default is 1.0.
+    similarity_threshold: float = Field(1.0, ge=0.0, le=1.0)
+    use_regex: bool = False
+    review_csv_path: Optional[str] = Field(
+        None,
+        description="Optional *_review_file.csv to flag whether each hit is covered by a box.",
+    )
+# Response model for the word-level OCR search.
+class AgentWordLevelOcrSearchResponse(BaseModel):
+    status: str
+    # Fixed default naming the Gradio api_name this response mirrors.
+    gradio_api_name: str = "word_level_ocr_text_search"
+    # Free-form search result payload.
+    result: Dict[str, Any]
+def _merge_redact_direct_mode(body: AgentRedactDocumentRequest) -> dict[str, Any]:
+    from cli_redact import get_cli_default_args_dict
+    merged: dict[str, Any] = get_cli_default_args_dict()
+    merged["task"] = "redact"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.instruction is not None:
+        merged["custom_llm_instructions"] = body.instruction
+    if body.output_dir is not None:
+        # Output folders may not exist yet (CLI will create). Still constrain to allowed roots.
+        merged["output_dir"] = _path_must_be_allowed_directory(
+            body.output_dir, must_exist=False
+        )
+    if body.input_dir is not None:
+        # Input dir should exist if provided.
+        merged["input_dir"] = _path_must_be_allowed_directory(
+            body.input_dir, must_exist=True
+        )
+    if body.ocr_method is not None:
+        merged["ocr_method"] = body.ocr_method
+    if body.pii_detector is not None:
+        merged["pii_detector"] = body.pii_detector
+    if body.overrides:
+        allowed = set(merged.keys())
+        for key, value in body.overrides.items():
+            if key not in allowed:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Unknown override key '{key}'. Must be a known CLI argument name.",
+                )
+            merged[key] = value
+    return merged
+def _run_cli_main(direct: dict[str, Any], gradio_api_name: str) -> AgentTaskResponse:
+    from cli_redact import main as cli_main
+    buf = io.StringIO()
+    old_stdout = sys.stdout
+    try:
+        sys.stdout = buf
+        cli_main(direct_mode_args=direct)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        sys.stdout = old_stdout
+    log_excerpt = buf.getvalue()
+    if len(log_excerpt) > 8000:
+        log_excerpt = log_excerpt[-8000:]
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name=gradio_api_name,
+        task=str(direct.get("task", "")),
+        output_dir=str(direct.get("output_dir", "")),
+        input_dir=str(direct.get("input_dir", "")),
+        message="cli_redact.main finished; see log_excerpt for console output",
+        log_excerpt=log_excerpt or None,
+    )
+@router.post(
+    "/redact_document",
+    response_model=AgentTaskResponse,
+    summary="redact_document (Gradio api_name)",
+    description=(
+        "Matches Gradio ``api_name='redact_document'``. "
+        "``python cli_redact.py --task redact --input_file ...``. "
+        "Optional ``instruction`` maps to ``custom_llm_instructions``. "
+        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
+        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
+        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
+        "PII methods should use configured labels shown on the request schema."
+    ),
+)
+def post_redact_document(
+    body: AgentRedactDocumentRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    direct = _merge_redact_direct_mode(body)
+    return _run_cli_main(direct, "redact_document")
+@router.post(
+    "/redact_data",
+    response_model=AgentTaskResponse,
+    summary="redact_data (Gradio api_name)",
+    description=(
+        "Matches Gradio ``api_name='redact_data'``. Same CLI ``redact`` task as "
+        "/redact_document; use CSV/XLSX/DOCX paths for tabular/Word flows. "
+        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
+        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
+        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
+        "PII methods should use configured labels shown on the request schema."
+    ),
+)
+def post_redact_data(
+    body: AgentRedactDataRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    direct = _merge_redact_direct_mode(body)
+    return _run_cli_main(direct, "redact_data")
+@router.post(
+    "/tasks/redact",
+    response_model=AgentTaskResponse,
+    summary="Legacy: same as /redact_document",
+    description="Deprecated alias; prefer POST /agent/redact_document.",
+    deprecated=True,
+    include_in_schema=True,
+)
+def post_tasks_redact_legacy(
+    body: AgentRedactDocumentRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    direct = _merge_redact_direct_mode(body)
+    return _run_cli_main(direct, "redact_document")
+class AgentFindDuplicatePagesRequest(BaseModel):
+    # JSON body for POST /find_duplicate_pages. Fields left as None are not copied into
+    # the CLI arguments by the route, so the CLI defaults stay in effect for them.
+    # At least one path is required; the route checks each with _path_must_be_allowed_file.
+    input_files: list[str] = Field(..., min_length=1)
+    similarity_threshold: Optional[float] = None
+    min_word_count: Optional[int] = None
+    min_consecutive_pages: Optional[int] = None
+    # The route forwards these two booleans to the CLI as the strings "True" / "False".
+    greedy_match: Optional[bool] = None
+    combine_pages: Optional[bool] = None
+    # Extra CLI arguments; the route answers HTTP 400 for any key that is not already
+    # present in the CLI argument dict.
+    overrides: Optional[dict[str, Any]] = None
+@router.post(
+    "/find_duplicate_pages",
+    response_model=AgentTaskResponse,
+    summary="find_duplicate_pages (Gradio api_name)",
+    description="``cli_redact --task deduplicate --duplicate_type pages``.",
+)
+def post_find_duplicate_pages(
+    body: AgentFindDuplicatePagesRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from cli_redact import get_cli_default_args_dict
+    merged = get_cli_default_args_dict()
+    merged["task"] = "deduplicate"
+    merged["duplicate_type"] = "pages"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.similarity_threshold is not None:
+        merged["similarity_threshold"] = body.similarity_threshold
+    if body.min_word_count is not None:
+        merged["min_word_count"] = body.min_word_count
+    if body.min_consecutive_pages is not None:
+        merged["min_consecutive_pages"] = body.min_consecutive_pages
+    if body.greedy_match is not None:
+        merged["greedy_match"] = "True" if body.greedy_match else "False"
+    if body.combine_pages is not None:
+        merged["combine_pages"] = "True" if body.combine_pages else "False"
+    if body.overrides:
+        allowed = set(merged.keys())
+        for k, v in body.overrides.items():
+            if k not in allowed:
+                raise HTTPException(400, f"Unknown override key: {k}")
+            merged[k] = v
+    return _run_cli_main(merged, "find_duplicate_pages")
+class AgentFindDuplicateTabularRequest(BaseModel):
+    # JSON body for POST /find_duplicate_tabular. Fields left as None are not copied
+    # into the CLI arguments by the route, so the CLI defaults stay in effect for them.
+    # At least one path is required; the route checks each with _path_must_be_allowed_file.
+    input_files: list[str] = Field(..., min_length=1)
+    text_columns: Optional[list[str]] = None
+    similarity_threshold: Optional[float] = None
+    min_word_count: Optional[int] = None
+    # Extra CLI arguments; the route answers HTTP 400 for any key that is not already
+    # present in the CLI argument dict.
+    overrides: Optional[dict[str, Any]] = None
+@router.post(
+    "/find_duplicate_tabular",
+    response_model=AgentTaskResponse,
+    summary="find_duplicate_tabular (Gradio api_name)",
+)
+def post_find_duplicate_tabular(
+    body: AgentFindDuplicateTabularRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from cli_redact import get_cli_default_args_dict
+    merged = get_cli_default_args_dict()
+    merged["task"] = "deduplicate"
+    merged["duplicate_type"] = "tabular"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.text_columns is not None:
+        merged["text_columns"] = body.text_columns
+    if body.similarity_threshold is not None:
+        merged["similarity_threshold"] = body.similarity_threshold
+    if body.min_word_count is not None:
+        merged["min_word_count"] = body.min_word_count
+    if body.overrides:
+        allowed = set(merged.keys())
+        for k, v in body.overrides.items():
+            if k not in allowed:
+                raise HTTPException(400, f"Unknown override key: {k}")
+            merged[k] = v
+    return _run_cli_main(merged, "find_duplicate_tabular")
+class AgentSummariseDocumentRequest(BaseModel):
+    # JSON body for POST /summarise_document. Fields left as None are not copied into
+    # the CLI arguments by the route, so the CLI defaults stay in effect for them.
+    # At least one path is required; the route checks each with _path_must_be_allowed_file.
+    input_files: list[str] = Field(..., min_length=1)
+    # Each summarisation_* field is copied to the CLI argument of the same name.
+    summarisation_inference_method: Optional[str] = None
+    summarisation_format: Optional[str] = None
+    summarisation_context: Optional[str] = None
+    summarisation_additional_instructions: Optional[str] = None
+    # Extra CLI arguments; the route answers HTTP 400 for any key that is not already
+    # present in the CLI argument dict.
+    overrides: Optional[dict[str, Any]] = None
+@router.post(
+    "/summarise_document",
+    response_model=AgentTaskResponse,
+    summary="summarise_document (Gradio api_name)",
+)
+def post_summarise_document(
+    body: AgentSummariseDocumentRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from cli_redact import get_cli_default_args_dict
+    merged = get_cli_default_args_dict()
+    merged["task"] = "summarise"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.summarisation_inference_method is not None:
+        merged["summarisation_inference_method"] = body.summarisation_inference_method
+    if body.summarisation_format is not None:
+        merged["summarisation_format"] = body.summarisation_format
+    if body.summarisation_context is not None:
+        merged["summarisation_context"] = body.summarisation_context
+    if body.summarisation_additional_instructions is not None:
+        merged["summarisation_additional_instructions"] = (
+            body.summarisation_additional_instructions
+        )
+    if body.overrides:
+        allowed = set(merged.keys())
+        for k, v in body.overrides.items():
+            if k not in allowed:
+                raise HTTPException(400, f"Unknown override key: {k}")
+            merged[k] = v
+    return _run_cli_main(merged, "summarise_document")
+class AgentCombineReviewPdfsRequest(BaseModel):
+    # JSON body for POST /combine_review_pdfs.
+    # At least two paths are required; the route checks each with _path_must_be_allowed_file.
+    input_files: list[str] = Field(..., min_length=2)
+    # When None the route leaves the CLI default output_dir untouched.
+    output_dir: Optional[str] = None
+@router.post(
+    "/combine_review_pdfs",
+    response_model=AgentTaskResponse,
+    summary="combine_review_pdfs (Gradio api_name)",
+)
+def post_combine_review_pdfs(
+    body: AgentCombineReviewPdfsRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from cli_redact import get_cli_default_args_dict
+    merged = get_cli_default_args_dict()
+    merged["task"] = "combine_review_pdfs"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.output_dir is not None:
+        merged["output_dir"] = _path_must_be_allowed_directory(body.output_dir)
+    return _run_cli_main(merged, "combine_review_pdfs")
+class _NamedPath:
+    """Wrap a path string so it can be read back through a ``.name`` attribute.
+    merge_csv_files expects objects with a .name attribute (Gradio file-like);
+    ``__slots__`` limits instances to that single attribute.
+    """
+    __slots__ = ("name",)
+    def __init__(self, path: str) -> None:
+        # Store the path unchanged; callers pass an already validated path.
+        self.name = path
+class AgentCombineReviewCsvsRequest(BaseModel):
+    # JSON body for POST /combine_review_csvs.
+    # At least one path is required; the route checks each with _path_must_be_allowed_file.
+    input_files: list[str] = Field(..., min_length=1)
+    # The route falls back to OUTPUT_FOLDER when this is None or empty, and requires
+    # the resulting directory to exist already (must_exist=True).
+    output_dir: Optional[str] = Field(
+        None, description="Defaults to config OUTPUT_FOLDER"
+    )
+class AgentApplyReviewRedactionsRequest(BaseModel):
+    """Headless parity with Gradio ``api_name='apply_review_redactions'`` (prepare + apply)."""
+    # Both file paths are required and are checked by the route with
+    # _path_must_be_allowed_file before use.
+    pdf_path: str = Field(
+        ...,
+        description="Path to the source PDF under allowed roots.",
+    )
+    review_csv_path: str = Field(
+        ...,
+        description=(
+            "Path to the review plan CSV; basename must contain '_review_file' "
+            "(e.g. mydoc_review_file.csv)."
+        ),
+    )
+    # Both directories are optional; when given, the route validates them with
+    # _path_must_be_allowed_directory(..., must_exist=False).
+    output_dir: Optional[str] = Field(
+        None,
+        description="Output directory (created if missing); defaults to OUTPUT_FOLDER.",
+    )
+    input_dir: Optional[str] = Field(
+        None,
+        description="Input/working directory for page images; defaults to INPUT_FOLDER.",
+    )
+    # The two settings below are passed through to run_apply_review_redactions unchanged.
+    text_extract_method: Optional[str] = Field(
+        None,
+        description="OCR/text mode passed to prepare (defaults to CLI ocr_method).",
+    )
+    efficient_ocr: Optional[bool] = Field(
+        None,
+        description="If set, overrides EFFICIENT_OCR for the prepare step.",
+    )
+@router.post(
+    "/combine_review_csvs",
+    response_model=AgentTaskResponse,
+    summary="combine_review_csvs (Gradio api_name)",
+    description="Uses tools.helper_functions.merge_csv_files (not cli_redact).",
+)
+def post_combine_review_csvs(
+    body: AgentCombineReviewCsvsRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from tools.helper_functions import merge_csv_files
+    paths = [_NamedPath(_path_must_be_allowed_file(p)) for p in body.input_files]
+    out_dir = body.output_dir or OUTPUT_FOLDER
+    out_dir_resolved = _path_must_be_allowed_directory(str(out_dir), must_exist=True)
+    sep = "/" if not out_dir_resolved.endswith(("/", "\\")) else ""
+    out_files = merge_csv_files(paths, output_folder=out_dir_resolved + sep)
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name="combine_review_csvs",
+        task="combine_review_csvs",
+        output_dir=out_dir_resolved,
+        input_dir="",
+        message="merge_csv_files completed",
+        output_paths=out_files,
+    )
+class AgentExportReviewRedactionOverlayRequest(BaseModel):
+    """Agent JSON body for the same overlay render as Gradio ``api_name='page_redaction_review_image'``."""
+    # Checked by the route with _path_must_be_allowed_file before the image is used.
+    page_image_path: str = Field(
+        ...,
+        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
+    )
+    # At least one box is required (min_length=1).
+    boxes: List[Dict[str, Any]] = Field(
+        ...,
+        min_length=1,
+        description="Annotator-style boxes: label, color, xmin, ymin, xmax, ymax (normalized 0–1).",
+    )
+    # Defaults to 1 and must be >= 1.
+    page_number: int = Field(
+        1, ge=1, description="1-based page index for the output filename."
+    )
+    doc_base_name: str = Field(
+        "review",
+        description="Basename for output file (e.g. document name without extension).",
+    )
+    # When omitted or empty the route passes an empty DataFrame as review_df.
+    review_df_records: Optional[List[Dict[str, Any]]] = Field(
+        None,
+        description="Optional rows (include at least 'label') for stable label→line-pattern mapping.",
+    )
+    # Accepted range is 0..24 inclusive; None is forwarded to the renderer as-is.
+    label_abbrev_chars: Optional[int] = Field(
+        None,
+        ge=0,
+        le=24,
+        description="Draw this many leading characters of each label on the image; omit to use REVIEW_OVERLAY_LABEL_ABBREV_CHARS from config (0 = off).",
+    )
+class AgentExportReviewPageOcrVisualisationRequest(BaseModel):
+    """Agent JSON body for the same OCR visualisation as Gradio ``api_name='page_ocr_review_image'``."""
+    # Checked by the route with _path_must_be_allowed_file before the image is opened.
+    page_image_path: str = Field(
+        ...,
+        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
+    )
+    # Passed to visualise_ocr_words_bounding_boxes unchanged.
+    ocr_results: Dict[str, Any] = Field(
+        ...,
+        description="Word-level OCR results dict (line_key -> {words:[{text, bounding_box, conf, ...}]}).",
+    )
+    # Defaults to 1 and must be >= 1; the route embeds it in the image name.
+    page_number: int = Field(
+        1, ge=1, description="1-based page index (used for naming)."
+    )
+    # The route falls back to "review" when this is empty.
+    doc_base_name: str = Field(
+        "review",
+        description="Basename for output file (e.g. document name without extension).",
+    )
+@router.post(
+    "/export_review_redaction_overlay",
+    response_model=AgentTaskResponse,
+    summary="export_review_redaction_overlay (Agent API; Gradio api_name: page_redaction_review_image)",
+    description=(
+        "Renders hollow redaction outlines and a top-right legend on the page image; "
+        "writes ``redaction_overlay/{doc_base_name}_page{n}_redaction_overlay.jpg`` under OUTPUT_FOLDER "
+        "(scaled per REVIEW_OVERLAY_MAX_PIXELS, JPEG capped by REVIEW_OVERLAY_MAX_FILE_BYTES). "
+        "Uses ``tools.redaction_review.visualise_review_redaction_boxes``."
+    ),
+)
+def post_export_review_redaction_overlay(
+    body: AgentExportReviewRedactionOverlayRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    import pandas as pd
+    from tools.redaction_review import visualise_review_redaction_boxes
+    img_path = _path_must_be_allowed_file(body.page_image_path)
+    annotator: dict[str, Any] = {"image": img_path, "boxes": body.boxes}
+    review_df = (
+        pd.DataFrame(body.review_df_records)
+        if body.review_df_records
+        else pd.DataFrame()
+    )
+    out_folder_abs = os.path.realpath(
+        os.path.abspath(os.path.expanduser(str(OUTPUT_FOLDER)))
+    )
+    if not validate_path_safety(out_folder_abs):
+        raise HTTPException(status_code=400, detail="Unsafe OUTPUT_FOLDER path")
+    _must_be_under_allowed_roots(out_folder_abs, str(out_folder_abs))
+    try:
+        Path(out_folder_abs).mkdir(parents=True, exist_ok=True)
+    except OSError:
+        raise HTTPException(status_code=500, detail="Could not create OUTPUT_FOLDER")
+    out_folder = out_folder_abs
+    path = visualise_review_redaction_boxes(
+        annotator,
+        review_df=review_df,
+        output_folder=out_folder,
+        page_number=body.page_number,
+        doc_base_name=body.doc_base_name,
+        label_abbrev_chars=body.label_abbrev_chars,
+    )
+    if not path:
+        raise HTTPException(
+            status_code=500,
+            detail=(
+                "Could not produce overlay PNG (invalid image/boxes or write failed). "
+                "Ensure boxes are valid and the image loads."
+            ),
+        )
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name="export_review_redaction_overlay",
+        task="export_review_redaction_overlay",
+        output_dir=out_folder,
+        input_dir="",
+        message="Redaction overlay PNG written",
+        output_paths=[path],
+    )
+@router.post(
+    "/export_review_page_ocr_visualisation",
+    response_model=AgentTaskResponse,
+    summary="export_review_page_ocr_visualisation (Agent API; Gradio api_name: page_ocr_review_image)",
+    description=(
+        "Renders a per-page OCR visualisation using tools.file_redaction.visualise_ocr_words_bounding_boxes; "
+        "writes under OUTPUT_FOLDER/review_ocr_visualisations/."
+    ),
+)
+def post_export_review_page_ocr_visualisation(
+    body: AgentExportReviewPageOcrVisualisationRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from PIL import Image
+    from tools.file_redaction import visualise_ocr_words_bounding_boxes
+    img_path = _path_must_be_allowed_file(body.page_image_path)
+    out_folder_abs = os.path.realpath(
+        os.path.abspath(os.path.expanduser(str(OUTPUT_FOLDER)))
+    )
+    if not validate_path_safety(out_folder_abs):
+        raise HTTPException(status_code=400, detail="Unsafe OUTPUT_FOLDER path")
+    _must_be_under_allowed_roots(out_folder_abs, str(out_folder_abs))
+    try:
+        Path(out_folder_abs).mkdir(parents=True, exist_ok=True)
+    except OSError:
+        raise HTTPException(status_code=500, detail="Could not create OUTPUT_FOLDER")
+    out_folder = out_folder_abs
+    safe_base = str(body.doc_base_name or "review")
+    image_name = f"{safe_base}_page{int(body.page_number)}.png"
+    log_paths: list[str] = []
+    try:
+        log_paths = visualise_ocr_words_bounding_boxes(
+            Image.open(img_path).convert("RGB"),
+            body.ocr_results,
+            image_name=image_name,
+            output_folder=out_folder,
+            visualisation_folder="review_ocr_visualisations",
+            add_legend=True,
+            log_files_output_paths=log_paths,
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    if not log_paths:
+        raise HTTPException(
+            status_code=500,
+            detail="Could not produce OCR visualisation (invalid image/ocr_results or write failed).",
+        )
+    out_path = log_paths[-1]
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name="export_review_page_ocr_visualisation",
+        task="export_review_page_ocr_visualisation",
+        output_dir=out_folder,
+        input_dir="",
+        message="OCR visualisation written",
+        output_paths=[out_path],
+    )
+def _gradio_only(api_name: str, detail: str) -> JSONResponse:
+    return JSONResponse(
+        status_code=501,
+        content={
+            "gradio_api_name": api_name,
+            "detail": detail,
+            "hint": (
+                "This flow is Gradio-session stateful. Call the named route on the "
+                "Gradio HTTP API, not /agent."
+            ),
+            "gradio_http": {
+                "discover_schema": "GET /gradio_api/info",
+                "start_call": f"POST /gradio_api/call/{api_name}",
+                "request_body_shape": '{"data": [<args in schema order>]}',
+                "poll": f"GET /gradio_api/call/{api_name}/{{event_id}}",
+            },
+            "gradio_client_notes": [
+                "Pass api_name explicitly; do not rely on inferring the endpoint from "
+                "Python function names (large Blocks apps will look ambiguous).",
+                "If predict() still cannot resolve the route, open GET /gradio_api/info "
+                "and use the numeric fn_index with gradio_client, or call the HTTP "
+                "endpoints directly.",
+                "The length of data must match the parameter list for this deployment; "
+                "copy order and types from /gradio_api/info.",
+            ],
+        },
+    )
+@router.post("/load_and_prepare_documents_or_data")
+def post_load_and_prepare_documents_or_data() -> JSONResponse:
+    return _gradio_only(
+        "load_and_prepare_documents_or_data",
+        "Preparation uses Gradio session state and prepare_image_or_pdf_with_efficient_ocr; no single CLI task.",
+    )
+@router.post(
+    "/apply_review_redactions",
+    response_model=AgentTaskResponse,
+    summary="apply_review_redactions (Gradio api_name)",
+    description=(
+        "Runs prepare_image_or_pdf_with_efficient_ocr([pdf, review_csv]) then "
+        "apply_redactions_to_review_df_and_files — same core pipeline as the Review tab, "
+        "without Gradio session state. Requires paths under allowed roots."
+    ),
+)
+def post_apply_review_redactions(
+    body: AgentApplyReviewRedactionsRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from tools.simplified_api import run_apply_review_redactions
+    pdf = _path_must_be_allowed_file(body.pdf_path)
+    csv = _path_must_be_allowed_file(body.review_csv_path)
+    out_dir: str | None = None
+    if body.output_dir is not None:
+        out_dir = _path_must_be_allowed_directory(body.output_dir, must_exist=False)
+    in_dir: str | None = None
+    if body.input_dir is not None:
+        in_dir = _path_must_be_allowed_directory(body.input_dir, must_exist=False)
+    try:
+        result = run_apply_review_redactions(
+            pdf_path=pdf,
+            review_csv_path=csv,
+            output_dir=out_dir,
+            input_dir=in_dir,
+            text_extract_method=body.text_extract_method,
+            efficient_ocr=body.efficient_ocr,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e)) from e
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"apply_review_redactions failed: {e}",
+        ) from e
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name="apply_review_redactions",
+        task="apply_review_redactions",
+        output_dir=result["output_dir"],
+        input_dir=result["input_dir"],
+        message=result["message"],
+        output_paths=result.get("output_paths"),
+    )
+@router.post(
+    "/verify_redaction_coverage",
+    response_model=AgentVerifyRedactionResponse,
+    summary="verify_redaction_coverage (Pass 1 programmatic QA)",
+)
+def post_verify_redaction_coverage(
+    body: AgentVerifyRedactionRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentVerifyRedactionResponse:
+    from tools.simplified_api import run_verify_redaction_coverage
+    review = _path_must_be_allowed_file(body.review_csv_path)
+    ocr_words = _path_must_be_allowed_file(body.ocr_words_csv_path)
+    redacted = None
+    if body.redacted_pdf_path:
+        redacted = _path_must_be_allowed_file(body.redacted_pdf_path)
+    try:
+        report, pruned_csv_path, prune_log = run_verify_redaction_coverage(
+            review_csv_path=review,
+            ocr_words_csv_path=ocr_words,
+            must_redact=body.must_redact,
+            must_not_redact=body.must_not_redact,
+            redacted_pdf_path=redacted,
+            total_pages=body.total_pages,
+            min_word_length=body.min_word_length,
+            sample_pixels=body.sample_pixels,
+            auto_prune_suspicious=body.auto_prune_suspicious,
+            pruned_output_path=body.pruned_output_path,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e)) from e
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"verify_redaction_coverage failed: {e}"
+        ) from e
+    return AgentVerifyRedactionResponse(
+        status="completed",
+        coverage_pass=bool(report.get("pass_strict", report.get("pass"))),
+        coverage_pass_strict=bool(report.get("pass_strict", report.get("pass"))),
+        coverage_pass_with_cleanup=bool(report.get("pass_with_cleanup")),
+        pruned_csv_path=pruned_csv_path,
+        prune_log=prune_log,
+        report=report,
+    )
+@router.post(
+    "/word_level_ocr_text_search",
+    response_model=AgentWordLevelOcrSearchResponse,
+    summary="word_level_ocr_text_search (headless OCR CSV search)",
+)
+def post_word_level_ocr_text_search(
+    body: AgentWordLevelOcrSearchRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentWordLevelOcrSearchResponse:
+    from tools.simplified_api import run_word_level_ocr_text_search_api
+    ocr_words = _path_must_be_allowed_file(body.ocr_words_csv_path)
+    review = None
+    if body.review_csv_path:
+        review = _path_must_be_allowed_file(body.review_csv_path)
+    try:
+        result = run_word_level_ocr_text_search_api(
+            ocr_words_csv_path=ocr_words,
+            search_text=body.search_text,
+            similarity_threshold=body.similarity_threshold,
+            use_regex=body.use_regex,
+            review_csv_path=review,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e)) from e
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"word_level_ocr_text_search failed: {e}"
+        ) from e
+    return AgentWordLevelOcrSearchResponse(status="completed", result=result)
+@router.get("/operations")
+def list_operations() -> dict[str, Any]:
+    return {
+        "gradio_api_names": list(GRADIO_API_NAMES),
+        "gradio_session_state_endpoints": {
+            "description": (
+                "These api_name values are exposed on the Gradio HTTP API but return "
+                "501 on /agent because they depend on in-memory Gradio state."
+            ),
+            "discover_schema": "GET /gradio_api/info",
+            "call_pattern": 'POST /gradio_api/call/<api_name> with JSON body {"data": [...]}',
+            "names": [
+                "load_and_prepare_documents_or_data",
+            ],
+        },
+        "routes": [
+            {
+                "gradio_api_name": "redact_document",
+                "method": "POST",
+                "path": "/agent/redact_document",
+                "implementation": "cli_redact task redact",
+                "notes": {
+                    "ocr_method": [
+                        "Local OCR",
+                        "AWS Textract",
+                        "Local text",
+                    ],
+                    "chosen_local_ocr_model_override": LOCAL_OCR_MODEL_OPTIONS,
+                    "pii_detector_recommended": [
+                        LOCAL_PII_OPTION,
+                        AWS_PII_OPTION,
+                        AWS_LLM_PII_OPTION,
+                        INFERENCE_SERVER_PII_OPTION,
+                        LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+                        "None",
+                    ],
+                },
+            },
+            {
+                "gradio_api_name": "redact_data",
+                "method": "POST",
+                "path": "/agent/redact_data",
+                "implementation": "cli_redact task redact",
+                "notes": {
+                    "ocr_method": [
+                        "Local OCR",
+                        "AWS Textract",
+                        "Local text",
+                    ],
+                    "chosen_local_ocr_model_override": LOCAL_OCR_MODEL_OPTIONS,
+                    "pii_detector_recommended": [
+                        LOCAL_PII_OPTION,
+                        AWS_PII_OPTION,
+                        AWS_LLM_PII_OPTION,
+                        INFERENCE_SERVER_PII_OPTION,
+                        LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+                        "None",
+                    ],
+                },
+            },
+            {
+                "gradio_api_name": "find_duplicate_pages",
+                "method": "POST",
+                "path": "/agent/find_duplicate_pages",
+                "implementation": "cli_redact deduplicate pages",
+            },
+            {
+                "gradio_api_name": "find_duplicate_tabular",
+                "method": "POST",
+                "path": "/agent/find_duplicate_tabular",
+                "implementation": "cli_redact deduplicate tabular",
+            },
+            {
+                "gradio_api_name": "summarise_document",
+                "method": "POST",
+                "path": "/agent/summarise_document",
+                "implementation": "cli_redact task summarise",
+            },
+            {
+                "gradio_api_name": "combine_review_pdfs",
+                "method": "POST",
+                "path": "/agent/combine_review_pdfs",
+                "implementation": "cli_redact combine_review_pdfs",
+            },
+            {
+                "gradio_api_name": "export_review_redaction_overlay",
+                "method": "POST",
+                "path": "/agent/export_review_redaction_overlay",
+                "implementation": "visualise_review_redaction_boxes",
+            },
+            {
+                "gradio_api_name": "export_review_page_ocr_visualisation",
+                "method": "POST",
+                "path": "/agent/export_review_page_ocr_visualisation",
+                "implementation": "visualise_ocr_words_bounding_boxes",
+            },
+            {
+                "gradio_api_name": "combine_review_csvs",
+                "method": "POST",
+                "path": "/agent/combine_review_csvs",
+                "implementation": "helper merge_csv_files",
+            },
+            {
+                "gradio_api_name": "load_and_prepare_documents_or_data",
+                "method": "POST",
+                "path": "/agent/load_and_prepare_documents_or_data",
+                "implementation": "not_implemented_http",
+            },
+            {
+                "gradio_api_name": "apply_review_redactions",
+                "method": "POST",
+                "path": "/agent/apply_review_redactions",
+                "implementation": "tools.simplified_api.run_apply_review_redactions",
+            },
+            {
+                "gradio_api_name": "verify_redaction_coverage",
+                "method": "POST",
+                "path": "/agent/verify_redaction_coverage",
+                "implementation": "tools.verify_redaction_coverage.verify_redaction_coverage",
+                "notes": {
+                    "purpose": "Pass 1 programmatic QA — pass_strict (policy), pass_with_cleanup (+ suspicious rows), optional prune and text/pixel checks.",
+                    "must_redact": "list of regex strings",
+                    "must_not_redact": "list of regex strings",
+                    "auto_prune_suspicious": "remove short OCR-fragment rows before reporting",
+                    "pages_flagged_for_vlm": "policy/visual failures only",
+                    "pages_needing_csv_cleanup": "suspicious rows — prune, not VLM",
+                    "leak_likely_causes": "per-page hints when text_layer_leaks (coord_not_normalized, missing_page_boxes, etc.) — not a broken /review_apply",
+                },
+            },
+            {
+                "gradio_api_name": "word_level_ocr_text_search",
+                "method": "POST",
+                "path": "/agent/word_level_ocr_text_search",
+                "implementation": "tools.verify_redaction_coverage.run_word_level_ocr_text_search",
+            },
+        ],
+    }
+@router.get("/health")
+def agent_health() -> dict[str, str]:
+    return {"status": "ok", "service": "agent"}

app.py ADDED Viewed

The diff for this file is too large to render. See raw diff

cdk/__init__.py ADDED Viewed

File without changes

cdk/app.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import os
+from aws_cdk import App, Environment
+from cdk_appregistry import register_doc_redaction_application
+from cdk_config import (
+    ALB_NAME,
+    APPREGISTRY_APPLICATION_NAME,
+    APPREGISTRY_ATTRIBUTE_GROUP_NAME,
+    APPREGISTRY_DESCRIPTION,
+    APPREGISTRY_REPOSITORY_URL,
+    APPREGISTRY_STACK_NAME,
+    AWS_ACCOUNT_ID,
+    AWS_REGION,
+    CDK_CONTEXT_FILE,
+    CDK_PREFIX,
+    ENABLE_APPREGISTRY,
+    RUN_USEAST_STACK,
+    USE_CLOUDFRONT,
+)
+from cdk_functions import (
+    create_basic_config_env,
+    load_context_from_file,
+    log_aws_credential_context,
+    purge_cdk_lookup_context,
+)
+from cdk_stack import CdkStack, CdkStackCloudfront  # , CdkStackMain
+from check_resources import CONTEXT_FILE, check_and_set_context
+# Initialize the CDK app
+app = App()
+log_aws_credential_context(
+    expected_account_id=AWS_ACCOUNT_ID,
+    expected_region=AWS_REGION,
+)
+# Drop stale CDK lookup cache entries (require bootstrap lookup role in target account).
+purge_cdk_lookup_context(CDK_CONTEXT_FILE)
+# --- Pre-check context (boto3) — written to precheck.context.json, NOT cdk.context.json ---
+print(f"Pre-check context file: {CONTEXT_FILE}")
+print(f"CDK lookup cache file: {CDK_CONTEXT_FILE}")
+if os.path.basename(CONTEXT_FILE.replace("\\", "/")) == os.path.basename(
+    CDK_CONTEXT_FILE.replace("\\", "/")
+):
+    raise RuntimeError(
+        f"CONTEXT_FILE and CDK_CONTEXT_FILE must differ (got '{CONTEXT_FILE}' for both). "
+        "Set CONTEXT_FILE=precheck.context.json in config/cdk_config.env."
+    )
+print("Running pre-check script to generate application context...")
+try:
+    check_and_set_context()
+    if not os.path.exists(CONTEXT_FILE):
+        raise RuntimeError(
+            f"check_and_set_context() finished, but {CONTEXT_FILE} was not created."
+        )
+    print(f"Context generated successfully at {CONTEXT_FILE}.")
+except Exception as e:
+    raise RuntimeError(f"Failed to generate context via check_and_set_context(): {e}")
+# Pre-check must not repopulate CDK lookup keys; purge again if paths were ever shared.
+purge_cdk_lookup_context(CDK_CONTEXT_FILE)
+if os.path.exists(CONTEXT_FILE):
+    load_context_from_file(app, CONTEXT_FILE)
+else:
+    raise RuntimeError(f"Could not find {CONTEXT_FILE}.")
+create_basic_config_env("config")
+aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)
+regional_stack = CdkStack(
+    app, "RedactionStack", env=aws_env_regional, cross_region_references=True
+)
+regional_stack.termination_protection = True
+if ENABLE_APPREGISTRY == "True":
+    # Use pre-check context only — not regional_stack.params (avoids AppRegistry
+    # -> RedactionStack dependency cycle during synth).
+    _alb_dns_context = app.node.try_get_context(f"dns:{ALB_NAME}")
+    _alb_dns_name = (
+        _alb_dns_context.strip()
+        if isinstance(_alb_dns_context, str) and _alb_dns_context.strip()
+        else None
+    )
+    appregistry_stack = register_doc_redaction_application(
+        app,
+        aws_account_id=AWS_ACCOUNT_ID,
+        aws_region=AWS_REGION,
+        application_name=APPREGISTRY_APPLICATION_NAME,
+        application_description=APPREGISTRY_DESCRIPTION,
+        appregistry_stack_name=APPREGISTRY_STACK_NAME,
+        attribute_group_name=APPREGISTRY_ATTRIBUTE_GROUP_NAME,
+        repository_url=APPREGISTRY_REPOSITORY_URL,
+        cdk_prefix=CDK_PREFIX,
+        use_cloudfront=USE_CLOUDFRONT,
+        alb_dns_name=_alb_dns_name,
+    )
+    appregistry_stack.termination_protection = True
+if USE_CLOUDFRONT == "True" and RUN_USEAST_STACK == "True":
+    aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")
+    cloudfront_stack = CdkStackCloudfront(
+        app,
+        "RedactionStackCloudfront",
+        env=aws_env_us_east_1,
+        alb_arn=regional_stack.params["alb_arn_output"],
+        alb_sec_group_id=regional_stack.params["alb_security_group_id"],
+        alb_dns_name=regional_stack.params["alb_dns_name"],
+        cross_region_references=True,
+    )
+# CDK CLI invokes this script and expects a cloud assembly in cdk.out.
+# Without app.synth(), Python defines constructs but never writes manifest.json
+# (ENOENT on deploy). See: https://github.com/aws/aws-cdk/issues/11023
+app.synth()

cdk/cdk.json.example ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "app": "python app.py",
+  "output": "cdk.out",
+  "context": {
+    "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": false
+  }
+}

cdk/cdk_appregistry.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""AWS Console myApplications (Service Catalog AppRegistry) integration."""
+from aws_cdk import App, Environment
+from aws_cdk.aws_servicecatalogappregistry_alpha import (
+    ApplicationAssociator,
+    TargetApplication,
+)
+def register_doc_redaction_application(
+    app: App,
+    *,
+    aws_account_id: str,
+    aws_region: str,
+    application_name: str,
+    application_description: str,
+    appregistry_stack_name: str,
+    attribute_group_name: str,
+    repository_url: str,
+    cdk_prefix: str,
+    use_cloudfront: str,
+    alb_dns_name: str | None = None,
+) -> ApplicationAssociator:
+    """
+    Register regional CDK stacks with AWS Console myApplications.
+    Only stacks in ``aws_region`` are associated (phase 1). Cross-region stacks
+    such as RedactionStackCloudfront (us-east-1) are not included.
+    ``alb_dns_name`` must be a plain string (e.g. from pre-check context). Do not
+    pass a CloudFormation token from RedactionStack or synth will fail with a
+    dependency cycle against the associator stack.
+    """
+    associator = ApplicationAssociator(
+        app,
+        "DocRedactionAppRegistry",
+        applications=[
+            TargetApplication.create_application_stack(
+                application_name=application_name,
+                application_description=application_description,
+                stack_name=appregistry_stack_name,
+                env=Environment(account=aws_account_id, region=aws_region),
+            )
+        ],
+    )
+    attributes = {
+        "repository": repository_url,
+        "cdkPrefix": cdk_prefix,
+        "awsRegion": aws_region,
+        "useCloudFront": use_cloudfront,
+        "cloudFrontInAppRegistry": "false",
+        "cloudFrontNote": (
+            "CloudFront/WAF (RedactionStackCloudfront) is in us-east-1 and is "
+            "not linked to this myApplications entry in phase 1. View it in "
+            "CloudFormation (us-east-1) or the CloudFront console."
+        ),
+    }
+    if alb_dns_name:
+        attributes["albDnsName"] = alb_dns_name
+    associator.app_registry_application.add_attribute_group(
+        "DocRedactionAttributeGroup",
+        attribute_group_name=attribute_group_name,
+        description="doc_redaction deployment metadata",
+        attributes=attributes,
+    )
+    return associator

cdk/cdk_config.py ADDED Viewed

	@@ -0,0 +1,590 @@

+import os
+import tempfile
+from typing import List
+from dotenv import load_dotenv
+# Set or retrieve configuration variables for CDK redaction deployment
+def convert_string_to_boolean(value: str) -> bool:
+    """Convert string to boolean, handling various formats."""
+    if isinstance(value, bool):
+        return value
+    elif value in ["True", "1", "true", "TRUE"]:
+        return True
+    elif value in ["False", "0", "false", "FALSE"]:
+        return False
+    else:
+        raise ValueError(f"Invalid boolean value: {value}")
+def parse_comma_separated_list(value: str) -> List[str]:
+    """Parse a comma-separated env value into a list of non-empty strings."""
+    if not value or not str(value).strip():
+        return []
+    cleaned = str(value).strip().strip("[]")
+    return [
+        part.strip().strip('"').strip("'")
+        for part in cleaned.split(",")
+        if part.strip()
+    ]
+def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
+    """
+    Get an environmental variable, and set it to a default value if it doesn't exist
+    """
+    # Get the environment variable if it exists
+    value = os.environ.get(var_name)
+    # If it doesn't exist, set the environment variable to the default value
+    if value is None:
+        os.environ[var_name] = default_value
+        value = default_value
+    if print_val is True:
+        print(f"The value of {var_name} is {value}")
+    return value
+def ensure_folder_exists(output_folder: str):
+    """Checks if the specified folder exists, creates it if not."""
+    if not os.path.exists(output_folder):
+        # Create the folder if it doesn't exist
+        os.makedirs(output_folder, exist_ok=True)
+        print(f"Created the {output_folder} folder.")
+    else:
+        print(f"The {output_folder} folder already exists.")
+def add_folder_to_path(folder_path: str):
+    """
+    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
+    """
+    if os.path.exists(folder_path) and os.path.isdir(folder_path):
+        print(folder_path, "folder exists.")
+        # Resolve relative path to absolute path
+        absolute_path = os.path.abspath(folder_path)
+        current_path = os.environ["PATH"]
+        if absolute_path not in current_path.split(os.pathsep):
+            full_path_extension = absolute_path + os.pathsep + current_path
+            os.environ["PATH"] = full_path_extension
+            # print(f"Updated PATH with: ", full_path_extension)
+        else:
+            print(f"Directory {folder_path} already exists in PATH.")
+    else:
+        print(f"Folder not found at {folder_path} - not added to PATH")
+###
+# LOAD CONFIG FROM ENV FILE
+###
+# Runs at import time: resolves the config folder, creates it if missing, then
+# loads the CDK env file (when present) before the constants below are read.
+CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
+ensure_folder_exists(CONFIG_FOLDER)
+# If you have a CDK config env file in the config folder, you can load in CDK variables this way, e.g. 'config/cdk_config.env'
+CDK_CONFIG_PATH = get_or_create_env_var(
+    "CDK_CONFIG_PATH", "config/cdk_config.env"
+)  # e.g. config/cdk_config.env
+if CDK_CONFIG_PATH:
+    if os.path.exists(CDK_CONFIG_PATH):
+        print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
+        # NOTE: load_dotenv is called without override, so (per python-dotenv's
+        # default) variables already in the environment keep their values. That
+        # includes CONFIG_FOLDER and CDK_CONFIG_PATH, which were written to
+        # os.environ just above.
+        load_dotenv(CDK_CONFIG_PATH)
+    else:
+        print("CDK config file not found at location:", CDK_CONFIG_PATH)
+###
+# AWS OPTIONS
+###
+# Both default to an empty string when not supplied.
+AWS_REGION = get_or_create_env_var("AWS_REGION", "")
+AWS_ACCOUNT_ID = get_or_create_env_var("AWS_ACCOUNT_ID", "")
+###
+# CDK OPTIONS
+###
+# CDK_PREFIX is interpolated into most of the default resource names below.
+CDK_PREFIX = get_or_create_env_var("CDK_PREFIX", "")
+# AWS Console myApplications (Service Catalog AppRegistry)
+# Flags such as ENABLE_APPREGISTRY stay as the strings "True"/"False"; they are
+# not converted to bool here.
+ENABLE_APPREGISTRY = get_or_create_env_var("ENABLE_APPREGISTRY", "True")
+APPREGISTRY_APPLICATION_NAME = get_or_create_env_var(
+    "APPREGISTRY_APPLICATION_NAME", f"{CDK_PREFIX}doc-redaction"
+)
+APPREGISTRY_DESCRIPTION = get_or_create_env_var(
+    "APPREGISTRY_DESCRIPTION",
+    "PII document redaction app (ALB, ECS Fargate, Cognito, S3)",
+)
+APPREGISTRY_STACK_NAME = get_or_create_env_var(
+    "APPREGISTRY_STACK_NAME", f"{CDK_PREFIX}AppRegistryStack"
+)
+APPREGISTRY_ATTRIBUTE_GROUP_NAME = get_or_create_env_var(
+    "APPREGISTRY_ATTRIBUTE_GROUP_NAME",
+    f"{APPREGISTRY_APPLICATION_NAME}-metadata",
+)
+APPREGISTRY_REPOSITORY_URL = get_or_create_env_var(
+    "APPREGISTRY_REPOSITORY_URL",
+    "https://github.com/seanpedrick-case/doc_redaction",
+)
+_precheck_context_file = get_or_create_env_var("CONTEXT_FILE", "precheck.context.json")
+# Never write boto3 pre-check output into CDK's lookup cache file (causes stale
+# vpc-provider / load-balancer entries and wrong-account lookup validation errors).
+# Only the base name is compared (backslashes normalised first), so a path in any
+# folder that ends in cdk.context.json is replaced with the default.
+if os.path.basename(_precheck_context_file.replace("\\", "/")) == "cdk.context.json":
+    print(
+        "WARNING: CONTEXT_FILE must not be 'cdk.context.json' (that file is CDK's "
+        "lookup cache). Using 'precheck.context.json' instead. Update "
+        "config/cdk_config.env and remove CONTEXT_FILE=cdk.context.json if set."
+    )
+    _precheck_context_file = "precheck.context.json"
+CONTEXT_FILE = _precheck_context_file
+CDK_CONTEXT_FILE = get_or_create_env_var("CDK_CONTEXT_FILE", "cdk.context.json")
+CDK_FOLDER = get_or_create_env_var(
+    "CDK_FOLDER", ""
+)  # FULL_PATH_TO_CDK_FOLDER_HERE (with trailing forward slash; it is string-concatenated with CONFIG_FOLDER elsewhere in this file)
+# App runtime config (uploaded to S3 for legacy Fargate; inlined for ECS Express Mode)
+_app_config_rel = os.path.join(CONFIG_FOLDER, "config.env").replace("\\", "/")
+# Default is <CDK_FOLDER>/<CONFIG_FOLDER>/config.env, or relative to the working
+# directory when CDK_FOLDER is empty.
+APP_CONFIG_ENV_FILE = get_or_create_env_var(
+    "APP_CONFIG_ENV_FILE",
+    (
+        os.path.normpath(os.path.join(CDK_FOLDER, _app_config_rel))
+        if CDK_FOLDER
+        else os.path.normpath(_app_config_rel)
+    ),
+)
+RUN_USEAST_STACK = get_or_create_env_var("RUN_USEAST_STACK", "False")
+### VPC and connections
+VPC_NAME = get_or_create_env_var("VPC_NAME", "")
+NEW_VPC_DEFAULT_NAME = get_or_create_env_var("NEW_VPC_DEFAULT_NAME", f"{CDK_PREFIX}vpc")
+NEW_VPC_CIDR = get_or_create_env_var("NEW_VPC_CIDR", "")  # "10.0.0.0/24"
+EXISTING_IGW_ID = get_or_create_env_var("EXISTING_IGW_ID", "")
+SINGLE_NAT_GATEWAY_ID = get_or_create_env_var("SINGLE_NAT_GATEWAY_ID", "")
+### SUBNETS / ROUTE TABLES / NAT GATEWAY
+# The list-like values below are kept as raw strings; no parsing happens here.
+PUBLIC_SUBNETS_TO_USE = get_or_create_env_var(
+    "PUBLIC_SUBNETS_TO_USE", ""
+)  # e.g. ['PublicSubnet1', 'PublicSubnet2']
+PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
+    "PUBLIC_SUBNET_CIDR_BLOCKS", ""
+)  # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
+PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
+    "PUBLIC_SUBNET_AVAILABILITY_ZONES", ""
+)  # e.g. ["eu-west-1a", "eu-west-1b"]
+PRIVATE_SUBNETS_TO_USE = get_or_create_env_var(
+    "PRIVATE_SUBNETS_TO_USE", ""
+)  # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
+PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
+    "PRIVATE_SUBNET_CIDR_BLOCKS", ""
+)  # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
+PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
+    "PRIVATE_SUBNET_AVAILABILITY_ZONES", ""
+)  # e.g. ["eu-west-1a", "eu-west-1b"]
+ROUTE_TABLE_BASE_NAME = get_or_create_env_var(
+    "ROUTE_TABLE_BASE_NAME", f"{CDK_PREFIX}PrivateRouteTable"
+)
+NAT_GATEWAY_EIP_NAME = get_or_create_env_var(
+    "NAT_GATEWAY_EIP_NAME", f"{CDK_PREFIX}NatGatewayEip"
+)
+NAT_GATEWAY_NAME = get_or_create_env_var("NAT_GATEWAY_NAME", f"{CDK_PREFIX}NatGateway")
+# IAM roles
+# Stored as a JSON-style list inside a string; it is not parsed here.
+AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var(
+    "AWS_MANAGED_TASK_ROLES_LIST",
+    '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs", "AmazonBedrockFullAccess"]',
+)
+POLICY_FILE_LOCATIONS = get_or_create_env_var(
+    "POLICY_FILE_LOCATIONS", ""
+)  # e.g. '["config/sts_permissions.json"]'
+POLICY_FILE_ARNS = get_or_create_env_var("POLICY_FILE_ARNS", "")
+# GITHUB REPO
+GITHUB_REPO_USERNAME = get_or_create_env_var("GITHUB_REPO_USERNAME", "seanpedrick-case")
+GITHUB_REPO_NAME = get_or_create_env_var("GITHUB_REPO_NAME", "doc_redaction")
+GITHUB_REPO_BRANCH = get_or_create_env_var("GITHUB_REPO_BRANCH", "main")
+### CODEBUILD
+CODEBUILD_ROLE_NAME = get_or_create_env_var(
+    "CODEBUILD_ROLE_NAME", f"{CDK_PREFIX}CodeBuildRole"
+)
+CODEBUILD_PROJECT_NAME = get_or_create_env_var(
+    "CODEBUILD_PROJECT_NAME", f"{CDK_PREFIX}CodeBuildProject"
+)
+### ECR
+ECR_REPO_NAME = get_or_create_env_var(
+    "ECR_REPO_NAME", "doc-redaction"
+)  # Beware - cannot have underscores and must be lower case
+# Only the default is lower-cased; a value supplied via the environment is used as-is.
+ECR_CDK_REPO_NAME = get_or_create_env_var(
+    "ECR_CDK_REPO_NAME", f"{CDK_PREFIX}{ECR_REPO_NAME}".lower()
+)
+### S3
+S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var(
+    "S3_LOG_CONFIG_BUCKET_NAME", f"{CDK_PREFIX}s3-logs".lower()
+)  # S3 bucket names need to be lower case
+S3_OUTPUT_BUCKET_NAME = get_or_create_env_var(
+    "S3_OUTPUT_BUCKET_NAME", f"{CDK_PREFIX}s3-output".lower()
+)
+### KMS KEYS FOR S3 AND SECRETS MANAGER
+# NOTE: this flag defaults to "1", unlike the "True"/"False" flags elsewhere in this file.
+USE_CUSTOM_KMS_KEY = get_or_create_env_var("USE_CUSTOM_KMS_KEY", "1")
+CUSTOM_KMS_KEY_NAME = get_or_create_env_var(
+    "CUSTOM_KMS_KEY_NAME", f"alias/{CDK_PREFIX}kms-key".lower()
+)
+### ECS
+FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var(
+    "FARGATE_TASK_DEFINITION_NAME", f"{CDK_PREFIX}FargateTaskDefinition"
+)
+# Default path is built by plain string concatenation, so CDK_FOLDER and
+# CONFIG_FOLDER each need their own trailing separator.
+TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var(
+    "TASK_DEFINITION_FILE_LOCATION", CDK_FOLDER + CONFIG_FOLDER + "task_definition.json"
+)
+CLUSTER_NAME = get_or_create_env_var("CLUSTER_NAME", f"{CDK_PREFIX}Cluster")
+ECS_SERVICE_NAME = get_or_create_env_var("ECS_SERVICE_NAME", f"{CDK_PREFIX}ECSService")
+ECS_TASK_ROLE_NAME = get_or_create_env_var(
+    "ECS_TASK_ROLE_NAME", f"{CDK_PREFIX}TaskRole"
+)
+ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var(
+    "ECS_TASK_EXECUTION_ROLE_NAME", f"{CDK_PREFIX}ExecutionRole"
+)
+ECS_SECURITY_GROUP_NAME = get_or_create_env_var(
+    "ECS_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupECS"
+)
+ECS_LOG_GROUP_NAME = get_or_create_env_var(
+    "ECS_LOG_GROUP_NAME", f"/ecs/{ECS_SERVICE_NAME}-logs".lower()
+)
+# CPU and memory sizes are kept as strings here (no int conversion).
+ECS_TASK_CPU_SIZE = get_or_create_env_var("ECS_TASK_CPU_SIZE", "1024")
+ECS_TASK_MEMORY_SIZE = get_or_create_env_var("ECS_TASK_MEMORY_SIZE", "4096")
+# NOTE: the environment variable read here is USE_FARGATE_SPOT (no ECS_ prefix),
+# although the Python constant is ECS_USE_FARGATE_SPOT.
+ECS_USE_FARGATE_SPOT = get_or_create_env_var("USE_FARGATE_SPOT", "False")
+ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var("ECS_READ_ONLY_FILE_SYSTEM", "True")
+### Cognito
+COGNITO_USER_POOL_NAME = get_or_create_env_var(
+    "COGNITO_USER_POOL_NAME", f"{CDK_PREFIX}UserPool"
+)
+COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var(
+    "COGNITO_USER_POOL_CLIENT_NAME", f"{CDK_PREFIX}UserPoolClient"
+)
+COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var(
+    "COGNITO_USER_POOL_CLIENT_SECRET_NAME", f"{CDK_PREFIX}ParamCognitoSecret"
+)
+COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
+    "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
+)  # Should change this to something unique or you'll probably hit an error
+# The three validity values are converted with int(), so a non-numeric
+# environment value raises ValueError at import time.
+COGNITO_REFRESH_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
+)  # Minutes
+COGNITO_ID_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_ID_TOKEN_VALIDITY", "60")
+)  # Minutes
+COGNITO_ACCESS_TOKEN_VALIDITY = int(
+    get_or_create_env_var("COGNITO_ACCESS_TOKEN_VALIDITY", "60")
+)  # Minutes
+# Application load balancer
+ALB_NAME = get_or_create_env_var(
+    "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
+)  # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
+# NOTE: the environment variable read here is ALB_SECURITY_GROUP_NAME, although
+# the Python constant is ALB_NAME_SECURITY_GROUP_NAME.
+ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var(
+    "ALB_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupALB"
+)
+ALB_TARGET_GROUP_NAME = get_or_create_env_var(
+    "ALB_TARGET_GROUP_NAME", f"{CDK_PREFIX}-tg"[-32:]
+)  # Max 32 characters
+EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var("EXISTING_LOAD_BALANCER_ARN", "")
+EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var(
+    "EXISTING_LOAD_BALANCER_DNS", "placeholder_load_balancer_dns.net"
+)
+## CLOUDFRONT
+USE_CLOUDFRONT = get_or_create_env_var("USE_CLOUDFRONT", "True")
+CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var(
+    "CLOUDFRONT_PREFIX_LIST_ID", "pl-93a247fa"
+)
+CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var(
+    "CLOUDFRONT_GEO_RESTRICTION", ""
+)  # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
+CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var(
+    "CLOUDFRONT_DISTRIBUTION_NAME", f"{CDK_PREFIX}CfDist"
+)
+CLOUDFRONT_DOMAIN = get_or_create_env_var(
+    "CLOUDFRONT_DOMAIN", "cloudfront_placeholder.net"
+)
+# Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
+ACM_SSL_CERTIFICATE_ARN = get_or_create_env_var("ACM_SSL_CERTIFICATE_ARN", "")
+SSL_CERTIFICATE_DOMAIN = get_or_create_env_var(
+    "SSL_CERTIFICATE_DOMAIN", ""
+)  # e.g. example.com or www.example.com
+# ECS Express Mode (opt-in HTTPS ingress without supplying ACM_SSL_CERTIFICATE_ARN).
+# Pilot/dev: Express PrimaryContainer does not support S3 environmentFiles or Fargate mount points.
+USE_ECS_EXPRESS_MODE = get_or_create_env_var("USE_ECS_EXPRESS_MODE", "False")
+ECS_EXPRESS_SERVICE_NAME = get_or_create_env_var(
+    "ECS_EXPRESS_SERVICE_NAME", ECS_SERVICE_NAME
+)
+ECS_EXPRESS_HEALTH_CHECK_PATH = get_or_create_env_var(
+    "ECS_EXPRESS_HEALTH_CHECK_PATH", "/"
+)
+ECS_EXPRESS_INFRASTRUCTURE_ROLE_NAME = get_or_create_env_var(
+    "ECS_EXPRESS_INFRASTRUCTURE_ROLE_NAME", f"{CDK_PREFIX}ExpressInfraRole"
+)
+# After first deploy, set to ExpressServiceEndpoint output (https://...) if not using CloudFront.
+ECS_EXPRESS_COGNITO_REDIRECT_BASE = get_or_create_env_var(
+    "ECS_EXPRESS_COGNITO_REDIRECT_BASE", ""
+)
+# Fail fast at import time: Express mode and a user-supplied ACM certificate are
+# treated as mutually exclusive.
+if USE_ECS_EXPRESS_MODE == "True" and ACM_SSL_CERTIFICATE_ARN:
+    raise ValueError(
+        "USE_ECS_EXPRESS_MODE=True cannot be used with ACM_SSL_CERTIFICATE_ARN set. "
+        "Clear ACM_SSL_CERTIFICATE_ARN or set USE_ECS_EXPRESS_MODE=False."
+    )
+# ECS Service Connect (legacy Fargate only): VPC service-to-service HTTP to Gradio/FastAPI.
+ENABLE_ECS_SERVICE_CONNECT = get_or_create_env_var(
+    "ENABLE_ECS_SERVICE_CONNECT", "False"
+)
+# Default namespace: lower-cased prefix + "local", underscores replaced by hyphens
+# and outer hyphens stripped; falls back to "redaction-local" if that is empty.
+ECS_SERVICE_CONNECT_NAMESPACE = get_or_create_env_var(
+    "ECS_SERVICE_CONNECT_NAMESPACE",
+    (f"{CDK_PREFIX}local".lower().replace("_", "-").strip("-") or "redaction-local"),
+)
+ECS_SERVICE_CONNECT_DISCOVERY_NAME = get_or_create_env_var(
+    "ECS_SERVICE_CONNECT_DISCOVERY_NAME", "redaction"
+)
+# Optional friendly DNS label; defaults to discovery name when empty.
+ECS_SERVICE_CONNECT_DNS_NAME = get_or_create_env_var("ECS_SERVICE_CONNECT_DNS_NAME", "")
+# Client task security groups (at least one of IDs, names, or CDK prefixes required when SC on).
+ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS = get_or_create_env_var(
+    "ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS", ""
+)
+ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS_LIST = parse_comma_separated_list(
+    ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS
+)
+ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES = get_or_create_env_var(
+    "ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES", ""
+)
+ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES = get_or_create_env_var(
+    "ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES", ""
+)
+# This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
+# Default precedence: CloudFront domain, then SSL certificate domain, then (in
+# Express mode) ECS_EXPRESS_COGNITO_REDIRECT_BASE, then the existing ALB DNS.
+# An explicit COGNITO_REDIRECTION_URL in the environment wins in every branch.
+if USE_CLOUDFRONT == "True":
+    COGNITO_REDIRECTION_URL = get_or_create_env_var(
+        "COGNITO_REDIRECTION_URL", "https://" + CLOUDFRONT_DOMAIN
+    )
+elif SSL_CERTIFICATE_DOMAIN:
+    COGNITO_REDIRECTION_URL = get_or_create_env_var(
+        "COGNITO_REDIRECTION_URL", "https://" + SSL_CERTIFICATE_DOMAIN
+    )
+elif USE_ECS_EXPRESS_MODE == "True":
+    _express_redirect_default = ECS_EXPRESS_COGNITO_REDIRECT_BASE or (
+        "https://" + EXISTING_LOAD_BALANCER_DNS
+    )
+    COGNITO_REDIRECTION_URL = get_or_create_env_var(
+        "COGNITO_REDIRECTION_URL", _express_redirect_default
+    )
+else:
+    COGNITO_REDIRECTION_URL = get_or_create_env_var(
+        "COGNITO_REDIRECTION_URL", "https://" + EXISTING_LOAD_BALANCER_DNS
+    )
+# Custom headers e.g. if routing traffic through Cloudfront
+CUSTOM_HEADER = get_or_create_env_var(
+    "CUSTOM_HEADER", ""
+)  # Retrieving or setting CUSTOM_HEADER
+CUSTOM_HEADER_VALUE = get_or_create_env_var(
+    "CUSTOM_HEADER_VALUE", ""
+)  # Retrieving or setting CUSTOM_HEADER_VALUE
+# Firewall on top of load balancer
+LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var(
+    "LOAD_BALANCER_WEB_ACL_NAME", f"{CDK_PREFIX}alb-web-acl"
+)
+# Firewall on top of CloudFront
+WEB_ACL_NAME = get_or_create_env_var("WEB_ACL_NAME", f"{CDK_PREFIX}cloudfront-web-acl")
+###
+# File I/O options
+###
+OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/")  # 'output/'
+INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/")  # 'input/'
+# Allow for files to be saved in a temporary folder for increased security in some instances
+if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
+    # Create a temporary directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        print(f"Temporary directory created at: {temp_dir}")
+        if OUTPUT_FOLDER == "TEMP":
+            OUTPUT_FOLDER = temp_dir + "/"
+        if INPUT_FOLDER == "TEMP":
+            INPUT_FOLDER = temp_dir + "/"
+###
+# LOGGING OPTIONS
+###
+SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
+### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
+SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "True")
+ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
+    "ACCESS_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-access-logs".lower()
+)
+FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
+    "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-feedback-logs".lower()
+)
+USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
+    "USAGE_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-usage-logs".lower()
+)
+###
+# REDACTION OPTIONS
+###
+# App-level settings read here for the deployment (nothing is launched from this module)
+COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
+# Converted with int(), so a non-numeric value raises ValueError at import time.
+GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
+# Must match the named port mapping on the Fargate container (see cdk_stack.py).
+ECS_SERVICE_CONNECT_PORT_MAPPING_NAME = get_or_create_env_var(
+    "ECS_SERVICE_CONNECT_PORT_MAPPING_NAME", f"port-{GRADIO_SERVER_PORT}"
+)
+# Suffix used with ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES (matches this stack's ECS SG name).
+# When the ECS security group name starts with CDK_PREFIX, the remainder of the
+# name is the default suffix; otherwise "SecurityGroupECS" is used.
+if ECS_SECURITY_GROUP_NAME.startswith(CDK_PREFIX):
+    _default_sc_client_sg_suffix = ECS_SECURITY_GROUP_NAME[len(CDK_PREFIX) :]
+else:
+    _default_sc_client_sg_suffix = "SecurityGroupECS"
+ECS_SERVICE_CONNECT_CLIENT_SG_NAME_SUFFIX = get_or_create_env_var(
+    "ECS_SERVICE_CONNECT_CLIENT_SG_NAME_SUFFIX", _default_sc_client_sg_suffix
+)
+# Parsed list forms of the comma-separated client security group settings.
+ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES_LIST = parse_comma_separated_list(
+    ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES
+)
+ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES_LIST = parse_comma_separated_list(
+    ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES
+)
+def build_service_connect_client_security_group_names() -> List[str]:
+    """Explicit SG names plus {prefix}{suffix} for each client CDK_PREFIX."""
+    names: List[str] = list(ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES_LIST)
+    for prefix in ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES_LIST:
+        names.append(f"{prefix}{ECS_SERVICE_CONNECT_CLIENT_SG_NAME_SUFFIX}")
+    deduped: List[str] = []
+    seen = set()
+    for name in names:
+        if name and name not in seen:
+            seen.add(name)
+            deduped.append(name)
+    return deduped
+# Computed once at import time from the parsed name and prefix lists.
+ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES_TO_LOOKUP = (
+    build_service_connect_client_security_group_names()
+)
+# Configuration guard: Service Connect and Express mode are rejected together.
+if ENABLE_ECS_SERVICE_CONNECT == "True" and USE_ECS_EXPRESS_MODE == "True":
+    raise ValueError(
+        "ENABLE_ECS_SERVICE_CONNECT=True is only supported on the legacy Fargate "
+        "service path. Set USE_ECS_EXPRESS_MODE=False or disable Service Connect."
+    )
+# S3-uploaded job .env files trigger one-shot ECS Fargate tasks (direct mode / cli_redact).
+ENABLE_S3_BATCH_ECS_TRIGGER = get_or_create_env_var(
+    "ENABLE_S3_BATCH_ECS_TRIGGER", "False"
+)
+S3_BATCH_ENV_PREFIX = get_or_create_env_var("S3_BATCH_ENV_PREFIX", "input/config/")
+S3_BATCH_ENV_SUFFIX = get_or_create_env_var("S3_BATCH_ENV_SUFFIX", ".env")
+S3_BATCH_INPUT_PREFIX = get_or_create_env_var("S3_BATCH_INPUT_PREFIX", "input/")
+S3_BATCH_CONFIG_PREFIX = get_or_create_env_var("S3_BATCH_CONFIG_PREFIX", "")
+S3_BATCH_DEFAULT_PARAMS_KEY = get_or_create_env_var(
+    "S3_BATCH_DEFAULT_PARAMS_KEY", "general-config/batch_defaults.env"
+)
+S3_BATCH_LAMBDA_FUNCTION_NAME = get_or_create_env_var(
+    "S3_BATCH_LAMBDA_FUNCTION_NAME", ""
+)
+# Configuration guard: the batch trigger and Express mode are rejected together.
+if ENABLE_S3_BATCH_ECS_TRIGGER == "True" and USE_ECS_EXPRESS_MODE == "True":
+    raise ValueError(
+        "ENABLE_S3_BATCH_ECS_TRIGGER=True requires the legacy Fargate task definition "
+        "for ecs.run_task. Set USE_ECS_EXPRESS_MODE=False or disable the batch trigger."
+    )
+# Pi agent Gradio UI (second Fargate service; shared legacy ALB + Service Connect to main app).
+ENABLE_PI_AGENT_ECS_SERVICE = get_or_create_env_var(
+    "ENABLE_PI_AGENT_ECS_SERVICE", "False"
+)
+ECR_PI_REPO_NAME = get_or_create_env_var(
+    "ECR_PI_REPO_NAME", f"{CDK_PREFIX}pi-agent".lower()
+)
+CODEBUILD_PI_PROJECT_NAME = get_or_create_env_var(
+    "CODEBUILD_PI_PROJECT_NAME", f"{CDK_PREFIX}CodeBuildPiAgent"
+)
+ECS_PI_SERVICE_NAME = get_or_create_env_var(
+    "ECS_PI_SERVICE_NAME", f"{CDK_PREFIX}PiAgentService"
+)
+ECS_PI_TASK_DEFINITION_NAME = get_or_create_env_var(
+    "ECS_PI_TASK_DEFINITION_NAME", f"{CDK_PREFIX}PiAgentTaskDefinition"
+)
+ECS_PI_SECURITY_GROUP_NAME = get_or_create_env_var(
+    "ECS_PI_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupPiAgent"
+)
+ECS_PI_LOG_GROUP_NAME = get_or_create_env_var(
+    "ECS_PI_LOG_GROUP_NAME", f"/ecs/{ECS_PI_SERVICE_NAME}-logs".lower()
+)
+ECS_PI_TASK_CPU_SIZE = get_or_create_env_var("ECS_PI_TASK_CPU_SIZE", "1024")
+ECS_PI_TASK_MEMORY_SIZE = get_or_create_env_var("ECS_PI_TASK_MEMORY_SIZE", "2048")
+# NOTE: PI_GRADIO_PORT stays a string, whereas GRADIO_SERVER_PORT is converted to int.
+PI_GRADIO_PORT = get_or_create_env_var("PI_GRADIO_PORT", "7862")
+PI_ALB_HOST_HEADER = get_or_create_env_var("PI_ALB_HOST_HEADER", "")
+PI_ALB_TARGET_GROUP_NAME = get_or_create_env_var(
+    "PI_ALB_TARGET_GROUP_NAME", f"{CDK_PREFIX}PiAgentTG"[-32:]
+)
+PI_ALB_LISTENER_RULE_PRIORITY = int(
+    get_or_create_env_var("PI_ALB_LISTENER_RULE_PRIORITY", "1")
+)
+PI_AGENT_ENV_S3_KEY = get_or_create_env_var("PI_AGENT_ENV_S3_KEY", "pi_agent.env")
+# Pi agent preconditions, each checked at import time: Express mode off,
+# Service Connect on, and a non-blank ALB host header.
+if ENABLE_PI_AGENT_ECS_SERVICE == "True" and USE_ECS_EXPRESS_MODE == "True":
+    raise ValueError(
+        "ENABLE_PI_AGENT_ECS_SERVICE=True requires legacy Fargate (USE_ECS_EXPRESS_MODE=False)."
+    )
+if ENABLE_PI_AGENT_ECS_SERVICE == "True" and ENABLE_ECS_SERVICE_CONNECT != "True":
+    raise ValueError(
+        "ENABLE_PI_AGENT_ECS_SERVICE=True requires ENABLE_ECS_SERVICE_CONNECT=True "
+        "so the Pi task can reach the main app at http://<discovery>:7860."
+    )
+if ENABLE_PI_AGENT_ECS_SERVICE == "True" and not PI_ALB_HOST_HEADER.strip():
+    raise ValueError(
+        "ENABLE_PI_AGENT_ECS_SERVICE=True requires PI_ALB_HOST_HEADER "
+        "(host-header rule on the shared ALB, e.g. pi.redaction.example.com)."
+    )
+###
+# WHOLE DOCUMENT API OPTIONS
+###
+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var(
+    "DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7"
+)  # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.

cdk/cdk_functions.py ADDED Viewed

	@@ -0,0 +1,2448 @@

+import ipaddress
+import json
+import os
+from typing import Any, Dict, FrozenSet, List, Optional, Tuple, Union
+import boto3
+import pandas as pd
+from aws_cdk import App, CfnOutput, CfnTag, Duration, Fn, RemovalPolicy, Tags
+from aws_cdk import aws_cognito as cognito
+from aws_cdk import aws_ec2 as ec2
+from aws_cdk import aws_ecs as ecs
+from aws_cdk import aws_elasticloadbalancingv2 as elb
+from aws_cdk import aws_elasticloadbalancingv2_actions as elb_act
+from aws_cdk import aws_iam as iam
+from aws_cdk import aws_lambda as lambda_
+from aws_cdk import aws_logs as logs
+from aws_cdk import aws_s3 as s3
+from aws_cdk import aws_s3_notifications as s3n
+from aws_cdk import aws_secretsmanager as secretsmanager
+from aws_cdk import aws_wafv2 as wafv2
+from aws_cdk import custom_resources as cr
+from botocore.exceptions import ClientError, NoCredentialsError
+from cdk_config import (
+    ACCESS_LOG_DYNAMODB_TABLE_NAME,
+    AWS_REGION,
+    FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+    NAT_GATEWAY_EIP_NAME,
+    POLICY_FILE_LOCATIONS,
+    PRIVATE_SUBNET_AVAILABILITY_ZONES,
+    PRIVATE_SUBNET_CIDR_BLOCKS,
+    PRIVATE_SUBNETS_TO_USE,
+    PUBLIC_SUBNET_AVAILABILITY_ZONES,
+    PUBLIC_SUBNET_CIDR_BLOCKS,
+    PUBLIC_SUBNETS_TO_USE,
+    S3_LOG_CONFIG_BUCKET_NAME,
+    S3_OUTPUT_BUCKET_NAME,
+    USAGE_LOG_DYNAMODB_TABLE_NAME,
+)
+from constructs import Construct
+from dotenv import dotenv_values, set_key
# Key prefixes under which the CDK CLI caches lookup-provider results in cdk.context.json.
_CDK_LOOKUP_CONTEXT_PREFIXES = (
    "vpc-provider:",
    "load-balancer:",
    "availability-zones:",
    "hosted-zone:",
    "security-group:",
    "key-provider:",
    "ami:",
)


def purge_cdk_lookup_context(file_path: str) -> int:
    """Drop cached CDK lookup entries from a JSON context file.

    Every top-level key that starts with one of ``_CDK_LOOKUP_CONTEXT_PREFIXES``
    is removed. The file is rewritten only when at least one key was dropped.

    Args:
        file_path: Path to the JSON context file (e.g. ``cdk.context.json``).

    Returns:
        The number of keys removed; 0 when the file does not exist or holds
        no lookup entries.
    """
    if not os.path.exists(file_path):
        return 0

    with open(file_path, "r", encoding="utf-8") as handle:
        original_context = json.load(handle)

    retained = {}
    for name, payload in original_context.items():
        # str.startswith accepts a tuple, so one call tests every prefix.
        if name.startswith(_CDK_LOOKUP_CONTEXT_PREFIXES):
            continue
        retained[name] = payload

    removed = len(original_context) - len(retained)
    if removed:
        with open(file_path, "w", encoding="utf-8") as handle:
            json.dump(retained, handle, indent=2)
        print(f"Removed {removed} stale CDK lookup context key(s) from {file_path}.")
    return removed
def log_aws_credential_context(
    expected_account_id: Optional[str] = None,
    expected_region: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Print the active AWS identity and non-secret credential hints for CDK debugging.

    Helps distinguish SSO/assumed-role sessions from long-lived access keys in
    ~/.aws/credentials or environment variables. Only "set"/"not set" flags and
    the first four characters of the access key ID are printed.

    Because this is a diagnostic helper, botocore failures while building the
    session, resolving credentials or calling STS are reported as warnings and
    recorded under the ``"error"`` key instead of being raised.

    Args:
        expected_account_id: Account ID the CDK app is configured to target.
            When given, it is printed and compared with the caller's account.
        expected_region: Region the CDK app is configured to target. When
            given, it is printed and used as the STS client region if the
            boto3 session has no region of its own.

    Returns:
        A summary dict. Always contains ``"profile"``. On success it also holds
        ``"session_profile"``, ``"access_key_prefix"``,
        ``"inferred_credential_type"``, ``"account"``, ``"arn"``, ``"user_id"``
        and ``"principal_kind"``, plus ``"account_mismatch"`` when the caller
        account differs from ``expected_account_id``. On failure it contains
        ``"error"`` and whichever keys were collected before the failure.
    """
    # Function-scope import keeps this block self-contained; the module already
    # depends on botocore.exceptions for ClientError / NoCredentialsError.
    from botocore.exceptions import BotoCoreError

    end_marker = "--- End AWS credential context ---\n"
    profile_not_set = "(not set — using default profile chain)"

    profile = os.environ.get("AWS_PROFILE") or profile_not_set
    default_region = (
        os.environ.get("AWS_REGION")
        or os.environ.get("AWS_DEFAULT_REGION")
        or "(not set in environment)"
    )
    env_access_key_set = bool(os.environ.get("AWS_ACCESS_KEY_ID"))
    env_secret_key_set = bool(os.environ.get("AWS_SECRET_ACCESS_KEY"))
    env_session_token_set = bool(os.environ.get("AWS_SESSION_TOKEN"))

    print("\n--- AWS credential context (CDK / boto3) ---")
    print(f"AWS_PROFILE: {profile}")
    print(f"AWS_REGION / AWS_DEFAULT_REGION (env): {default_region}")
    print(
        "Environment credential variables: "
        f"AWS_ACCESS_KEY_ID={'set' if env_access_key_set else 'not set'}, "
        f"AWS_SECRET_ACCESS_KEY={'set' if env_secret_key_set else 'not set'}, "
        f"AWS_SESSION_TOKEN={'set' if env_session_token_set else 'not set'}"
    )
    if expected_account_id:
        print(f"Configured CDK target account (AWS_ACCOUNT_ID): {expected_account_id}")
    if expected_region:
        print(f"Configured CDK target region (AWS_REGION): {expected_region}")

    credential_summary: Dict[str, Any] = {"profile": profile}

    def _fail(warning: str, error: str) -> Dict[str, Any]:
        # Shared exit path: warn, close the banner, record the error.
        print(warning)
        print(end_marker)
        credential_summary["error"] = error
        return credential_summary

    # Building the session can fail before any credentials are looked up
    # (for example when AWS_PROFILE names a profile that does not exist).
    try:
        session = boto3.Session()
        active_profile = session.profile_name or "(default)"
        session_region = session.region_name
    except BotoCoreError as exc:
        return _fail(f"WARNING: Could not create boto3 session: {exc}", str(exc))

    print(f"boto3 session profile: {active_profile}")
    print(f"boto3 session region: {session_region or '(not set)'}")
    credential_summary["session_profile"] = active_profile

    # Resolving/freezing credentials may trigger a refresh, which can also fail.
    try:
        credentials = session.get_credentials()
        frozen = (
            credentials.get_frozen_credentials() if credentials is not None else None
        )
    except BotoCoreError as exc:
        return _fail(f"WARNING: Could not resolve AWS credentials: {exc}", str(exc))

    if frozen is None:
        return _fail(
            "WARNING: No AWS credentials found in the default provider chain.",
            "no_credentials",
        )

    access_key = frozen.access_key or ""
    # Only the 4-character key-type prefix is exposed, never the full key ID.
    access_key_prefix = (access_key[:4] + "...") if len(access_key) >= 4 else "(none)"
    credential_summary["access_key_prefix"] = access_key_prefix

    if env_access_key_set:
        credential_source = "environment variables (highest precedence)"
    elif access_key.startswith("AKIA"):
        credential_source = "long-lived access key (likely ~/.aws/credentials [default] or named profile)"
    elif access_key.startswith("ASIA"):
        credential_source = "temporary credentials (SSO, assumed role, or STS session)"
    else:
        credential_source = (
            "resolved credentials (source could not be classified from key prefix)"
        )
    print(f"Inferred credential type: {credential_source}")
    credential_summary["inferred_credential_type"] = credential_source

    if env_access_key_set and profile != profile_not_set:
        print(
            "NOTE: AWS_ACCESS_KEY_ID is set in the environment, so it overrides "
            f"profile '{profile}' and SSO."
        )

    try:
        sts = session.client("sts", region_name=session_region or expected_region)
        identity = sts.get_caller_identity()
    except (ClientError, BotoCoreError) as exc:
        # BotoCoreError also covers NoCredentialsError and connection problems.
        return _fail(f"WARNING: sts:GetCallerIdentity failed: {exc}", str(exc))

    account = identity.get("Account", "")
    arn = identity.get("Arn", "")
    user_id = identity.get("UserId", "")
    print(f"Caller account: {account}")
    print(f"Caller ARN: {arn}")
    print(f"Caller UserId: {user_id}")

    if ":assumed-role/" in arn:
        principal_kind = "assumed IAM role (typical for SSO or role chaining)"
    elif ":user/" in arn:
        principal_kind = "IAM user (typical for static access keys in credentials file)"
    elif ":federated-user/" in arn:
        principal_kind = "federated user"
    else:
        principal_kind = "other IAM principal"
    print(f"Principal kind: {principal_kind}")

    credential_summary.update(
        {
            "account": account,
            "arn": arn,
            "user_id": user_id,
            "principal_kind": principal_kind,
        }
    )

    if expected_account_id and account and account != str(expected_account_id):
        print(
            "WARNING: Caller account does not match configured AWS_ACCOUNT_ID. "
            "CDK will target the configured account but act as this identity — "
            "deployments and lookups may fail. Set AWS_PROFILE to your SSO profile "
            "and unset AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY if needed."
        )
        credential_summary["account_mismatch"] = True
    elif expected_account_id and account == str(expected_account_id):
        print("Caller account matches configured AWS_ACCOUNT_ID.")

    if profile == profile_not_set:
        print(
            "TIP: Set AWS_PROFILE to your SSO profile name so Python and the CDK CLI "
            "(Node) use the same session. Example: "
            '$env:AWS_PROFILE = "YourSsoProfileName"'
        )
    print(end_marker)
    return credential_summary
+# --- Function to load context from file ---
def load_context_from_file(app: App, file_path: str):
    """Copy every top-level entry of a JSON context file into the CDK app.

    Args:
        app: CDK App whose node context receives the entries.
        file_path: Path to the JSON context file.

    A missing file is not an error: a notice is printed and nothing is set.
    """
    if not os.path.exists(file_path):
        print(f"Context file not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as context_file:
        entries = json.load(context_file)
        for context_key, context_value in entries.items():
            app.node.set_context(context_key, context_value)
        print(f"Loaded context from {file_path}")
+# --- Helper to parse environment variables into lists ---
def _get_env_list(env_var_name: str) -> List[str]:
    """Parse a list-like string into a list of stripped, non-empty strings.

    Despite its name, ``env_var_name`` is the raw VALUE to parse (for example
    ``"['10.0.0.0/24', '10.0.1.0/24']"``), not the name of an environment
    variable; nothing is read from ``os.environ`` here.

    Surrounding square brackets are removed only when both are present, so a
    plain comma-separated value such as ``"a,b"`` no longer loses its first
    and last character. Single and double quotes are dropped, and empty items
    produced by stray commas are discarded.

    Args:
        env_var_name: The string to parse (parameter name kept for
            backward compatibility with existing callers).

    Returns:
        The parsed items, or an empty list when nothing remains.
    """
    value = env_var_name.strip()
    # Only peel off brackets that are really there.
    if len(value) >= 2 and value[0] == "[" and value[-1] == "]":
        value = value[1:-1]
    value = value.replace('"', "").replace("'", "")
    # Split by comma and filter out empty strings left by extra commas.
    return [item.strip() for item in value.split(",") if item.strip()]
# 1. Try to load CIDR/AZs from environment variables
# Each non-empty setting imported from cdk_config is re-bound to its parsed list
# form. The setting's VALUE must be passed to _get_env_list: that helper parses
# the string it is given, so passing the setting's name as a literal would
# yield a list containing the mangled name instead of the configured items.
# (Presumably each value is a list-like string such as "['a', 'b']" - confirm
# against cdk_config.)
if PUBLIC_SUBNETS_TO_USE:
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE:
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
if PUBLIC_SUBNET_CIDR_BLOCKS:
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES:
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS:
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES:
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(
        PRIVATE_SUBNET_AVAILABILITY_ZONES
    )
if POLICY_FILE_LOCATIONS:
    POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
def check_for_existing_role(role_name: str):
    """Look up an IAM role by name through boto3.

    Args:
        role_name: Name of the IAM role to look up.

    Returns:
        ``(True, role_arn, "")`` when the role exists, ``(False, "", "")``
        when IAM reports ``NoSuchEntity``.

    Raises:
        Exception: For any other failure of the lookup, chained to the
            original error.
    """
    # Named iam_client so it does not shadow the module-level `iam`
    # (aws_cdk.aws_iam) import, and created before the try so the except
    # clause below can always reference `iam_client.exceptions`.
    iam_client = boto3.client("iam")
    try:
        response = iam_client.get_role(RoleName=role_name)
        role = response["Role"]["Arn"]
        print("Response Role:", role)
        return True, role, ""
    except iam_client.exceptions.NoSuchEntityException:
        return False, "", ""
    except Exception as e:
        raise Exception(
            f"Getting information on IAM role failed due to: {e}"
        ) from e
+from typing import List
+# Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
+# For example:
+# POLICY_FILE_LOCATIONS = ["./policies/my_read_policy.json", "./policies/my_write_policy.json"]
def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
    """
    Attach each statement of a parsed IAM policy document to a CDK role.

    Statements are handled one at a time: a statement that cannot be converted
    or attached is reported with a warning and the remaining ones are still
    processed.

    Args:
        role: The CDK Role construct to attach policies to.
        policy_document: A Python dictionary representing an IAM policy document.
    """
    has_statement_list = "Statement" in policy_document and isinstance(
        policy_document["Statement"], list
    )
    if not has_statement_list:
        print("Warning: Policy document does not contain a 'Statement' list. Skipping.")
        return

    for raw_statement in policy_document["Statement"]:
        try:
            # Build the CDK statement from the raw dict and attach it in one go.
            role.add_to_policy(iam.PolicyStatement.from_json(raw_statement))
            print(f"  - Added statement: {raw_statement.get('Sid', 'No Sid')}")
        except Exception as error:
            print(
                f"Warning: Could not process policy statement: {raw_statement}. Error: {error}"
            )
def add_s3_enforce_ssl_policy(bucket: s3.IBucket) -> None:
    """Deny non-TLS S3 requests (Security Hub S3.5). Compatible with all CDK versions.

    Adds a DENY statement to the bucket's resource policy that applies to any
    principal and every S3 action, on the bucket and its objects, whenever
    ``aws:SecureTransport`` is ``"false"``.
    """
    bucket_and_objects = [bucket.bucket_arn, f"{bucket.bucket_arn}/*"]
    deny_insecure_transport = iam.PolicyStatement(
        effect=iam.Effect.DENY,
        principals=[iam.AnyPrincipal()],
        actions=["s3:*"],
        resources=bucket_and_objects,
        conditions={"Bool": {"aws:SecureTransport": "false"}},
    )
    bucket.add_to_resource_policy(deny_insecure_transport)
def add_custom_policies(
    scope: Construct,  # Not strictly used here, but good practice if you expand to ManagedPolicies
    role: iam.IRole,
    policy_file_locations: Optional[List[str]] = None,
    custom_policy_text: Optional[str] = None,
) -> iam.IRole:
    """
    Loads custom policies from JSON files or a string and attaches them to a CDK Role.

    Processing is best-effort: a missing file, invalid JSON or any other error
    for one source is printed as a warning and the remaining sources are still
    processed. No exception is propagated to the caller.

    Args:
        scope: The scope in which to define constructs (if needed, e.g., for iam.ManagedPolicy).
        role: The CDK Role construct to attach policies to.
        policy_file_locations: List of file paths to JSON policy documents.
        custom_policy_text: A JSON string representing a policy document.
    Returns:
        The modified CDK Role construct.
    """
    if policy_file_locations is None:
        policy_file_locations = []
    current_source = "unknown source"  # For error messages
    try:
        if policy_file_locations:
            print(f"Attempting to add policies from files to role {role.node.id}...")
            for path in policy_file_locations:
                current_source = f"file: {path}"
                try:
                    # Explicit UTF-8 so the policy JSON is decoded the same way
                    # on every platform, matching the other JSON reads in this
                    # module.
                    with open(path, "r", encoding="utf-8") as f:
                        policy_document = json.load(f)
                    print(f"Processing policy from {current_source}...")
                    add_statement_to_policy(role, policy_document)
                except FileNotFoundError:
                    print(f"Warning: Policy file not found at {path}. Skipping.")
                except json.JSONDecodeError as e:
                    print(
                        f"Warning: Invalid JSON in policy file {path}: {e}. Skipping."
                    )
                except Exception as e:
                    print(
                        f"An unexpected error occurred processing policy from {path}: {e}. Skipping."
                    )
        if custom_policy_text:
            current_source = "custom policy text string"
            print(
                f"Attempting to add policy from custom text to role {role.node.id}..."
            )
            try:
                # The text must be parsed into a dict before its statements can be added.
                policy_document = json.loads(custom_policy_text)
                print(f"Processing policy from {current_source}...")
                add_statement_to_policy(role, policy_document)
            except json.JSONDecodeError as e:
                print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
            except Exception as e:
                print(
                    f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping."
                )
        print(f"Finished processing custom policies for role {role.node.id}.")
    except Exception as e:
        # Last-resort guard for failures outside the per-source handlers above.
        print(
            f"An unhandled error occurred during policy addition for {current_source}: {e}"
        )
    return role
+# Import the S3 Bucket class if you intend to return a CDK object later
+# from aws_cdk import aws_s3 as s3
def check_s3_bucket_exists(
    bucket_name: str,
):
    """
    Probe an S3 bucket with ``head_bucket`` and report whether it is usable.

    Args:
        bucket_name: The name of the S3 bucket to check.
    Returns:
        A 2-tuple ``(exists, name)``:
        - ``(True, bucket_name)`` when ``head_bucket`` succeeds.
        - ``(False, None)`` when the error code is ``"404"``.
        - ``(False, bucket_name)`` when the error code is ``"403"``; the
          bucket is reported as not existing because, per the original
          author's testing, 403 was also returned for buckets that do not
          exist.
    Raises:
        ClientError: Re-raised for any other AWS error code.
        Exception: Any non-ClientError failure is re-raised unchanged.
    """
    client = boto3.client("s3")
    try:
        # head_bucket checks both existence and access in one call.
        client.head_bucket(Bucket=bucket_name)
        print(f"Bucket '{bucket_name}' exists and is accessible.")
        return True, bucket_name
    except ClientError as client_error:
        code = client_error.response["Error"]["Code"]
        if code == "404":
            print(f"Bucket '{bucket_name}' does not exist.")
            return False, None
        if code == "403":
            # Ambiguous answer: treated as "does not exist" on purpose.
            print(
                f"Bucket '{bucket_name}' returned 403, which indicates it may exist but is not accessible due to permissions, or that it doesn't exist. Returning False for existence just in case."
            )
            return False, bucket_name
        # Anything else is unexpected: report it and let the caller decide.
        print(
            f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {client_error}"
        )
        raise
    except Exception as other_error:
        print(
            f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {other_error}"
        )
        raise
+# Example usage in your check_resources.py:
+# exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
+# context_data[f"exists:{log_bucket_name}"] = exists
+# # You don't necessarily need to store the name in context if using from_bucket_name
+# Delete an S3 bucket
def delete_s3_bucket(bucket_name: str):
    """Delete every object version and delete marker in a bucket, then the bucket.

    The version listing is paginated, so buckets holding more entries than a
    single ``list_object_versions`` response can return are fully emptied
    before the bucket itself is deleted.

    Args:
        bucket_name: Name of the bucket to remove.

    Returns:
        ``{"Status": "SUCCESS"}`` on success, otherwise
        ``{"Status": "FAILED", "Reason": <error text>}``. Errors are reported
        through the return value rather than raised.
    """
    # Named s3_client so it does not shadow the module-level `s3` (aws_cdk.aws_s3) import.
    s3_client = boto3.client("s3")
    try:
        paginator = s3_client.get_paginator("list_object_versions")
        for page in paginator.paginate(Bucket=bucket_name):
            # Both real versions and delete markers must go before the bucket can.
            entries = page.get("Versions", []) + page.get("DeleteMarkers", [])
            for entry in entries:
                s3_client.delete_object(
                    Bucket=bucket_name, Key=entry["Key"], VersionId=entry["VersionId"]
                )
        # Delete the now-empty bucket
        s3_client.delete_bucket(Bucket=bucket_name)
        return {"Status": "SUCCESS"}
    except Exception as e:
        return {"Status": "FAILED", "Reason": str(e)}
+# Function to get subnet ID from subnet name
def get_subnet_id(vpc: Any, ec2_client: Any, subnet_name: str) -> Optional[str]:
    """Return the ID of the subnet in ``vpc`` whose ``Name`` tag equals ``subnet_name``.

    Args:
        vpc: Object exposing a ``vpc_id`` attribute (used as the lookup filter).
        ec2_client: Client exposing ``describe_subnets(Filters=...)``
            (presumably a boto3 EC2 client).
        subnet_name: Value of the ``Name`` tag to match.

    Returns:
        The matching ``SubnetId``, or ``None`` when no subnet carries that name.
        Subnets without any tags are skipped rather than raising ``KeyError``.
    """
    response = ec2_client.describe_subnets(
        Filters=[{"Name": "vpc-id", "Values": [vpc.vpc_id]}]
    )
    for subnet in response["Subnets"]:
        # The "Tags" key may be absent for untagged subnets.
        tags = subnet.get("Tags") or []
        if any(tag["Key"] == "Name" and tag["Value"] == subnet_name for tag in tags):
            return subnet["SubnetId"]
    return None
def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
    """
    Look up an ECR repository by name.

    Args:
        repo_name: The name of the ECR repository to check.
    Returns:
        ``(True, repository_description)`` when the repository is found, where
        the description is the first entry of the ``describe_repositories``
        response. ``(False, {})`` when ECR reports
        ``RepositoryNotFoundException`` or when a non-ClientError problem
        occurs (the latter is printed, not raised).
    Raises:
        ClientError: Re-raised for any AWS error other than
            ``RepositoryNotFoundException``.
    """
    ecr_client = boto3.client("ecr")
    try:
        print("ecr repo_name to check:", repo_name)
        repositories = ecr_client.describe_repositories(repositoryNames=[repo_name])[
            "repositories"
        ]
        found = len(repositories) > 0
        # Indexing an empty list raises IndexError, handled by the generic branch below.
        return found, repositories[0]
    except ClientError as client_error:
        if client_error.response["Error"]["Code"] != "RepositoryNotFoundException":
            # Unexpected AWS error: let the caller see it.
            raise
        return False, {}
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
def check_codebuild_project_exists(
    project_name: str,
):
    """
    Look up a CodeBuild project by name using ``batch_get_projects``.

    Args:
        project_name: The name of the CodeBuild project to check.
    Returns:
        A 3-tuple:
        - ``(True, project_arn, service_role)`` when the project is found;
          ``service_role`` is ``None`` if the response has no ``serviceRole``.
        - ``(False, None, None)`` when the project is not found.
    Raises:
        ClientError: Re-raised after logging (e.g. permission problems).
        Exception: Any other failure is logged and re-raised.
    """
    codebuild_client = boto3.client("codebuild")
    try:
        # batch_get_projects reports hits in 'projects' and misses in 'projectsNotFound'.
        response = codebuild_client.batch_get_projects(names=[project_name])
        found_projects = response["projects"]
        if found_projects:
            print(f"CodeBuild project '{project_name}' found.")
            project = found_projects[0]
            return (
                True,
                project["arn"],
                project.get("serviceRole"),
            )

        missing_names = response["projectsNotFound"]
        if missing_names and project_name in missing_names:
            # Explicitly reported as missing.
            print(f"CodeBuild project '{project_name}' not found.")
        else:
            # Neither list mentions the project; treat it as missing too.
            print(
                f"CodeBuild project '{project_name}' not found (not in 'projects' list)."
            )
        return False, None, None
    except ClientError as client_error:
        # A non-existent name is normally listed in 'projectsNotFound', so a
        # ClientError here points at something else (e.g. permissions).
        print(
            f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {client_error}"
        )
        raise
    except Exception as other_error:
        print(
            f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {other_error}"
        )
        raise
def get_vpc_id_by_name(vpc_name: str) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
    """
    Finds a VPC by its 'Name' tag and lists the NAT Gateways inside it.

    Args:
        vpc_name: Value of the VPC's ``Name`` tag.

    Returns:
        ``(vpc_id, nat_gateways)`` for the first matching VPC, where
        ``nat_gateways`` is the ``NatGateways`` list from
        ``describe_nat_gateways`` (empty if that call fails or returns none).
        ``None`` when no VPC carries the given name, so callers must check
        for ``None`` before unpacking the result.

    Raises:
        Exception: Any failure of the VPC lookup itself is printed and
            re-raised. A failure of the NAT Gateway lookup is only printed.
    """
    ec2_client = boto3.client("ec2")
    try:
        response = ec2_client.describe_vpcs(
            Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]
        )
        if response and response["Vpcs"]:
            vpc_id = response["Vpcs"][0]["VpcId"]
            print(f"VPC '{vpc_name}' found with ID: {vpc_id}")
            # Look for NAT Gateways in this VPC (same client is reused).
            nat_gateways = []
            try:
                response = ec2_client.describe_nat_gateways(
                    Filters=[
                        {"Name": "vpc-id", "Values": [vpc_id]},
                        # Optional: Add a tag filter if you consistently tag your NATs
                        # {'Name': 'tag:Name', 'Values': [f"{prefix}-nat-gateway"]}
                    ]
                )
                nat_gateways = response.get("NatGateways", [])
            except Exception as e:
                # Best-effort: the VPC ID is still returned with an empty list.
                print(
                    f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}"
                )
            return vpc_id, nat_gateways
        else:
            print(f"VPC '{vpc_name}' not found.")
            return None
    except Exception as e:
        print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
        raise
+# --- Helper to fetch all existing subnets in a VPC once ---
def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
    """
    Collect the subnets of a VPC, together with their explicitly associated
    route tables, in a single pass.

    Returns a dictionary with three entries:
    - 'by_name': Name tag -> subnet info (only subnets that have a Name tag).
    - 'by_id': subnet ID -> subnet info.
    - 'cidr_networks': list of ipaddress network objects, one per parseable CIDR.

    Each subnet info dict has the keys 'id', 'cidr', 'name', 'az' and
    'route_table_id' ('route_table_id' is None when no route table
    association lists the subnet).

    Raises:
        Exception: Any failure while querying AWS is printed and re-raised.
    """
    client = boto3.client("ec2")
    catalogue = {
        "by_name": {},  # {subnet_name: subnet info dict}
        "by_id": {},  # {subnet_id: subnet info dict}
        "cidr_networks": [],  # parsed ipaddress network objects
    }
    try:
        # Map each explicitly associated subnet to its route table.
        route_table_for_subnet: Dict[str, str] = {}
        tables_reply = client.describe_route_tables(
            Filters=[{"Name": "vpc-id", "Values": [vpc_id]}]
        )
        for table in tables_reply.get("RouteTables", []):
            table_id = table["RouteTableId"]
            for link in table.get("Associations", []):
                linked_subnet = link.get("SubnetId")
                if linked_subnet:
                    route_table_for_subnet[linked_subnet] = table_id

        subnets_reply = client.describe_subnets(
            Filters=[{"Name": "vpc-id", "Values": [vpc_id]}]
        )
        for record in subnets_reply.get("Subnets", []):
            record_id = record["SubnetId"]
            record_cidr = record.get("CidrBlock")

            # First 'Name' tag wins; None when the subnet has no such tag.
            label = None
            for tag in record.get("Tags", []):
                if tag["Key"] == "Name":
                    label = tag["Value"]
                    break

            details = {
                "id": record_id,
                "cidr": record_cidr,
                "name": label,
                "az": record.get("AvailabilityZone"),
                "route_table_id": route_table_for_subnet.get(record_id),
            }
            if label:
                catalogue["by_name"][label] = details
            catalogue["by_id"][record_id] = details

            if not record_cidr:
                continue
            try:
                catalogue["cidr_networks"].append(
                    ipaddress.ip_network(record_cidr, strict=False)
                )
            except ValueError:
                print(
                    f"Warning: Existing subnet {record_id} has an invalid CIDR: {record_cidr}. Skipping for overlap check."
                )
        print(
            f"Fetched {len(subnets_reply.get('Subnets', []))} existing subnets from VPC '{vpc_id}'."
        )
    except Exception as e:
        print(
            f"Error describing existing subnets in VPC '{vpc_id}': {e}. Cannot perform full validation."
        )
        raise  # This data is essential, so do not continue without it.
    return catalogue
+# --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
def validate_subnet_creation_parameters(
    vpc_id: str,
    proposed_subnets_data: List[
        Dict[str, str]
    ],  # e.g., [{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}]
    existing_aws_subnets_data: Dict[
        str, Any
    ],  # Pre-fetched data from _get_existing_subnets_in_vpc
) -> None:
    """
    Check a batch of proposed subnets for internal and external conflicts.

    Works purely on the supplied data; no AWS calls are made here.

    Args:
        vpc_id: The ID of the VPC (used only in log/error messages).
        proposed_subnets_data: One dict per proposed subnet with the keys
                               'name', 'cidr' and 'az'.
        existing_aws_subnets_data: Existing subnet data with at least the
                                   'by_name' mapping and the 'cidr_networks'
                                   list.
    Raises:
        ValueError: If an entry is incomplete, a name is repeated within the
                    batch, a CIDR cannot be parsed, or a CIDR overlaps either
                    another proposed CIDR or an existing one. A name that
                    already exists in the VPC is only reported, not rejected.
    """
    if not proposed_subnets_data:
        print("No proposed subnet data provided for validation. Skipping.")
        return

    print(
        f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---"
    )
    print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data["by_name"]))

    known_names = set(existing_aws_subnets_data["by_name"].keys())
    known_networks = existing_aws_subnets_data["cidr_networks"]

    # Names and networks already accepted from this batch.
    batch_names = set()
    batch_networks = []

    for position, candidate in enumerate(proposed_subnets_data):
        subnet_name = candidate.get("name")
        cidr_block_str = candidate.get("cidr")
        availability_zone = candidate.get("az")

        if not (subnet_name and cidr_block_str and availability_zone):
            raise ValueError(
                f"Proposed subnet at index {position} is incomplete. Requires 'name', 'cidr', and 'az'."
            )

        # 1. The name must be unique within the batch.
        if subnet_name in batch_names:
            raise ValueError(
                f"Proposed subnet name '{subnet_name}' is duplicated within the input list."
            )
        batch_names.add(subnet_name)

        # 2. A name clash with an existing subnet is informational only.
        if subnet_name in known_names:
            print(
                f"Proposed subnet name '{subnet_name}' already exists in VPC '{vpc_id}'."
            )

        try:
            candidate_net = ipaddress.ip_network(cidr_block_str, strict=False)
        except ValueError as e:
            raise ValueError(
                f"Invalid CIDR format '{cidr_block_str}' for proposed subnet '{subnet_name}': {e}"
            )

        # 3. No overlap with networks accepted earlier in this batch.
        batch_clash = next(
            (net for net in batch_networks if candidate_net.overlaps(net)), None
        )
        if batch_clash is not None:
            raise ValueError(
                f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
                f"overlaps with another proposed CIDR '{str(batch_clash)}' "
                f"within the same batch."
            )

        # 4. No overlap with networks that already exist in AWS.
        aws_clash = next(
            (net for net in known_networks if candidate_net.overlaps(net)), None
        )
        if aws_clash is not None:
            raise ValueError(
                f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
                f"overlaps with an existing AWS subnet CIDR '{str(aws_clash)}' "
                f"in VPC '{vpc_id}'."
            )

        # Accepted: later candidates are checked against this network too.
        batch_networks.append(candidate_net)
        print(
            f"Validation successful for proposed subnet '{subnet_name}' with CIDR '{cidr_block_str}'."
        )

    print(
        f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---"
    )
+# --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
def check_subnet_exists_by_name(
    subnet_name: str, existing_aws_subnets_data: Dict[str, Any]
) -> Tuple[bool, Optional[str]]:
    """
    Look a subnet up by name in already-fetched subnet data (no AWS call).

    Args:
        subnet_name: The 'Name' tag value of the subnet to check.
        existing_aws_subnets_data: Dictionary whose 'by_name' entry maps
                                   subnet names to info dicts holding an 'id'.
    Returns:
        ``(True, subnet_id)`` when the name is present, ``(False, None)``
        otherwise.
    """
    match = existing_aws_subnets_data["by_name"].get(subnet_name)
    if not match:
        print(f"Subnet '{subnet_name}' not found.")
        return False, None

    print(f"Subnet '{subnet_name}' found with ID: {match['id']}")
    return True, match["id"]
def create_nat_gateway(
    scope: Construct,
    public_subnet_for_nat: ec2.ISubnet,  # Expects a proper ISubnet
    nat_gateway_name: str,
    nat_gateway_id_context_key: str,
) -> str:
    """
    Define one NAT Gateway, with its own Elastic IP, in the given public subnet.

    No context lookup happens here; the calling stack is expected to decide
    whether a gateway needs to be created. A stack output is also defined so
    the gateway ID can be read after deployment.

    Args:
        scope: Construct in which the EIP, NAT Gateway and output are defined.
               Its ``stack_name`` is used for the export name.
        public_subnet_for_nat: Subnet that hosts the NAT Gateway.
        nat_gateway_name: Value of the gateway's Name tag; also the basis of
                          its logical ID (hyphens removed).
        nat_gateway_id_context_key: Context key mentioned in the output's
                                    description.
    Returns:
        The CloudFormation Ref of the NAT Gateway.
    """
    target_subnet_id = public_subnet_for_nat.subnet_id
    print(
        f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{target_subnet_id}'."
    )

    # Elastic IP that the gateway will use.
    elastic_ip = ec2.CfnEIP(
        scope,
        NAT_GATEWAY_EIP_NAME,
        tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)],
    )

    logical_id = nat_gateway_name.replace("-", "") + "NatGateway"
    gateway = ec2.CfnNatGateway(
        scope,
        logical_id,
        subnet_id=target_subnet_id,
        allocation_id=elastic_ip.attr_allocation_id,
        tags=[CfnTag(key="Name", value=nat_gateway_name)],
    )
    # Explicit dependency on the EIP; the subnet dependency follows from subnet_id.
    gateway.add_dependency(elastic_ip)

    gateway_ref = gateway.ref

    # Publish the ID as a stack output so it can be copied into cdk.context.json.
    CfnOutput(
        scope,
        "SingleNatGatewayIdOutput",
        value=gateway_ref,
        description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
        export_name=f"{scope.stack_name}-NatGatewayId",
    )
    print(
        f"CDK: Defined new NAT Gateway '{gateway_ref}'. Its physical ID will be available in the stack outputs after deployment."
    )
    # Tokenised reference, usable within this synthesis.
    return gateway_ref
+def create_subnets(
+    scope: Construct,
+    vpc: ec2.IVpc,
+    prefix: str,
+    subnet_names: List[str],
+    cidr_blocks: List[str],
+    availability_zones: List[str],
+    is_public: bool,
+    internet_gateway_id: Optional[str] = None,
+    single_nat_gateway_id: Optional[str] = None,
+) -> Tuple[List[ec2.CfnSubnet], List[ec2.CfnRouteTable]]:
+    """
+    Creates subnets using L2 constructs but returns the underlying L1 Cfn objects
+    for backward compatibility.
+    """
+    # --- Validations remain the same ---
+    if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
+        raise ValueError(
+            "Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length."
+        )
+    if is_public and not internet_gateway_id:
+        raise ValueError("internet_gateway_id must be provided for public subnets.")
+    if not is_public and not single_nat_gateway_id:
+        raise ValueError(
+            "single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway."
+        )
+    # --- We will populate these lists with the L1 objects to return ---
+    created_subnets: List[ec2.CfnSubnet] = []
+    created_route_tables: List[ec2.CfnRouteTable] = []
+    subnet_type_tag = "public" if is_public else "private"
+    for i, subnet_name in enumerate(subnet_names):
+        logical_id = f"{prefix}{subnet_type_tag.capitalize()}Subnet{i+1}"
+        # 1. Create the L2 Subnet (this is the easy part)
+        subnet = ec2.Subnet(
+            scope,
+            logical_id,
+            vpc_id=vpc.vpc_id,
+            cidr_block=cidr_blocks[i],
+            availability_zone=availability_zones[i],
+            map_public_ip_on_launch=is_public,
+        )
+        Tags.of(subnet).add("Name", subnet_name)
+        Tags.of(subnet).add("Type", subnet_type_tag)
+        if is_public:
+            # The subnet's route_table is automatically created by the L2 Subnet construct
+            try:
+                subnet.add_route(
+                    "DefaultInternetRoute",  # A logical ID for the CfnRoute resource
+                    router_id=internet_gateway_id,
+                    router_type=ec2.RouterType.GATEWAY,
+                    # destination_cidr_block="0.0.0.0/0" is the default for this method
+                )
+            except Exception as e:
+                print("Could not create IGW route for public subnet due to:", e)
+            print(f"CDK: Defined public L2 subnet '{subnet_name}' and added IGW route.")
+        else:
+            try:
+                # Using .add_route() for private subnets as well for consistency
+                subnet.add_route(
+                    "DefaultNatRoute",  # A logical ID for the CfnRoute resource
+                    router_id=single_nat_gateway_id,
+                    router_type=ec2.RouterType.NAT_GATEWAY,
+                )
+            except Exception as e:
+                print("Could not create NAT gateway route for public subnet due to:", e)
+            print(
+                f"CDK: Defined private L2 subnet '{subnet_name}' and added NAT GW route."
+            )
+        route_table = subnet.route_table
+        created_subnets.append(subnet)
+        created_route_tables.append(route_table)
+    return created_subnets, created_route_tables
+def ingress_rule_exists(security_group: str, peer: str, port: str):
+    for rule in security_group.connections.security_groups:
+        if port:
+            if rule.peer == peer and rule.connection == port:
+                return True
+        else:
+            if rule.peer == peer:
+                return True
+    return False
+def check_for_existing_user_pool(user_pool_name: str):
+    cognito_client = boto3.client("cognito-idp")
+    list_pools_response = cognito_client.list_user_pools(
+        MaxResults=60
+    )  # MaxResults up to 60
+    # ListUserPools might require pagination if you have more than 60 pools
+    # This simple example doesn't handle pagination, which could miss your pool
+    existing_user_pool_id = ""
+    for pool in list_pools_response.get("UserPools", []):
+        if pool.get("Name") == user_pool_name:
+            existing_user_pool_id = pool["Id"]
+            print(
+                f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}"
+            )
+            break  # Found the one we're looking for
+    if existing_user_pool_id:
+        return True, existing_user_pool_id, pool
+    else:
+        return False, "", ""
+def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
+    """
+    Checks if a Cognito User Pool Client with the given name exists in the specified User Pool.
+    Args:
+        user_pool_id: The ID of the Cognito User Pool.
+        user_pool_client_name: The name of the User Pool Client to check for.
+    Returns:
+        A tuple:
+        - True, client_id, client_details if the client exists.
+        - False, "", {} otherwise.
+    """
+    cognito_client = boto3.client("cognito-idp")
+    next_token = "string"
+    while True:
+        try:
+            response = cognito_client.list_user_pool_clients(
+                UserPoolId=user_pool_id, MaxResults=60, NextToken=next_token
+            )
+        except cognito_client.exceptions.ResourceNotFoundException:
+            print(f"Error: User pool with ID '{user_pool_id}' not found.")
+            return False, "", {}
+        except cognito_client.exceptions.InvalidParameterException:
+            print(f"Error: No app clients for '{user_pool_id}' found.")
+            return False, "", {}
+        except Exception as e:
+            print("Could not check User Pool clients due to:", e)
+        for client in response.get("UserPoolClients", []):
+            if client.get("ClientName") == user_pool_client_name:
+                print(
+                    f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}"
+                )
+                return True, client["ClientId"], client
+        next_token = response.get("NextToken")
+        if not next_token:
+            break
+    return False, "", {}
+def check_for_secret(secret_name: str, secret_value: dict = ""):
+    """
+    Checks if a Secrets Manager secret with the given name exists.
+    If it doesn't exist, it creates the secret.
+    Args:
+        secret_name: The name of the Secrets Manager secret.
+        secret_value: A dictionary containing the key-value pairs for the secret.
+    Returns:
+        True if the secret existed or was created, False otherwise (due to other errors).
+    """
+    secretsmanager_client = boto3.client("secretsmanager")
+    try:
+        # Try to get the secret. If it doesn't exist, a ResourceNotFoundException will be raised.
+        secret_value = secretsmanager_client.get_secret_value(SecretId=secret_name)
+        print("Secret already exists.")
+        return True, secret_value
+    except secretsmanager_client.exceptions.ResourceNotFoundException:
+        print("Secret not found")
+        return False, {}
+    except Exception as e:
+        # Handle other potential exceptions during the get operation
+        print(f"Error checking for secret: {e}")
+        return False, {}
+def get_security_group_id_by_name(
+    group_name: str,
+    vpc_id: str,
+    region_name: str = AWS_REGION,
+) -> Tuple[bool, str]:
+    """Look up a security group ID by name within a VPC."""
+    if not group_name or not vpc_id:
+        return False, ""
+    try:
+        ec2_client = boto3.client("ec2", region_name=region_name)
+        response = ec2_client.describe_security_groups(
+            Filters=[
+                {"Name": "group-name", "Values": [group_name]},
+                {"Name": "vpc-id", "Values": [vpc_id]},
+            ]
+        )
+        groups = response.get("SecurityGroups") or []
+        if groups:
+            return True, groups[0]["GroupId"]
+        return False, ""
+    except ClientError as e:
+        print(f"Error looking up security group '{group_name}': {e}")
+        return False, ""
+def resolve_service_connect_client_security_group_ids(
+    explicit_ids: List[str],
+    security_group_names: List[str],
+    get_context_str,
+) -> List[str]:
+    """
+    Merge explicit sg- IDs with IDs resolved from pre-check context (security_group_id:{name}).
+    """
+    resolved: List[str] = []
+    for sg_id in explicit_ids:
+        if not sg_id.startswith("sg-"):
+            raise ValueError(
+                f"ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS entry '{sg_id}' "
+                "must be a security group ID (sg-...)."
+            )
+        if sg_id not in resolved:
+            resolved.append(sg_id)
+    missing_names: List[str] = []
+    for sg_name in security_group_names:
+        sg_id = get_context_str(f"security_group_id:{sg_name}")
+        if sg_id:
+            if sg_id not in resolved:
+                resolved.append(sg_id)
+        else:
+            missing_names.append(sg_name)
+    if missing_names:
+        raise ValueError(
+            "Could not resolve Service Connect client security group(s) in VPC "
+            f"{get_context_str('vpc_id') or '(unknown)'}: "
+            + ", ".join(missing_names)
+            + ". Set ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS, fix "
+            "ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES / "
+            "ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES, and re-run check_resources.py."
+        )
+    return resolved
+def check_alb_exists(
+    load_balancer_name: str, region_name: str = None
+) -> tuple[bool, dict]:
+    """
+    Checks if an Application Load Balancer (ALB) with the given name exists.
+    Args:
+        load_balancer_name: The name of the ALB to check.
+        region_name: The AWS region to check in.  If None, uses the default
+                     session region.
+    Returns:
+        A tuple:
+        - The first element is True if the ALB exists, False otherwise.
+        - The second element is the ALB object (dictionary) if found,
+          None otherwise.  Specifically, it returns the first element of
+          the LoadBalancers list from the describe_load_balancers response.
+    """
+    if region_name:
+        elbv2_client = boto3.client("elbv2", region_name=region_name)
+    else:
+        elbv2_client = boto3.client("elbv2")
+    try:
+        response = elbv2_client.describe_load_balancers(Names=[load_balancer_name])
+        if response["LoadBalancers"]:
+            return (
+                True,
+                response["LoadBalancers"][0],
+            )  # Return True and the first ALB object
+        else:
+            return False, {}
+    except ClientError as e:
+        #  If the error indicates the ALB doesn't exist, return False
+        if e.response["Error"]["Code"] == "LoadBalancerNotFound":
+            return False, {}
+        else:
+            # Re-raise other exceptions
+            raise
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return False, {}
+def check_fargate_task_definition_exists(
+    task_definition_name: str, region_name: str = None
+) -> tuple[bool, dict]:
+    """
+    Checks if a Fargate task definition with the given name exists.
+    Args:
+        task_definition_name: The name or ARN of the task definition to check.
+        region_name: The AWS region to check in. If None, uses the default
+                     session region.
+    Returns:
+        A tuple:
+        - The first element is True if the task definition exists, False otherwise.
+        - The second element is the task definition object (dictionary) if found,
+          None otherwise.  Specifically, it returns the first element of the
+          taskDefinitions list from the describe_task_definition response.
+    """
+    if region_name:
+        ecs_client = boto3.client("ecs", region_name=region_name)
+    else:
+        ecs_client = boto3.client("ecs")
+    try:
+        response = ecs_client.describe_task_definition(
+            taskDefinition=task_definition_name
+        )
+        # If describe_task_definition succeeds, it returns the task definition.
+        # We can directly return True and the task definition.
+        return True, response["taskDefinition"]
+    except ClientError as e:
+        # Check for the error code indicating the task definition doesn't exist.
+        if (
+            e.response["Error"]["Code"] == "ClientException"
+            and "Task definition" in e.response["Message"]
+            and "does not exist" in e.response["Message"]
+        ):
+            return False, {}
+        else:
+            # Re-raise other exceptions.
+            raise
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return False, {}
+def check_ecs_service_exists(
+    cluster_name: str, service_name: str, region_name: str = None
+) -> tuple[bool, dict]:
+    """
+    Checks if an ECS service with the given name exists in the specified cluster.
+    Args:
+        cluster_name: The name or ARN of the ECS cluster.
+        service_name: The name of the ECS service to check.
+        region_name: The AWS region to check in. If None, uses the default
+                     session region.
+    Returns:
+        A tuple:
+        - The first element is True if the service exists, False otherwise.
+        - The second element is the service object (dictionary) if found,
+          None otherwise.
+    """
+    if region_name:
+        ecs_client = boto3.client("ecs", region_name=region_name)
+    else:
+        ecs_client = boto3.client("ecs")
+    try:
+        response = ecs_client.describe_services(
+            cluster=cluster_name, services=[service_name]
+        )
+        if response["services"]:
+            return (
+                True,
+                response["services"][0],
+            )  # Return True and the first service object
+        else:
+            return False, {}
+    except ClientError as e:
+        # Check for the error code indicating the service doesn't exist.
+        if e.response["Error"]["Code"] == "ClusterNotFoundException":
+            return False, {}
+        elif e.response["Error"]["Code"] == "ServiceNotFoundException":
+            return False, {}
+        else:
+            # Re-raise other exceptions.
+            raise
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return False, {}
+def check_cloudfront_distribution_exists(
+    distribution_name: str, region_name: str = None
+) -> tuple[bool, dict | None]:
+    """
+    Checks if a CloudFront distribution with the given name exists.
+    Args:
+        distribution_name: The name of the CloudFront distribution to check.
+        region_name: The AWS region to check in. If None, uses the default
+                     session region.  Note: CloudFront is a global service,
+                     so the region is usually 'us-east-1', but this parameter
+                     is included for completeness.
+    Returns:
+        A tuple:
+        - The first element is True if the distribution exists, False otherwise.
+        - The second element is the distribution object (dictionary) if found,
+          None otherwise.  Specifically, it returns the first element of the
+          DistributionList from the ListDistributions response.
+    """
+    if region_name:
+        cf_client = boto3.client("cloudfront", region_name=region_name)
+    else:
+        cf_client = boto3.client("cloudfront")
+    try:
+        response = cf_client.list_distributions()
+        if "Items" in response["DistributionList"]:
+            for distribution in response["DistributionList"]["Items"]:
+                # CloudFront doesn't directly filter by name, so we have to iterate.
+                if (
+                    distribution["AliasSet"]["Items"]
+                    and distribution["AliasSet"]["Items"][0] == distribution_name
+                ):
+                    return True, distribution
+            return False, None
+        else:
+            return False, None
+    except ClientError as e:
+        #  If the error indicates the Distribution doesn't exist, return False
+        if e.response["Error"]["Code"] == "NoSuchDistribution":
+            return False, None
+        else:
+            # Re-raise other exceptions
+            raise
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return False, None
+def create_web_acl_with_common_rules(
+    scope: Construct, web_acl_name: str, waf_scope: str = "CLOUDFRONT"
+):
+    """
+    Use CDK to create a web ACL based on an AWS common rule set with overrides.
+    This function now expects a 'scope' argument, typically 'self' from your stack,
+    as CfnWebACL requires a construct scope.
+    """
+    # Create full list of rules
+    rules = []
+    aws_ruleset_names = [
+        "AWSManagedRulesCommonRuleSet",
+        "AWSManagedRulesKnownBadInputsRuleSet",
+        "AWSManagedRulesAmazonIpReputationList",
+    ]
+    # Use a separate counter to assign unique priorities sequentially
+    priority_counter = 1
+    for aws_rule_name in aws_ruleset_names:
+        current_rule_action_overrides = None
+        # All managed rule groups need an override_action.
+        # 'none' means use the managed rule group's default action.
+        current_override_action = wafv2.CfnWebACL.OverrideActionProperty(none={})
+        current_priority = priority_counter
+        priority_counter += 1
+        if aws_rule_name == "AWSManagedRulesCommonRuleSet":
+            current_rule_action_overrides = [
+                wafv2.CfnWebACL.RuleActionOverrideProperty(
+                    name="SizeRestrictions_BODY",
+                    action_to_use=wafv2.CfnWebACL.RuleActionProperty(allow={}),
+                )
+            ]
+            # No need to set current_override_action here, it's already set above.
+            # If you wanted this specific rule to have a *fixed* priority, you'd handle it differently
+            # For now, it will get priority 1 from the counter.
+        rule_property = wafv2.CfnWebACL.RuleProperty(
+            name=aws_rule_name,
+            priority=current_priority,
+            statement=wafv2.CfnWebACL.StatementProperty(
+                managed_rule_group_statement=wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
+                    vendor_name="AWS",
+                    name=aws_rule_name,
+                    rule_action_overrides=current_rule_action_overrides,
+                )
+            ),
+            visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
+                cloud_watch_metrics_enabled=True,
+                metric_name=aws_rule_name,
+                sampled_requests_enabled=True,
+            ),
+            override_action=current_override_action,  # THIS IS THE CRUCIAL PART FOR ALL MANAGED RULES
+        )
+        rules.append(rule_property)
+    # Add the rate limit rule
+    rate_limit_priority = priority_counter  # Use the next available priority
+    rules.append(
+        wafv2.CfnWebACL.RuleProperty(
+            name="RateLimitRule",
+            priority=rate_limit_priority,
+            statement=wafv2.CfnWebACL.StatementProperty(
+                rate_based_statement=wafv2.CfnWebACL.RateBasedStatementProperty(
+                    limit=1000, aggregate_key_type="IP"
+                )
+            ),
+            visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
+                cloud_watch_metrics_enabled=True,
+                metric_name="RateLimitRule",
+                sampled_requests_enabled=True,
+            ),
+            action=wafv2.CfnWebACL.RuleActionProperty(block={}),
+        )
+    )
+    web_acl = wafv2.CfnWebACL(
+        scope,
+        "WebACL",
+        name=web_acl_name,
+        default_action=wafv2.CfnWebACL.DefaultActionProperty(allow={}),
+        scope=waf_scope,
+        visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
+            cloud_watch_metrics_enabled=True,
+            metric_name="webACL",
+            sampled_requests_enabled=True,
+        ),
+        rules=rules,
+    )
+    CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)
+    return web_acl
+def check_web_acl_exists(
+    web_acl_name: str, scope: str, region_name: str = None
+) -> tuple[bool, dict]:
+    """
+    Checks if a Web ACL with the given name and scope exists.
+    Args:
+        web_acl_name: The name of the Web ACL to check.
+        scope: The scope of the Web ACL ('CLOUDFRONT' or 'REGIONAL').
+        region_name: The AWS region to check in. Required for REGIONAL scope.
+                     If None, uses the default session region.  For CLOUDFRONT,
+                     the region should be 'us-east-1'.
+    Returns:
+        A tuple:
+        - The first element is True if the Web ACL exists, False otherwise.
+        - The second element is the Web ACL object (dictionary) if found,
+          None otherwise.
+    """
+    if scope not in ["CLOUDFRONT", "REGIONAL"]:
+        raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")
+    if scope == "REGIONAL" and not region_name:
+        raise ValueError("Region name is required for REGIONAL scope")
+    if scope == "CLOUDFRONT":
+        region_name = "us-east-1"  # CloudFront scope requires us-east-1
+    if region_name:
+        waf_client = boto3.client("wafv2", region_name=region_name)
+    else:
+        waf_client = boto3.client("wafv2")
+    try:
+        response = waf_client.list_web_acls(Scope=scope)
+        if "WebACLs" in response:
+            for web_acl in response["WebACLs"]:
+                if web_acl["Name"] == web_acl_name:
+                    # Describe the Web ACL to get the full object.
+                    describe_response = waf_client.describe_web_acl(
+                        Name=web_acl_name, Scope=scope
+                    )
+                    return True, describe_response["WebACL"]
+            return False, {}
+        else:
+            return False, {}
+    except ClientError as e:
+        # Check for the error code indicating the web ACL doesn't exist.
+        if e.response["Error"]["Code"] == "ResourceNotFoundException":
+            return False, {}
+        else:
+            # Re-raise other exceptions.
+            raise
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return False, {}
+def add_alb_https_listener_with_cert(
+    scope: Construct,
+    logical_id: str,  # A unique ID for this listener construct
+    alb: elb.ApplicationLoadBalancer,
+    acm_certificate_arn: Optional[
+        str
+    ],  # Optional: If None, no HTTPS listener will be created
+    default_target_group: elb.ITargetGroup,  # Mandatory: The target group to forward traffic to
+    listener_port_https: int = 443,
+    listener_open_to_internet: bool = False,  # Be cautious with True, ensure ALB security group restricts access
+    # --- Cognito Authentication Parameters ---
+    enable_cognito_auth: bool = False,
+    cognito_user_pool: Optional[cognito.IUserPool] = None,
+    cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
+    cognito_user_pool_domain: Optional[
+        str
+    ] = None,  # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
+    cognito_auth_scope: Optional[
+        str
+    ] = "openid profile email",  # Default recommended scope
+    cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
+    stickiness_cookie_duration=None,
+    # --- End Cognito Parameters ---
+) -> Optional[elb.ApplicationListener]:
+    """
+    Conditionally adds an HTTPS listener to an ALB with an ACM certificate,
+    and optionally enables Cognito User Pool authentication.
+    Args:
+        scope (Construct): The scope in which to define this construct (e.g., your CDK Stack).
+        logical_id (str): A unique logical ID for the listener construct within the stack.
+        alb (elb.ApplicationLoadBalancer): The Application Load Balancer to add the listener to.
+        acm_certificate_arn (Optional[str]): The ARN of the ACM certificate to attach.
+                                             If None, the HTTPS listener will NOT be created.
+        default_target_group (elb.ITargetGroup): The default target group for the listener to forward traffic to.
+                                                 This is mandatory for a functional listener.
+        listener_port_https (int): The HTTPS port to listen on (default: 443).
+        listener_open_to_internet (bool): Whether the listener should allow connections from all sources.
+                                          If False (recommended), ensure your ALB's security group allows
+                                          inbound traffic on this port from desired sources.
+        enable_cognito_auth (bool): Set to True to enable Cognito User Pool authentication.
+        cognito_user_pool (Optional[cognito.IUserPool]): The Cognito User Pool object. Required if enable_cognito_auth is True.
+        cognito_user_pool_client (Optional[cognito.IUserPoolClient]): The Cognito User Pool App Client object. Required if enable_cognito_auth is True.
+        cognito_user_pool_domain (Optional[str]): The domain prefix for your Cognito User Pool. Required if enable_cognito_auth is True.
+        cognito_auth_scope (Optional[str]): The scope for the Cognito authentication.
+        cognito_auth_on_unauthenticated_request (elb.UnauthenticatedAction): Action for unauthenticated requests.
+                                                                           Defaults to AUTHENTICATE (redirect to login).
+    Returns:
+        Optional[elb.ApplicationListener]: The created ApplicationListener if successful,
+                                           None if no ACM certificate ARN was provided.
+    """
+    https_listener = None
+    if acm_certificate_arn:
+        certificates_list = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
+        print(
+            f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}"
+        )
+        # Determine the default action based on whether Cognito auth is enabled
+        default_action = None
+        if enable_cognito_auth is True:
+            if not all(
+                [cognito_user_pool, cognito_user_pool_client, cognito_user_pool_domain]
+            ):
+                raise ValueError(
+                    "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
+                )
+            print(
+                f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}"
+            )
+            default_action = elb_act.AuthenticateCognitoAction(
+                next=elb.ListenerAction.forward(
+                    [default_target_group]
+                ),  # After successful auth, forward to TG
+                user_pool=cognito_user_pool,
+                user_pool_client=cognito_user_pool_client,
+                user_pool_domain=cognito_user_pool_domain,
+                scope=cognito_auth_scope,
+                on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
+                session_timeout=stickiness_cookie_duration,
+                # Additional options you might want to configure:
+                # session_cookie_name="AWSELBCookies"
+            )
+        else:
+            default_action = elb.ListenerAction.forward([default_target_group])
+            print("Cognito authentication is NOT enabled for this listener.")
+        # Add the HTTPS listener
+        https_listener = alb.add_listener(
+            logical_id,
+            port=listener_port_https,
+            open=listener_open_to_internet,
+            certificates=certificates_list,
+            default_action=default_action,  # Use the determined default action
+        )
+        print(f"ALB HTTPS listener on port {listener_port_https} defined.")
+    else:
+        print("ACM_CERTIFICATE_ARN is not provided. Skipping HTTPS listener creation.")
+    return https_listener
+def create_ecs_express_infrastructure_role(
+    scope: Construct,
+    logical_id: str,
+    role_name: str,
+) -> iam.Role:
+    """IAM role for ECS Express Mode to provision ALB, ACM cert, and autoscaling."""
+    role = iam.Role(
+        scope,
+        logical_id,
+        role_name=role_name,
+        assumed_by=iam.ServicePrincipal("ecs.amazonaws.com"),
+    )
+    role.add_managed_policy(
+        iam.ManagedPolicy.from_aws_managed_policy_name(
+            "AmazonECSInfrastructureRoleforExpressGatewayServices"
+        )
+    )
+    return role
+def _secret_value_from_arn(secret_arn: str, json_key: str) -> str:
+    return f"{secret_arn}:{json_key}::"
+# Environment variable names that reach the Express container through its
+# `secrets` list (see build_express_gateway_primary_container) rather than as
+# plain environment entries. load_app_config_env_for_express skips these names
+# by default so the values are not duplicated or leaked in plain text.
+_EXPRESS_SECRET_ENV_NAMES = frozenset(
+    {"AWS_USER_POOL_ID", "AWS_CLIENT_ID", "AWS_CLIENT_SECRET"}
+)
+def load_app_config_env_for_express(
+    config_env_path: str,
+    *,
+    exclude_names: Optional[FrozenSet[str]] = None,
+) -> List[ecs.CfnExpressGatewayService.KeyValuePairProperty]:
+    """
+    Load KEY=VALUE pairs from config/config.env for Express PrimaryContainer.environment.
+    Uses the same file written by create_basic_config_env() and uploaded to S3 on the
+    legacy Fargate path (environmentFiles).
+    """
+    exclude = exclude_names or _EXPRESS_SECRET_ENV_NAMES
+    path = os.path.abspath(config_env_path)
+    if not os.path.isfile(path):
+        print(
+            f"Warning: app config env file not found at {path}; "
+            "Express container will not receive app config environment variables."
+        )
+        return []
+    raw = dotenv_values(path)
+    environment: List[ecs.CfnExpressGatewayService.KeyValuePairProperty] = []
+    for name, value in sorted(raw.items()):
+        if not name or value is None or name in exclude:
+            continue
+        environment.append(
+            ecs.CfnExpressGatewayService.KeyValuePairProperty(
+                name=name,
+                value=str(value),
+            )
+        )
+    print(
+        f"Loaded {len(environment)} environment variables from {path} for ECS Express Mode."
+    )
+    return environment
+def build_express_gateway_primary_container(
+    *,
+    image_uri: str,
+    container_port: int,
+    log_group_name: str,
+    aws_region: str,
+    secret: secretsmanager.ISecret,
+    environment: Optional[
+        List[ecs.CfnExpressGatewayService.KeyValuePairProperty]
+    ] = None,
+) -> ecs.CfnExpressGatewayService.ExpressGatewayContainerProperty:
+    secret_arn = secret.secret_arn
+    return ecs.CfnExpressGatewayService.ExpressGatewayContainerProperty(
+        image=image_uri,
+        container_port=container_port,
+        aws_logs_configuration=ecs.CfnExpressGatewayService.ExpressGatewayServiceAwsLogsConfigurationProperty(
+            log_group_name=log_group_name,
+            log_stream_prefix="ecs",
+            region=aws_region,
+        ),
+        environment=environment or None,
+        secrets=[
+            ecs.CfnExpressGatewayService.SecretProperty(
+                name="AWS_USER_POOL_ID",
+                value_from=_secret_value_from_arn(secret_arn, "REDACTION_USER_POOL_ID"),
+            ),
+            ecs.CfnExpressGatewayService.SecretProperty(
+                name="AWS_CLIENT_ID",
+                value_from=_secret_value_from_arn(secret_arn, "REDACTION_CLIENT_ID"),
+            ),
+            ecs.CfnExpressGatewayService.SecretProperty(
+                name="AWS_CLIENT_SECRET",
+                value_from=_secret_value_from_arn(
+                    secret_arn, "REDACTION_CLIENT_SECRET"
+                ),
+            ),
+        ],
+    )
+def create_express_gateway_service(
+    scope: Construct,
+    logical_id: str,
+    *,
+    service_name: str,
+    cluster_name: str,
+    execution_role_arn: str,
+    infrastructure_role_arn: str,
+    task_role_arn: str,
+    cpu: str,
+    memory: str,
+    health_check_path: str,
+    primary_container: ecs.CfnExpressGatewayService.ExpressGatewayContainerProperty,
+    subnet_ids: List[str],
+    security_group_ids: List[str],
+) -> ecs.CfnExpressGatewayService:
+    network = None
+    if subnet_ids or security_group_ids:
+        network = ecs.CfnExpressGatewayService.ExpressGatewayServiceNetworkConfigurationProperty(
+            subnets=subnet_ids or None,
+            security_groups=security_group_ids or None,
+        )
+    express_service = ecs.CfnExpressGatewayService(
+        scope,
+        logical_id,
+        service_name=service_name,
+        cluster=cluster_name,
+        execution_role_arn=execution_role_arn,
+        infrastructure_role_arn=infrastructure_role_arn,
+        task_role_arn=task_role_arn,
+        cpu=cpu,
+        memory=memory,
+        health_check_path=health_check_path,
+        primary_container=primary_container,
+        network_configuration=network,
+    )
+    return express_service
+def _forward_target_group_action(
+    target_group_arn: str,
+    stickiness_seconds: int,
+) -> Dict[str, Any]:
+    action: Dict[str, Any] = {
+        "Type": "forward",
+        "Order": 2,
+        "ForwardConfig": {
+            "TargetGroups": [{"TargetGroupArn": target_group_arn}],
+        },
+    }
+    if stickiness_seconds > 0:
+        action["ForwardConfig"]["TargetGroupStickinessConfig"] = {
+            "Enabled": True,
+            "DurationSeconds": stickiness_seconds,
+        }
+    return action
+def build_cognito_default_listener_actions(
+    *,
+    user_pool_arn: str,
+    user_pool_client_id: str,
+    user_pool_domain_prefix: str,
+    target_group_arn: str,
+    stickiness_seconds: int = 28800,
+    scope: str = "openid email profile",
+) -> List[Dict[str, Any]]:
+    """Default actions for ELBv2 ModifyListener (authenticate-cognito + forward)."""
+    return [
+        {
+            "Type": "authenticate-cognito",
+            "Order": 1,
+            "AuthenticateCognitoConfig": {
+                "UserPoolArn": user_pool_arn,
+                "UserPoolClientId": user_pool_client_id,
+                "UserPoolDomain": user_pool_domain_prefix,
+                "Scope": scope,
+                "OnUnauthenticatedRequest": "authenticate",
+                "SessionTimeout": stickiness_seconds,
+            },
+        },
+        _forward_target_group_action(target_group_arn, stickiness_seconds),
+    ]
+def configure_express_listener_cognito_and_cloudfront(
+    scope: Construct,
+    logical_id_prefix: str,
+    *,
+    express_service: ecs.CfnExpressGatewayService,
+    user_pool_arn: str,
+    user_pool_client_id: str,
+    user_pool_domain_prefix: str,
+    use_cloudfront: bool,
+    cloudfront_host_header: str,
+    stickiness_seconds: int = 28800,
+) -> None:
+    """
+    Attach Cognito auth to the Express-managed HTTPS listener and optionally add a
+    CloudFront host-header rule (same pattern as the legacy HTTP listener path).
+    """
+    listener_arn = express_service.get_att(
+        "ECSManagedResourceArns.IngressPath.ListenerArn"
+    ).to_string()
+    target_group_arn = Fn.select(
+        0,
+        express_service.get_att("ECSManagedResourceArns.IngressPath.TargetGroupArns"),
+    )
+    default_actions = build_cognito_default_listener_actions(
+        user_pool_arn=user_pool_arn,
+        user_pool_client_id=user_pool_client_id,
+        user_pool_domain_prefix=user_pool_domain_prefix,
+        target_group_arn=target_group_arn,
+        stickiness_seconds=stickiness_seconds,
+    )
+    modify_listener = cr.AwsCustomResource(
+        scope,
+        f"{logical_id_prefix}ModifyExpressListener",
+        on_create=cr.AwsSdkCall(
+            service="ELBv2",
+            action="modifyListener",
+            parameters={
+                "ListenerArn": listener_arn,
+                "DefaultActions": default_actions,
+            },
+            physical_resource_id=cr.PhysicalResourceId.of(
+                f"express-listener-cognito-{logical_id_prefix}"
+            ),
+        ),
+        on_update=cr.AwsSdkCall(
+            service="ELBv2",
+            action="modifyListener",
+            parameters={
+                "ListenerArn": listener_arn,
+                "DefaultActions": default_actions,
+            },
+            physical_resource_id=cr.PhysicalResourceId.of(
+                f"express-listener-cognito-{logical_id_prefix}"
+            ),
+        ),
+        policy=cr.AwsCustomResourcePolicy.from_sdk_calls(
+            resources=cr.AwsCustomResourcePolicy.ANY_RESOURCE
+        ),
+    )
+    modify_listener.node.add_dependency(express_service)
+    if use_cloudfront and cloudfront_host_header:
+        forward_only = [
+            {
+                "Type": "forward",
+                "Order": 1,
+                "ForwardConfig": {
+                    "TargetGroups": [{"TargetGroupArn": target_group_arn}],
+                    "TargetGroupStickinessConfig": {
+                        "Enabled": True,
+                        "DurationSeconds": stickiness_seconds,
+                    },
+                },
+            }
+        ]
+        cf_rule = cr.AwsCustomResource(
+            scope,
+            f"{logical_id_prefix}ExpressCloudFrontHostRule",
+            on_create=cr.AwsSdkCall(
+                service="ELBv2",
+                action="createRule",
+                parameters={
+                    "ListenerArn": listener_arn,
+                    "Priority": 1,
+                    "Conditions": [
+                        {
+                            "Field": "host-header",
+                            "HostHeaderConfig": {"Values": [cloudfront_host_header]},
+                        }
+                    ],
+                    "Actions": forward_only,
+                },
+                physical_resource_id=cr.PhysicalResourceId.from_response(
+                    "Rules[0].RuleArn"
+                ),
+            ),
+            on_delete=cr.AwsSdkCall(
+                service="ELBv2",
+                action="deleteRule",
+                parameters={"RuleArn": cr.PhysicalResourceId.reference()},
+            ),
+            policy=cr.AwsCustomResourcePolicy.from_sdk_calls(
+                resources=cr.AwsCustomResourcePolicy.ANY_RESOURCE
+            ),
+        )
+        cf_rule.node.add_dependency(modify_listener)
+def allow_express_load_balancer_to_ecs_security_group(
+    scope: Construct,
+    logical_id: str,
+    *,
+    express_service: ecs.CfnExpressGatewayService,
+    ecs_security_group: ec2.ISecurityGroup,
+    container_port: int,
+) -> None:
+    """Allow traffic from the Express-managed ALB security group to the task SG."""
+    lb_sg_arn = Fn.select(
+        0,
+        express_service.get_att(
+            "ECSManagedResourceArns.IngressPath.LoadBalancerSecurityGroups"
+        ),
+    )
+    ec2.CfnSecurityGroupIngress(
+        scope,
+        logical_id,
+        group_id=ecs_security_group.security_group_id,
+        ip_protocol="tcp",
+        from_port=container_port,
+        to_port=container_port,
+        source_security_group_id=lb_sg_arn,
+        description="Express Mode ALB to ECS tasks",
+    )
+def create_s3_batch_ecs_trigger_lambda(
+    scope: Construct,
+    logical_id: str,
+    *,
+    function_name: Optional[str],
+    lambda_asset_path: str,
+    output_bucket: s3.IBucket,
+    config_bucket: s3.IBucket,
+    cluster_name: str,
+    task_definition_arn: str,
+    container_name: str,
+    subnet_ids: List[str],
+    security_group_id: str,
+    execution_role: iam.IRole,
+    task_role: iam.IRole,
+    env_prefix: str,
+    env_suffix: str,
+    input_prefix: str,
+    config_prefix: str,
+    default_params_key: str,
+    default_direct_mode_task: str = "redact",
+) -> lambda_.Function:
+    """
+    Lambda triggered by job .env uploads on the output bucket; runs one-shot Fargate tasks.
+    """
+    lambda_role = iam.Role(
+        scope,
+        f"{logical_id}Role",
+        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
+        managed_policies=[
+            iam.ManagedPolicy.from_aws_managed_policy_name(
+                "service-role/AWSLambdaBasicExecutionRole"
+            )
+        ],
+    )
+    lambda_role.add_to_policy(
+        iam.PolicyStatement(
+            actions=["ecs:RunTask"],
+            resources=[task_definition_arn],
+        )
+    )
+    lambda_role.add_to_policy(
+        iam.PolicyStatement(
+            actions=["ecs:RunTask"],
+            resources=[
+                f"arn:aws:ecs:*:*:cluster/{cluster_name}",
+            ],
+        )
+    )
+    lambda_role.add_to_policy(
+        iam.PolicyStatement(
+            actions=["iam:PassRole"],
+            resources=[execution_role.role_arn, task_role.role_arn],
+            conditions={
+                "StringEquals": {"iam:PassedToService": "ecs-tasks.amazonaws.com"}
+            },
+        )
+    )
+    output_bucket.grant_read(lambda_role, f"{env_prefix}*")
+    config_bucket.grant_read(lambda_role)
+    if default_params_key:
+        output_bucket.grant_read(lambda_role, default_params_key)
+    fn_kwargs: Dict[str, Any] = {
+        "runtime": lambda_.Runtime.PYTHON_3_12,
+        "handler": "lambda_function.lambda_handler",
+        "code": lambda_.Code.from_asset(lambda_asset_path),
+        "role": lambda_role,
+        "timeout": Duration.seconds(60),
+        "memory_size": 256,
+        "environment": {
+            "OUTPUT_BUCKET": output_bucket.bucket_name,
+            "CONFIG_BUCKET": config_bucket.bucket_name,
+            "INPUT_PREFIX": input_prefix,
+            "CONFIG_PREFIX": config_prefix,
+            "ENV_PREFIX": env_prefix,
+            "ENV_SUFFIX": env_suffix,
+            "DEFAULT_PARAMS_KEY": default_params_key,
+            "ECS_CLUSTER": cluster_name,
+            "ECS_TASK_DEF": task_definition_arn,
+            "SUBNETS": ",".join(subnet_ids),
+            "SECURITY_GROUPS": security_group_id,
+            "CONTAINER_NAME": container_name,
+            "DEFAULT_DIRECT_MODE_TASK": default_direct_mode_task,
+        },
+    }
+    if function_name:
+        fn_kwargs["function_name"] = function_name
+    batch_fn = lambda_.Function(scope, logical_id, **fn_kwargs)
+    output_bucket.add_event_notification(
+        s3.EventType.OBJECT_CREATED,
+        s3n.LambdaDestination(batch_fn),
+        s3.NotificationKeyFilter(prefix=env_prefix, suffix=env_suffix),
+    )
+    return batch_fn
+def build_pi_agent_container_environment(
+    *,
+    service_connect_discovery_name: str,
+    main_app_port: Union[str, int],
+    pi_gradio_port: Union[str, int],
+) -> Dict[str, str]:
+    """Inline env for Pi agent tasks (overrides image defaults; SC URL for main app)."""
+    port = int(main_app_port)
+    pi_port = int(pi_gradio_port)
+    return {
+        "APP_TYPE": "pi",
+        "APP_CONFIG_PATH": "/workspace/doc_redaction/config/pi_agent.env",
+        "PI_DEPLOYMENT_PROFILE": "aws-ecs",
+        "PI_DEFAULT_PROVIDER": "amazon-bedrock",
+        "DOC_REDACTION_GRADIO_URL": f"http://{service_connect_discovery_name}:{port}",
+        "PI_GRADIO_PORT": str(pi_port),
+        "GRADIO_SERVER_PORT": str(pi_port),
+        "GRADIO_SERVER_NAME": "0.0.0.0",
+        "PI_WORKSPACE_DIR": "/home/user/app/workspace",
+        "PI_WORKDIR": "/workspace/doc_redaction",
+        "PI_UPLOAD_ROOT": "/tmp/gradio",
+        "PI_SESSION_DIR": "/tmp/pi-sessions",
+        "RUN_FASTAPI": "False",
+        "COGNITO_AUTH": "False",
+    }
+def create_pi_agent_ecs_resources(
+    scope: Construct,
+    logical_id_prefix: str,
+    *,
+    vpc: ec2.IVpc,
+    cluster: ecs.ICluster,
+    private_subnets: List[ec2.ISubnet],
+    pi_ecr_image_uri: str,
+    container_name: str,
+    task_role: iam.IRole,
+    execution_role: iam.IRole,
+    config_bucket: s3.IBucket,
+    pi_agent_env_s3_key: str,
+    service_name: str,
+    task_family: str,
+    security_group_name: str,
+    log_group_name: str,
+    cpu: int,
+    memory_mib: int,
+    pi_gradio_port: int,
+    service_connect_namespace: str,
+    service_connect_discovery_name: str,
+    main_app_port: int,
+    use_fargate_spot: str,
+) -> Tuple[ecs.FargateService, ec2.SecurityGroup, ecs.FargateTaskDefinition]:
+    """Second Fargate service for the Pi agent (joins Service Connect namespace as a client)."""
+    pi_security_group = ec2.SecurityGroup(
+        scope,
+        f"{logical_id_prefix}SecurityGroup",
+        vpc=vpc,
+        security_group_name=security_group_name,
+        description="Pi agent ECS tasks",
+    )
+    pi_log_group = logs.LogGroup(
+        scope,
+        f"{logical_id_prefix}LogGroup",
+        log_group_name=log_group_name,
+        retention=logs.RetentionDays.ONE_MONTH,
+        removal_policy=RemovalPolicy.DESTROY,
+    )
+    pi_volume = ecs.Volume(name="piEphemeralVolume")
+    pi_task_definition = ecs.FargateTaskDefinition(
+        scope,
+        f"{logical_id_prefix}TaskDefinition",
+        family=task_family,
+        cpu=cpu,
+        memory_limit_mib=memory_mib,
+        task_role=task_role,
+        execution_role=execution_role,
+        runtime_platform=ecs.RuntimePlatform(
+            cpu_architecture=ecs.CpuArchitecture.X86_64,
+            operating_system_family=ecs.OperatingSystemFamily.LINUX,
+        ),
+        ephemeral_storage_gib=21,
+        volumes=[pi_volume],
+    )
+    env_files: List[ecs.EnvironmentFile] = []
+    if pi_agent_env_s3_key:
+        env_files.append(
+            ecs.EnvironmentFile.from_bucket(config_bucket, pi_agent_env_s3_key)
+        )
+    pi_container = pi_task_definition.add_container(
+        container_name,
+        image=ecs.ContainerImage.from_registry(f"{pi_ecr_image_uri}:latest"),
+        logging=ecs.LogDriver.aws_logs(
+            stream_prefix="ecs-pi",
+            log_group=pi_log_group,
+        ),
+        environment_files=env_files if env_files else None,
+        environment=build_pi_agent_container_environment(
+            service_connect_discovery_name=service_connect_discovery_name,
+            main_app_port=main_app_port,
+            pi_gradio_port=pi_gradio_port,
+        ),
+        command=[
+            "bash",
+            "-c",
+            "python3 agent-redact/pi/pi_agent_config.py && "
+            "exec python3 agent-redact/pi/gradio_app.py",
+        ],
+        essential=True,
+    )
+    pi_container.add_mount_points(
+        ecs.MountPoint(
+            source_volume=pi_volume.name,
+            container_path="/home/user/app/workspace",
+            read_only=False,
+        ),
+        ecs.MountPoint(
+            source_volume=pi_volume.name,
+            container_path="/tmp/gradio",
+            read_only=False,
+        ),
+        ecs.MountPoint(
+            source_volume=pi_volume.name,
+            container_path="/tmp/pi-sessions",
+            read_only=False,
+        ),
+    )
+    pi_container.add_port_mappings(
+        ecs.PortMapping(
+            container_port=pi_gradio_port,
+            host_port=pi_gradio_port,
+            name=f"port-{pi_gradio_port}",
+            protocol=ecs.Protocol.TCP,
+            app_protocol=ecs.AppProtocol.http,
+        )
+    )
+    pi_service = ecs.FargateService(
+        scope,
+        f"{logical_id_prefix}Service",
+        service_name=service_name,
+        cluster=cluster,
+        task_definition=pi_task_definition,
+        security_groups=[pi_security_group],
+        vpc_subnets=ec2.SubnetSelection(subnets=private_subnets),
+        platform_version=ecs.FargatePlatformVersion.LATEST,
+        capacity_provider_strategies=[
+            ecs.CapacityProviderStrategy(
+                capacity_provider=use_fargate_spot,
+                base=0,
+                weight=1,
+            )
+        ],
+        min_healthy_percent=0,
+        max_healthy_percent=100,
+        desired_count=0,
+        service_connect_configuration=ecs.ServiceConnectProps(
+            namespace=service_connect_namespace,
+        ),
+    )
+    return pi_service, pi_security_group, pi_task_definition
+def attach_pi_agent_to_shared_alb(
+    scope: Construct,
+    logical_id_prefix: str,
+    *,
+    vpc: ec2.IVpc,
+    alb_security_group: ec2.ISecurityGroup,
+    pi_security_group: ec2.SecurityGroup,
+    pi_service: ecs.FargateService,
+    pi_port: int,
+    pi_host_header: str,
+    listener_rule_priority: int,
+    target_group_name: str,
+    stickiness_cookie_duration: Duration,
+    https_listener: Optional[elb.IApplicationListener],
+    http_listener: Optional[elb.IApplicationListener],
+    acm_certificate_arn: str,
+    enable_cognito_auth: bool,
+    cognito_user_pool: Optional[cognito.IUserPool],
+    cognito_user_pool_client: Optional[cognito.IUserPoolClient],
+    cognito_user_pool_domain: Optional[cognito.IUserPoolDomain],
+) -> elb.ApplicationTargetGroup:
+    """Register Pi on the shared legacy ALB (second target group + host-header rules)."""
+    pi_security_group.add_ingress_rule(
+        peer=alb_security_group,
+        connection=ec2.Port.tcp(pi_port),
+        description="Shared ALB to Pi agent",
+    )
+    pi_target_group = elb.ApplicationTargetGroup(
+        scope,
+        f"{logical_id_prefix}TargetGroup",
+        target_group_name=target_group_name,
+        port=pi_port,
+        protocol=elb.ApplicationProtocol.HTTP,
+        targets=[pi_service],
+        stickiness_cookie_duration=stickiness_cookie_duration,
+        vpc=vpc,
+        health_check=elb.HealthCheck(
+            path="/",
+            healthy_http_codes="200-399",
+        ),
+    )
+    if (
+        enable_cognito_auth
+        and acm_certificate_arn
+        and cognito_user_pool
+        and cognito_user_pool_client
+        and cognito_user_pool_domain
+        and https_listener
+    ):
+        forward_action = elb_act.AuthenticateCognitoAction(
+            next=elb.ListenerAction.forward(
+                [pi_target_group],
+                stickiness_duration=stickiness_cookie_duration,
+            ),
+            user_pool=cognito_user_pool,
+            user_pool_client=cognito_user_pool_client,
+            user_pool_domain=cognito_user_pool_domain,
+            scope="openid profile email",
+            on_unauthenticated_request=elb.UnauthenticatedAction.AUTHENTICATE,
+            session_timeout=stickiness_cookie_duration,
+        )
+    else:
+        forward_action = elb.ListenerAction.forward(
+            [pi_target_group],
+            stickiness_duration=stickiness_cookie_duration,
+        )
+    if https_listener:
+        https_listener.add_action(
+            f"{logical_id_prefix}HttpsHostRule",
+            priority=listener_rule_priority,
+            conditions=[elb.ListenerCondition.host_headers([pi_host_header])],
+            action=forward_action,
+        )
+    elif http_listener:
+        http_listener.add_action(
+            f"{logical_id_prefix}HttpHostRule",
+            priority=listener_rule_priority,
+            conditions=[elb.ListenerCondition.host_headers([pi_host_header])],
+            action=forward_action,
+        )
+    if http_listener and acm_certificate_arn:
+        http_listener.add_action(
+            f"{logical_id_prefix}HttpRedirectRule",
+            priority=listener_rule_priority,
+            conditions=[elb.ListenerCondition.host_headers([pi_host_header])],
+            action=elb.ListenerAction.redirect(
+                protocol="HTTPS",
+                port="443",
+                host="#{host}",
+                path="/#{path}",
+                query="#{query}",
+            ),
+        )
+    return pi_target_group
+def ensure_folder_exists(output_folder: str):
+    """Checks if the specified folder exists, creates it if not."""
+    if not os.path.exists(output_folder):
+        # Create the folder if it doesn't exist
+        os.makedirs(output_folder, exist_ok=True)
+        print(f"Created the {output_folder} folder.")
+    else:
+        print(f"The {output_folder} folder already exists.")
+def create_basic_config_env(
+    out_dir: str = "config",
+    S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME,
+    S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME,
+    ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME,
+    FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+    USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME,
+):
+    """
+    Create a basic config.env file for the user to use with their newly deployed redaction app.
+    """
+    variables = {
+        "COGNITO_AUTH": "True",
+        "RUN_AWS_FUNCTIONS": "True",
+        "DISPLAY_FILE_NAMES_IN_LOGS": "False",
+        "SESSION_OUTPUT_FOLDER": "True",
+        "SAVE_LOGS_TO_DYNAMODB": "True",
+        "SHOW_COSTS": "True",
+        "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS": "True",
+        "LOAD_PREVIOUS_TEXTRACT_JOBS_S3": "True",
+        "DOCUMENT_REDACTION_BUCKET": S3_LOG_CONFIG_BUCKET_NAME,
+        "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET": S3_OUTPUT_BUCKET_NAME,
+        "ACCESS_LOG_DYNAMODB_TABLE_NAME": ACCESS_LOG_DYNAMODB_TABLE_NAME,
+        "FEEDBACK_LOG_DYNAMODB_TABLE_NAME": FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
+        "USAGE_LOG_DYNAMODB_TABLE_NAME": USAGE_LOG_DYNAMODB_TABLE_NAME,
+    }
+    # Write variables to .env file
+    ensure_folder_exists(out_dir + "/")
+    env_file_path = os.path.abspath(os.path.join(out_dir, "config.env"))
+    # It's good practice to ensure the file exists before calling set_key repeatedly.
+    # set_key will create it, but for a loop, it might be cleaner to ensure it's empty/exists once.
+    if not os.path.exists(env_file_path):
+        with open(env_file_path, "w"):
+            pass  # Create empty file
+    for key, value in variables.items():
+        set_key(env_file_path, key, str(value), quote_mode="never")
+    return variables
+def start_codebuild_build(PROJECT_NAME: str, AWS_REGION: str = AWS_REGION):
+    """
+    Start an existing Codebuild project build
+    """
+    # --- Initialize CodeBuild client ---
+    client = boto3.client("codebuild", region_name=AWS_REGION)
+    try:
+        print(f"Attempting to start build for project: {PROJECT_NAME}")
+        response = client.start_build(projectName=PROJECT_NAME)
+        build_id = response["build"]["id"]
+        print(f"Successfully started build with ID: {build_id}")
+        print(f"Build ARN: {response['build']['arn']}")
+        print("Build URL (approximate - construct based on region and ID):")
+        print(
+            f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_id.split(':')[-1]}/detail"
+        )
+        # You can inspect the full response if needed
+        # print("\nFull response:")
+        # import json
+        # print(json.dumps(response, indent=2))
+    except client.exceptions.ResourceNotFoundException:
+        print(f"Error: Project '{PROJECT_NAME}' not found in region '{AWS_REGION}'.")
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+def upload_file_to_s3(
+    local_file_paths: List[str],
+    s3_key: str,
+    s3_bucket: str,
+    RUN_AWS_FUNCTIONS: str = "1",
+):
+    """
+    Uploads a file from local machine to Amazon S3.
+    Args:
+    - local_file_path: Local file path(s) of the file(s) to upload.
+    - s3_key: Key (path) to the file in the S3 bucket.
+    - s3_bucket: Name of the S3 bucket.
+    Returns:
+    - Message as variable/printed to console
+    """
+    final_out_message = []
+    final_out_message_str = ""
+    if RUN_AWS_FUNCTIONS == "1":
+        try:
+            if s3_bucket and local_file_paths:
+                s3_client = boto3.client("s3", region_name=AWS_REGION)
+                if isinstance(local_file_paths, str):
+                    local_file_paths = [local_file_paths]
+                for file in local_file_paths:
+                    if s3_client:
+                        # print(s3_client)
+                        try:
+                            # Get file name off file path
+                            file_name = os.path.basename(file)
+                            s3_key_full = s3_key + file_name
+                            print("S3 key: ", s3_key_full)
+                            s3_client.upload_file(file, s3_bucket, s3_key_full)
+                            out_message = (
+                                "File " + file_name + " uploaded successfully!"
+                            )
+                            print(out_message)
+                        except Exception as e:
+                            out_message = f"Error uploading file(s): {e}"
+                            print(out_message)
+                        final_out_message.append(out_message)
+                        final_out_message_str = "\n".join(final_out_message)
+                    else:
+                        final_out_message_str = "Could not connect to AWS."
+            else:
+                final_out_message_str = (
+                    "At least one essential variable is empty, could not upload to S3"
+                )
+        except Exception as e:
+            final_out_message_str = "Could not upload files to S3 due to: " + str(e)
+            print(final_out_message_str)
+    else:
+        final_out_message_str = "App not set to run AWS functions"
+    return final_out_message_str
+# Initialize ECS client
+def start_ecs_task(cluster_name, service_name):
+    ecs_client = boto3.client("ecs")
+    try:
+        # Update the service to set the desired count to 1
+        ecs_client.update_service(
+            cluster=cluster_name, service=service_name, desiredCount=1
+        )
+        return {
+            "statusCode": 200,
+            "body": f"Service {service_name} in cluster {cluster_name} has been updated to 1 task.",
+        }
+    except Exception as e:
+        return {"statusCode": 500, "body": f"Error updating service: {str(e)}"}